/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Used for function call generation. */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
#if defined(_WIN64)
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_VEC
#elif TCG_TARGET_REG_BITS == 64
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
#else
# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
#endif

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.
*/ 96 TCG_REG_XMM6, 97 TCG_REG_XMM7, 98#if TCG_TARGET_REG_BITS == 64 99 TCG_REG_XMM8, 100 TCG_REG_XMM9, 101 TCG_REG_XMM10, 102 TCG_REG_XMM11, 103 TCG_REG_XMM12, 104 TCG_REG_XMM13, 105 TCG_REG_XMM14, 106 TCG_REG_XMM15, 107#endif 108#endif 109}; 110 111#define TCG_TMP_VEC TCG_REG_XMM5 112 113static const int tcg_target_call_iarg_regs[] = { 114#if TCG_TARGET_REG_BITS == 64 115#if defined(_WIN64) 116 TCG_REG_RCX, 117 TCG_REG_RDX, 118#else 119 TCG_REG_RDI, 120 TCG_REG_RSI, 121 TCG_REG_RDX, 122 TCG_REG_RCX, 123#endif 124 TCG_REG_R8, 125 TCG_REG_R9, 126#else 127 /* 32 bit mode uses stack based calling convention (GCC default). */ 128#endif 129}; 130 131static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) 132{ 133 switch (kind) { 134 case TCG_CALL_RET_NORMAL: 135 tcg_debug_assert(slot >= 0 && slot <= 1); 136 return slot ? TCG_REG_EDX : TCG_REG_EAX; 137#ifdef _WIN64 138 case TCG_CALL_RET_BY_VEC: 139 tcg_debug_assert(slot == 0); 140 return TCG_REG_XMM0; 141#endif 142 default: 143 g_assert_not_reached(); 144 } 145} 146 147/* Constants we accept. */ 148#define TCG_CT_CONST_S32 0x100 149#define TCG_CT_CONST_U32 0x200 150#define TCG_CT_CONST_I32 0x400 151#define TCG_CT_CONST_WSZ 0x800 152#define TCG_CT_CONST_TST 0x1000 153#define TCG_CT_CONST_ZERO 0x2000 154 155/* Registers used with L constraint, which are the first argument 156 registers on x86_64, and two random call clobbered registers on 157 i386. */ 158#if TCG_TARGET_REG_BITS == 64 159# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 160# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 161#else 162# define TCG_REG_L0 TCG_REG_EAX 163# define TCG_REG_L1 TCG_REG_EDX 164#endif 165 166#if TCG_TARGET_REG_BITS == 64 167# define ALL_GENERAL_REGS 0x0000ffffu 168# define ALL_VECTOR_REGS 0xffff0000u 169# define ALL_BYTEL_REGS ALL_GENERAL_REGS 170#else 171# define ALL_GENERAL_REGS 0x000000ffu 172# define ALL_VECTOR_REGS 0x00ff0000u 173# define ALL_BYTEL_REGS 0x0000000fu 174#endif 175#define SOFTMMU_RESERVE_REGS \ 176 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 177 178#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 179#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 180 181static const tcg_insn_unit *tb_ret_addr; 182 183static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 184 intptr_t value, intptr_t addend) 185{ 186 value += addend; 187 switch(type) { 188 case R_386_PC32: 189 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 190 if (value != (int32_t)value) { 191 return false; 192 } 193 /* FALLTHRU */ 194 case R_386_32: 195 tcg_patch32(code_ptr, value); 196 break; 197 case R_386_PC8: 198 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 199 if (value != (int8_t)value) { 200 return false; 201 } 202 tcg_patch8(code_ptr, value); 203 break; 204 default: 205 g_assert_not_reached(); 206 } 207 return true; 208} 209 210/* test if a constant matches the constraint */ 211static bool tcg_target_const_match(int64_t val, int ct, 212 TCGType type, TCGCond cond, int vece) 213{ 214 if (ct & TCG_CT_CONST) { 215 return 1; 216 } 217 if (type == TCG_TYPE_I32) { 218 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 219 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 220 return 1; 221 } 222 } else { 223 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 224 return 1; 225 } 226 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 227 return 1; 228 } 229 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 230 return 1; 231 } 232 /* 233 * This will be used in combination with TCG_CT_CONST_S32, 234 * so "normal" TESTQ is already matched. 
Also accept: 235 * TESTQ -> TESTL (uint32_t) 236 * TESTQ -> BT (is_power_of_2) 237 */ 238 if ((ct & TCG_CT_CONST_TST) 239 && is_tst_cond(cond) 240 && (val == (uint32_t)val || is_power_of_2(val))) { 241 return 1; 242 } 243 } 244 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) { 245 return 1; 246 } 247 if ((ct & TCG_CT_CONST_ZERO) && val == 0) { 248 return 1; 249 } 250 return 0; 251} 252 253# define LOWREGMASK(x) ((x) & 7) 254 255#define P_EXT 0x100 /* 0x0f opcode prefix */ 256#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 257#define P_DATA16 0x400 /* 0x66 opcode prefix */ 258#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 259#if TCG_TARGET_REG_BITS == 64 260# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 261# define P_REXB_R 0x2000 /* REG field as byte register */ 262# define P_REXB_RM 0x4000 /* R/M field as byte register */ 263# define P_GS 0x8000 /* gs segment override */ 264#else 265# define P_REXW 0 266# define P_REXB_R 0 267# define P_REXB_RM 0 268# define P_GS 0 269#endif 270#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 271#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 272#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 273#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 274#define P_EVEX 0x100000 /* Requires EVEX encoding */ 275 276#define OPC_ARITH_EbIb (0x80) 277#define OPC_ARITH_EvIz (0x81) 278#define OPC_ARITH_EvIb (0x83) 279#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 280#define OPC_ANDN (0xf2 | P_EXT38) 281#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 282#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 283#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 284#define OPC_BSF (0xbc | P_EXT) 285#define OPC_BSR (0xbd | P_EXT) 286#define OPC_BSWAP (0xc8 | P_EXT) 287#define OPC_CALL_Jz (0xe8) 288#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 289#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 290#define OPC_DEC_r32 (0x48) 291#define OPC_IMUL_GvEv (0xaf | P_EXT) 292#define OPC_IMUL_GvEvIb (0x6b) 293#define OPC_IMUL_GvEvIz (0x69) 294#define OPC_INC_r32 (0x40) 295#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 296#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 297#define OPC_JMP_long (0xe9) 298#define OPC_JMP_short (0xeb) 299#define OPC_LEA (0x8d) 300#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 301#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 302#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 303#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 304#define OPC_MOVB_EvIz (0xc6) 305#define OPC_MOVL_EvIz (0xc7) 306#define OPC_MOVB_Ib (0xb0) 307#define OPC_MOVL_Iv (0xb8) 308#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 309#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 310#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 311#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 312#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 313#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 314#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 315#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 316#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 317#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 318#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 319#define OPC_MOVSBL (0xbe | P_EXT) 320#define OPC_MOVSWL (0xbf | P_EXT) 321#define OPC_MOVSLQ (0x63 | P_REXW) 322#define OPC_MOVZBL (0xb6 | P_EXT) 323#define OPC_MOVZWL (0xb7 | P_EXT) 324#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 325#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 326#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 327#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 328#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 329#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 330#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 331#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 332#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 333#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 334#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 335#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 336#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 337#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 338#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 339#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 340#define OPC_PAND (0xdb | P_EXT | P_DATA16) 341#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 342#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 343#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 344#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 345#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 346#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 347#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 348#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 349#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 350#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 351#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 352#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 353#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 354#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 355#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 356#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 357#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 358#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 359#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 360#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 361#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 362#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 363#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 364#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 365#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 366#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 367#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 368#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 369#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
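/*
 * Each OPC_* value packs the instruction's final opcode byte in bits 0-7
 * and P_* prefix flags above it; tcg_out_opc() expands the flags back into
 * prefix bytes.  As an illustration, OPC_PMOVSXBW above is
 * (0x20 | P_EXT38 | P_DATA16), so a register-register
 * tcg_out_modrm(s, OPC_PMOVSXBW, TCG_REG_XMM1, TCG_REG_XMM2) would emit
 * 66 0f 38 20 ca, i.e. "pmovsxbw %xmm2, %xmm1".  The vector code below
 * generally goes through the VEX/EVEX helpers instead, but the
 * flag-to-prefix mapping is the same idea.
 */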
370#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 371#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 372#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 373#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 374#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 375#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 376#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 377#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 378#define OPC_POR (0xeb | P_EXT | P_DATA16) 379#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 380#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 381#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 382#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 383#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 384#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 385#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 386#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 387#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 388#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 389#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 390#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 391#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 392#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 393#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 394#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 395#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 396#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 397#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 398#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 399#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 400#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 401#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 402#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 403#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 404#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 405#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 406#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 407#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 408#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 409#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 410#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 411#define OPC_PXOR (0xef | P_EXT | P_DATA16) 412#define OPC_POP_r32 (0x58) 413#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 414#define OPC_PUSH_r32 (0x50) 415#define OPC_PUSH_Iv (0x68) 416#define OPC_PUSH_Ib (0x6a) 417#define OPC_RET (0xc3) 418#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 419#define OPC_SHIFT_1 (0xd1) 420#define OPC_SHIFT_Ib (0xc1) 421#define OPC_SHIFT_cl (0xd3) 422#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 423#define OPC_SHUFPS (0xc6 | P_EXT) 424#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 425#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 426#define OPC_SHRD_Ib (0xac | P_EXT) 427#define OPC_STC (0xf9) 428#define OPC_TESTB (0x84) 429#define OPC_TESTL (0x85) 430#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 431#define OPC_UD2 (0x0b | P_EXT) 432#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 433#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 434#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX) 435#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 436#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX) 437#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 438#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) 439#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) 440#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 442#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX) 443#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX) 444#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 445#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 446#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 447#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 448#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 449#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 450#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 451#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 452#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 453#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 454#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX) 455#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 456#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX) 457#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 458#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 459#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 460#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 461#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 462#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 463#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 464#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 465#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 466#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 467#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 468#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 469#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 470#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 471#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 472#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 473#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 474#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 475#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 476#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 477#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 478#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 479#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 480#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 481#define 
OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW) 482#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 483#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX) 484#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 485#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX) 486#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 487#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX) 488#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 489#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX) 490#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 491#define OPC_VZEROUPPER (0x77 | P_EXT) 492#define OPC_XCHG_ax_r32 (0x90) 493#define OPC_XCHG_EvGv (0x87) 494 495#define OPC_GRP3_Eb (0xf6) 496#define OPC_GRP3_Ev (0xf7) 497#define OPC_GRP5 (0xff) 498#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 499#define OPC_GRPBT (0xba | P_EXT) 500 501#define OPC_GRPBT_BT 4 502#define OPC_GRPBT_BTS 5 503#define OPC_GRPBT_BTR 6 504#define OPC_GRPBT_BTC 7 505 506/* Group 1 opcode extensions for 0x80-0x83. 507 These are also used as modifiers for OPC_ARITH. */ 508#define ARITH_ADD 0 509#define ARITH_OR 1 510#define ARITH_ADC 2 511#define ARITH_SBB 3 512#define ARITH_AND 4 513#define ARITH_SUB 5 514#define ARITH_XOR 6 515#define ARITH_CMP 7 516 517/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 518#define SHIFT_ROL 0 519#define SHIFT_ROR 1 520#define SHIFT_SHL 4 521#define SHIFT_SHR 5 522#define SHIFT_SAR 7 523 524/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ 525#define EXT3_TESTi 0 526#define EXT3_NOT 2 527#define EXT3_NEG 3 528#define EXT3_MUL 4 529#define EXT3_IMUL 5 530#define EXT3_DIV 6 531#define EXT3_IDIV 7 532 533/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 534#define EXT5_INC_Ev 0 535#define EXT5_DEC_Ev 1 536#define EXT5_CALLN_Ev 2 537#define EXT5_JMPN_Ev 4 538 539/* Condition codes to be added to OPC_JCC_{long,short}. */ 540#define JCC_JMP (-1) 541#define JCC_JO 0x0 542#define JCC_JNO 0x1 543#define JCC_JB 0x2 544#define JCC_JAE 0x3 545#define JCC_JE 0x4 546#define JCC_JNE 0x5 547#define JCC_JBE 0x6 548#define JCC_JA 0x7 549#define JCC_JS 0x8 550#define JCC_JNS 0x9 551#define JCC_JP 0xa 552#define JCC_JNP 0xb 553#define JCC_JL 0xc 554#define JCC_JGE 0xd 555#define JCC_JLE 0xe 556#define JCC_JG 0xf 557 558static const uint8_t tcg_cond_to_jcc[] = { 559 [TCG_COND_EQ] = JCC_JE, 560 [TCG_COND_NE] = JCC_JNE, 561 [TCG_COND_LT] = JCC_JL, 562 [TCG_COND_GE] = JCC_JGE, 563 [TCG_COND_LE] = JCC_JLE, 564 [TCG_COND_GT] = JCC_JG, 565 [TCG_COND_LTU] = JCC_JB, 566 [TCG_COND_GEU] = JCC_JAE, 567 [TCG_COND_LEU] = JCC_JBE, 568 [TCG_COND_GTU] = JCC_JA, 569 [TCG_COND_TSTEQ] = JCC_JE, 570 [TCG_COND_TSTNE] = JCC_JNE, 571}; 572 573#if TCG_TARGET_REG_BITS == 64 574static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 575{ 576 int rex; 577 578 if (opc & P_GS) { 579 tcg_out8(s, 0x65); 580 } 581 if (opc & P_DATA16) { 582 /* We should never be asking for both 16 and 64-bit operation. */ 583 tcg_debug_assert((opc & P_REXW) == 0); 584 tcg_out8(s, 0x66); 585 } 586 if (opc & P_SIMDF3) { 587 tcg_out8(s, 0xf3); 588 } else if (opc & P_SIMDF2) { 589 tcg_out8(s, 0xf2); 590 } 591 592 rex = 0; 593 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 594 rex |= (r & 8) >> 1; /* REX.R */ 595 rex |= (x & 8) >> 2; /* REX.X */ 596 rex |= (rm & 8) >> 3; /* REX.B */ 597 598 /* P_REXB_{R,RM} indicates that the given register is the low byte. 
599 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 600 as otherwise the encoding indicates %[abcd]h. Note that the values 601 that are ORed in merely indicate that the REX byte must be present; 602 those bits get discarded in output. */ 603 rex |= opc & (r >= 4 ? P_REXB_R : 0); 604 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 605 606 if (rex) { 607 tcg_out8(s, (uint8_t)(rex | 0x40)); 608 } 609 610 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 611 tcg_out8(s, 0x0f); 612 if (opc & P_EXT38) { 613 tcg_out8(s, 0x38); 614 } else if (opc & P_EXT3A) { 615 tcg_out8(s, 0x3a); 616 } 617 } 618 619 tcg_out8(s, opc); 620} 621#else 622static void tcg_out_opc(TCGContext *s, int opc) 623{ 624 if (opc & P_DATA16) { 625 tcg_out8(s, 0x66); 626 } 627 if (opc & P_SIMDF3) { 628 tcg_out8(s, 0xf3); 629 } else if (opc & P_SIMDF2) { 630 tcg_out8(s, 0xf2); 631 } 632 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 633 tcg_out8(s, 0x0f); 634 if (opc & P_EXT38) { 635 tcg_out8(s, 0x38); 636 } else if (opc & P_EXT3A) { 637 tcg_out8(s, 0x3a); 638 } 639 } 640 tcg_out8(s, opc); 641} 642/* Discard the register arguments to tcg_out_opc early, so as not to penalize 643 the 32-bit compilation paths. This method works with all versions of gcc, 644 whereas relying on optimization may not be able to exclude them. */ 645#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 646#endif 647 648static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 649{ 650 tcg_out_opc(s, opc, r, rm, 0); 651 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 652} 653 654static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 655 int rm, int index) 656{ 657 int tmp; 658 659 if (opc & P_GS) { 660 tcg_out8(s, 0x65); 661 } 662 /* Use the two byte form if possible, which cannot encode 663 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 664 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 665 && ((rm | index) & 8) == 0) { 666 /* Two byte VEX prefix. */ 667 tcg_out8(s, 0xc5); 668 669 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 670 } else { 671 /* Three byte VEX prefix. */ 672 tcg_out8(s, 0xc4); 673 674 /* VEX.m-mmmm */ 675 if (opc & P_EXT3A) { 676 tmp = 3; 677 } else if (opc & P_EXT38) { 678 tmp = 2; 679 } else if (opc & P_EXT) { 680 tmp = 1; 681 } else { 682 g_assert_not_reached(); 683 } 684 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 685 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 686 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 687 tcg_out8(s, tmp); 688 689 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 690 } 691 692 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 693 /* VEX.pp */ 694 if (opc & P_DATA16) { 695 tmp |= 1; /* 0x66 */ 696 } else if (opc & P_SIMDF3) { 697 tmp |= 2; /* 0xf3 */ 698 } else if (opc & P_SIMDF2) { 699 tmp |= 3; /* 0xf2 */ 700 } 701 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 702 tcg_out8(s, tmp); 703 tcg_out8(s, opc); 704} 705 706static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 707 int rm, int index, int aaa, bool z) 708{ 709 /* The entire 4-byte evex prefix; with R' and V' set. 
*/ 710 uint32_t p = 0x08041062; 711 int mm, pp; 712 713 tcg_debug_assert(have_avx512vl); 714 715 /* EVEX.mm */ 716 if (opc & P_EXT3A) { 717 mm = 3; 718 } else if (opc & P_EXT38) { 719 mm = 2; 720 } else if (opc & P_EXT) { 721 mm = 1; 722 } else { 723 g_assert_not_reached(); 724 } 725 726 /* EVEX.pp */ 727 if (opc & P_DATA16) { 728 pp = 1; /* 0x66 */ 729 } else if (opc & P_SIMDF3) { 730 pp = 2; /* 0xf3 */ 731 } else if (opc & P_SIMDF2) { 732 pp = 3; /* 0xf2 */ 733 } else { 734 pp = 0; 735 } 736 737 p = deposit32(p, 8, 2, mm); 738 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 739 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 740 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 741 p = deposit32(p, 16, 2, pp); 742 p = deposit32(p, 19, 4, ~v); 743 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 744 p = deposit32(p, 24, 3, aaa); 745 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 746 p = deposit32(p, 31, 1, z); 747 748 tcg_out32(s, p); 749 tcg_out8(s, opc); 750} 751 752static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 753{ 754 if (opc & P_EVEX) { 755 tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false); 756 } else { 757 tcg_out_vex_opc(s, opc, r, v, rm, 0); 758 } 759 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 760} 761 762static void tcg_out_vex_modrm_type(TCGContext *s, int opc, 763 int r, int v, int rm, TCGType type) 764{ 765 if (type == TCG_TYPE_V256) { 766 opc |= P_VEXL; 767 } 768 tcg_out_vex_modrm(s, opc, r, v, rm); 769} 770 771static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v, 772 int rm, int aaa, bool z, TCGType type) 773{ 774 if (type == TCG_TYPE_V256) { 775 opc |= P_VEXL; 776 } 777 tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z); 778 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 779} 780 781/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 782 We handle either RM and INDEX missing with a negative value. In 64-bit 783 mode for absolute addresses, ~RM is the size of the immediate operand 784 that will follow the instruction. */ 785 786static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 787 int shift, intptr_t offset) 788{ 789 int mod, len; 790 791 if (index < 0 && rm < 0) { 792 if (TCG_TARGET_REG_BITS == 64) { 793 /* Try for a rip-relative addressing mode. This has replaced 794 the 32-bit-mode absolute addressing encoding. */ 795 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 796 intptr_t disp = offset - pc; 797 if (disp == (int32_t)disp) { 798 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 799 tcg_out32(s, disp); 800 return; 801 } 802 803 /* Try for an absolute address encoding. This requires the 804 use of the MODRM+SIB encoding and is therefore larger than 805 rip-relative addressing. */ 806 if (offset == (int32_t)offset) { 807 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 808 tcg_out8(s, (4 << 3) | 5); 809 tcg_out32(s, offset); 810 return; 811 } 812 813 /* ??? The memory isn't directly addressable. */ 814 g_assert_not_reached(); 815 } else { 816 /* Absolute address. */ 817 tcg_out8(s, (r << 3) | 5); 818 tcg_out32(s, offset); 819 return; 820 } 821 } 822 823 /* Find the length of the immediate addend. Note that the encoding 824 that would be used for (%ebp) indicates absolute addressing. 
*/ 825 if (rm < 0) { 826 mod = 0, len = 4, rm = 5; 827 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 828 mod = 0, len = 0; 829 } else if (offset == (int8_t)offset) { 830 mod = 0x40, len = 1; 831 } else { 832 mod = 0x80, len = 4; 833 } 834 835 /* Use a single byte MODRM format if possible. Note that the encoding 836 that would be used for %esp is the escape to the two byte form. */ 837 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 838 /* Single byte MODRM format. */ 839 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 840 } else { 841 /* Two byte MODRM+SIB format. */ 842 843 /* Note that the encoding that would place %esp into the index 844 field indicates no index register. In 64-bit mode, the REX.X 845 bit counts, so %r12 can be used as the index. */ 846 if (index < 0) { 847 index = 4; 848 } else { 849 tcg_debug_assert(index != TCG_REG_ESP); 850 } 851 852 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 853 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 854 } 855 856 if (len == 1) { 857 tcg_out8(s, offset); 858 } else if (len == 4) { 859 tcg_out32(s, offset); 860 } 861} 862 863static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 864 int index, int shift, intptr_t offset) 865{ 866 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 867 tcg_out_sib_offset(s, r, rm, index, shift, offset); 868} 869 870static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 871 int rm, int index, int shift, 872 intptr_t offset) 873{ 874 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 875 tcg_out_sib_offset(s, r, rm, index, shift, offset); 876} 877 878/* A simplification of the above with no index or shift. */ 879static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 880 int rm, intptr_t offset) 881{ 882 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 883} 884 885static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 886 int v, int rm, intptr_t offset) 887{ 888 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 889} 890 891/* Output an opcode with an expected reference to the constant pool. */ 892static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 893{ 894 tcg_out_opc(s, opc, r, 0, 0); 895 /* Absolute for 32-bit, pc-relative for 64-bit. */ 896 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 897 tcg_out32(s, 0); 898} 899 900/* Output an opcode with an expected reference to the constant pool. */ 901static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 902{ 903 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 904 /* Absolute for 32-bit, pc-relative for 64-bit. */ 905 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 906 tcg_out32(s, 0); 907} 908 909/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 910static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 911{ 912 /* Propagate an opcode prefix, such as P_REXW. 
*/ 913 int ext = subop & ~0x7; 914 subop &= 0x7; 915 916 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 917} 918 919static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 920{ 921 int rexw = 0; 922 923 if (arg == ret) { 924 return true; 925 } 926 switch (type) { 927 case TCG_TYPE_I64: 928 rexw = P_REXW; 929 /* fallthru */ 930 case TCG_TYPE_I32: 931 if (ret < 16) { 932 if (arg < 16) { 933 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 934 } else { 935 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 936 } 937 } else { 938 if (arg < 16) { 939 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 940 } else { 941 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 942 } 943 } 944 break; 945 946 case TCG_TYPE_V64: 947 tcg_debug_assert(ret >= 16 && arg >= 16); 948 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 949 break; 950 case TCG_TYPE_V128: 951 tcg_debug_assert(ret >= 16 && arg >= 16); 952 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 953 break; 954 case TCG_TYPE_V256: 955 tcg_debug_assert(ret >= 16 && arg >= 16); 956 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 957 break; 958 959 default: 960 g_assert_not_reached(); 961 } 962 return true; 963} 964 965static const int avx2_dup_insn[4] = { 966 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 967 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 968}; 969 970static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 971 TCGReg r, TCGReg a) 972{ 973 if (have_avx2) { 974 tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type); 975 } else { 976 switch (vece) { 977 case MO_8: 978 /* ??? With zero in a register, use PSHUFB. */ 979 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 980 a = r; 981 /* FALLTHRU */ 982 case MO_16: 983 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 984 a = r; 985 /* FALLTHRU */ 986 case MO_32: 987 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 988 /* imm8 operand: all output lanes selected from input lane 0. */ 989 tcg_out8(s, 0); 990 break; 991 case MO_64: 992 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 993 break; 994 default: 995 g_assert_not_reached(); 996 } 997 } 998 return true; 999} 1000 1001static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 1002 TCGReg r, TCGReg base, intptr_t offset) 1003{ 1004 if (have_avx2) { 1005 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 1006 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 1007 r, 0, base, offset); 1008 } else { 1009 switch (vece) { 1010 case MO_64: 1011 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 1012 break; 1013 case MO_32: 1014 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 1015 break; 1016 case MO_16: 1017 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 1018 tcg_out8(s, 0); /* imm8 */ 1019 tcg_out_dup_vec(s, type, vece, r, r); 1020 break; 1021 case MO_8: 1022 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 1023 tcg_out8(s, 0); /* imm8 */ 1024 tcg_out_dup_vec(s, type, vece, r, r); 1025 break; 1026 default: 1027 g_assert_not_reached(); 1028 } 1029 } 1030 return true; 1031} 1032 1033static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 1034 TCGReg ret, int64_t arg) 1035{ 1036 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 1037 1038 if (arg == 0) { 1039 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1040 return; 1041 } 1042 if (arg == -1) { 1043 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 1044 return; 1045 } 1046 1047 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 1048 if (have_avx2) { 1049 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 1050 } else { 1051 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 1052 } 1053 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1054 } else { 1055 if (type == TCG_TYPE_V64) { 1056 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 1057 } else if (have_avx2) { 1058 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 1059 } else { 1060 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1061 } 1062 if (TCG_TARGET_REG_BITS == 64) { 1063 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1064 } else { 1065 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1066 } 1067 } 1068} 1069 1070static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1071 TCGReg ret, tcg_target_long arg) 1072{ 1073 if (arg == 0) { 1074 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1075 return; 1076 } 1077 if (arg == -1) { 1078 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1079 return; 1080 } 1081 1082 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1083 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1084 if (TCG_TARGET_REG_BITS == 64) { 1085 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1086 } else { 1087 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1088 } 1089} 1090 1091static void tcg_out_movi_int(TCGContext *s, TCGType type, 1092 TCGReg ret, tcg_target_long arg) 1093{ 1094 tcg_target_long diff; 1095 1096 if (arg == 0 && !s->carry_live) { 1097 tgen_arithr(s, ARITH_XOR, ret, ret); 1098 return; 1099 } 1100 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1101 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1102 tcg_out32(s, arg); 1103 return; 1104 } 1105 if (arg == (int32_t)arg) { 1106 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1107 tcg_out32(s, arg); 1108 return; 1109 } 1110 1111 /* Try a 7 byte pc-relative lea before the 10 byte movq. */ 1112 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1113 if (diff == (int32_t)diff) { 1114 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1115 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1116 tcg_out32(s, diff); 1117 return; 1118 } 1119 1120 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1121 tcg_out64(s, arg); 1122} 1123 1124static void tcg_out_movi(TCGContext *s, TCGType type, 1125 TCGReg ret, tcg_target_long arg) 1126{ 1127 switch (type) { 1128 case TCG_TYPE_I32: 1129#if TCG_TARGET_REG_BITS == 64 1130 case TCG_TYPE_I64: 1131#endif 1132 if (ret < 16) { 1133 tcg_out_movi_int(s, type, ret, arg); 1134 } else { 1135 tcg_out_movi_vec(s, type, ret, arg); 1136 } 1137 break; 1138 default: 1139 g_assert_not_reached(); 1140 } 1141} 1142 1143static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) 1144{ 1145 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1146 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2); 1147 return true; 1148} 1149 1150static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, 1151 tcg_target_long imm) 1152{ 1153 /* This function is only used for passing structs by reference. 
     */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static void tcg_out_mb(TCGContext *s, unsigned a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
1265 * 1266 * This specific instance is also used by TCG_CALL_RET_BY_VEC, 1267 * for _WIN64, which must have SSE2 but may not have AVX. 1268 */ 1269 tcg_debug_assert(arg >= 16); 1270 if (have_avx1) { 1271 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2); 1272 } else { 1273 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2); 1274 } 1275 break; 1276 case TCG_TYPE_V256: 1277 /* 1278 * The gvec infrastructure only requires 16-byte alignment, 1279 * so here we must use an unaligned store. 1280 */ 1281 tcg_debug_assert(arg >= 16); 1282 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1283 arg, 0, arg1, arg2); 1284 break; 1285 default: 1286 g_assert_not_reached(); 1287 } 1288} 1289 1290static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1291 TCGReg base, intptr_t ofs) 1292{ 1293 int rexw = 0; 1294 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1295 if (val != (int32_t)val) { 1296 return false; 1297 } 1298 rexw = P_REXW; 1299 } else if (type != TCG_TYPE_I32) { 1300 return false; 1301 } 1302 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1303 tcg_out32(s, val); 1304 return true; 1305} 1306 1307static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1308{ 1309 /* Propagate an opcode prefix, such as P_DATA16. */ 1310 int ext = subopc & ~0x7; 1311 subopc &= 0x7; 1312 1313 if (count == 1) { 1314 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1315 } else { 1316 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1317 tcg_out8(s, count); 1318 } 1319} 1320 1321static inline void tcg_out_bswap32(TCGContext *s, int reg) 1322{ 1323 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1324} 1325 1326static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1327{ 1328 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1329} 1330 1331static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src) 1332{ 1333 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1334 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1335 if (dest >= 4) { 1336 tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest); 1337 tcg_out32(s, 0xff); 1338 return; 1339 } 1340 src = dest; 1341 } 1342 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1343} 1344 1345static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1346{ 1347 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1348 1349 if (TCG_TARGET_REG_BITS == 32 && src >= 4) { 1350 tcg_out_mov(s, TCG_TYPE_I32, dest, src); 1351 if (dest >= 4) { 1352 tcg_out_shifti(s, SHIFT_SHL, dest, 24); 1353 tcg_out_shifti(s, SHIFT_SAR, dest, 24); 1354 return; 1355 } 1356 src = dest; 1357 } 1358 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1359} 1360 1361static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1362{ 1363 /* movzwl */ 1364 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1365} 1366 1367static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1368{ 1369 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1370 /* movsw[lq] */ 1371 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1372} 1373 1374static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1375{ 1376 /* 32-bit mov zero extends. 
*/ 1377 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1378} 1379 1380static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1381{ 1382 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1383 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1384} 1385 1386static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1387{ 1388 tcg_out_ext32s(s, dest, src); 1389} 1390 1391static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1392{ 1393 if (dest != src) { 1394 tcg_out_ext32u(s, dest, src); 1395 } 1396} 1397 1398static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1399{ 1400 tcg_out_ext32u(s, dest, src); 1401} 1402 1403static inline void tcg_out_bswap64(TCGContext *s, int reg) 1404{ 1405 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1406} 1407 1408static void tgen_arithi(TCGContext *s, int c, int r0, 1409 tcg_target_long val, int cf) 1410{ 1411 int rexw = 0; 1412 1413 if (TCG_TARGET_REG_BITS == 64) { 1414 rexw = c & -8; 1415 c &= 7; 1416 } 1417 1418 switch (c) { 1419 case ARITH_ADD: 1420 case ARITH_SUB: 1421 if (!cf) { 1422 /* 1423 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1424 * partial flags update stalls on Pentium4 and are not recommended 1425 * by current Intel optimization manuals. 1426 */ 1427 if (val == 1 || val == -1) { 1428 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1429 if (TCG_TARGET_REG_BITS == 64) { 1430 /* 1431 * The single-byte increment encodings are re-tasked 1432 * as the REX prefixes. Use the MODRM encoding. 1433 */ 1434 tcg_out_modrm(s, OPC_GRP5 + rexw, 1435 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1436 } else { 1437 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1438 } 1439 return; 1440 } 1441 if (val == 128) { 1442 /* 1443 * Facilitate using an 8-bit immediate. Carry is inverted 1444 * by this transformation, so do it only if cf == 0. 1445 */ 1446 c ^= ARITH_ADD ^ ARITH_SUB; 1447 val = -128; 1448 } 1449 } 1450 break; 1451 1452 case ARITH_AND: 1453 if (TCG_TARGET_REG_BITS == 64) { 1454 if (val == 0xffffffffu) { 1455 tcg_out_ext32u(s, r0, r0); 1456 return; 1457 } 1458 if (val == (uint32_t)val) { 1459 /* AND with no high bits set can use a 32-bit operation. */ 1460 rexw = 0; 1461 } 1462 } 1463 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1464 tcg_out_ext8u(s, r0, r0); 1465 return; 1466 } 1467 if (val == 0xffffu) { 1468 tcg_out_ext16u(s, r0, r0); 1469 return; 1470 } 1471 break; 1472 1473 case ARITH_OR: 1474 case ARITH_XOR: 1475 if (val >= 0x80 && val <= 0xff 1476 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1477 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1478 tcg_out8(s, val); 1479 return; 1480 } 1481 break; 1482 } 1483 1484 if (val == (int8_t)val) { 1485 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1486 tcg_out8(s, val); 1487 return; 1488 } 1489 if (rexw == 0 || val == (int32_t)val) { 1490 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1491 tcg_out32(s, val); 1492 return; 1493 } 1494 1495 g_assert_not_reached(); 1496} 1497 1498static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1499{ 1500 if (val != 0) { 1501 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1502 } 1503} 1504 1505/* Set SMALL to force a short forward branch. 
*/ 1506static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1507{ 1508 int32_t val, val1; 1509 1510 if (l->has_value) { 1511 val = tcg_pcrel_diff(s, l->u.value_ptr); 1512 val1 = val - 2; 1513 if ((int8_t)val1 == val1) { 1514 if (opc == -1) { 1515 tcg_out8(s, OPC_JMP_short); 1516 } else { 1517 tcg_out8(s, OPC_JCC_short + opc); 1518 } 1519 tcg_out8(s, val1); 1520 } else { 1521 tcg_debug_assert(!small); 1522 if (opc == -1) { 1523 tcg_out8(s, OPC_JMP_long); 1524 tcg_out32(s, val - 5); 1525 } else { 1526 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1527 tcg_out32(s, val - 6); 1528 } 1529 } 1530 } else if (small) { 1531 if (opc == -1) { 1532 tcg_out8(s, OPC_JMP_short); 1533 } else { 1534 tcg_out8(s, OPC_JCC_short + opc); 1535 } 1536 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1537 s->code_ptr += 1; 1538 } else { 1539 if (opc == -1) { 1540 tcg_out8(s, OPC_JMP_long); 1541 } else { 1542 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1543 } 1544 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1545 s->code_ptr += 4; 1546 } 1547} 1548 1549static void tcg_out_br(TCGContext *s, TCGLabel *l) 1550{ 1551 tcg_out_jxx(s, JCC_JMP, l, 0); 1552} 1553 1554static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1555 TCGArg arg2, int const_arg2, int rexw) 1556{ 1557 int jz, js; 1558 1559 if (!is_tst_cond(cond)) { 1560 if (!const_arg2) { 1561 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1562 } else if (arg2 == 0) { 1563 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1564 } else { 1565 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1566 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1567 } 1568 return tcg_cond_to_jcc[cond]; 1569 } 1570 1571 jz = tcg_cond_to_jcc[cond]; 1572 js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS); 1573 1574 if (!const_arg2) { 1575 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1576 return jz; 1577 } 1578 1579 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1580 if (arg2 == 0x80) { 1581 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1582 return js; 1583 } 1584 if (arg2 == 0xff) { 1585 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1586 return jz; 1587 } 1588 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1589 tcg_out8(s, arg2); 1590 return jz; 1591 } 1592 1593 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1594 if (arg2 == 0x8000) { 1595 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1596 return js; 1597 } 1598 if (arg2 == 0xff00) { 1599 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1600 return jz; 1601 } 1602 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1603 tcg_out8(s, arg2 >> 8); 1604 return jz; 1605 } 1606 1607 if (arg2 == 0xffff) { 1608 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1609 return jz; 1610 } 1611 if (arg2 == 0xffffffffu) { 1612 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1613 return jz; 1614 } 1615 1616 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1617 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1618 int sh = ctz64(arg2); 1619 1620 rexw = (sh & 32 ? 
P_REXW : 0); 1621 if ((sh & 31) == 31) { 1622 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1623 return js; 1624 } else { 1625 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1626 tcg_out8(s, sh); 1627 return jc; 1628 } 1629 } 1630 1631 if (rexw) { 1632 if (arg2 == (uint32_t)arg2) { 1633 rexw = 0; 1634 } else { 1635 tcg_debug_assert(arg2 == (int32_t)arg2); 1636 } 1637 } 1638 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1639 tcg_out32(s, arg2); 1640 return jz; 1641} 1642 1643static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1644 TCGArg arg1, TCGArg arg2, int const_arg2, 1645 TCGLabel *label, bool small) 1646{ 1647 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1648 tcg_out_jxx(s, jcc, label, small); 1649} 1650 1651static void tgen_brcond(TCGContext *s, TCGType type, TCGCond cond, 1652 TCGReg arg1, TCGReg arg2, TCGLabel *label) 1653{ 1654 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1655 tcg_out_brcond(s, rexw, cond, arg1, arg2, false, label, false); 1656} 1657 1658static void tgen_brcondi(TCGContext *s, TCGType type, TCGCond cond, 1659 TCGReg arg1, tcg_target_long arg2, TCGLabel *label) 1660{ 1661 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1662 tcg_out_brcond(s, rexw, cond, arg1, arg2, true, label, false); 1663} 1664 1665static const TCGOutOpBrcond outop_brcond = { 1666 .base.static_constraint = C_O0_I2(r, reT), 1667 .out_rr = tgen_brcond, 1668 .out_ri = tgen_brcondi, 1669}; 1670 1671static void tcg_out_brcond2(TCGContext *s, TCGCond cond, TCGReg al, 1672 TCGReg ah, TCGArg bl, bool blconst, 1673 TCGArg bh, bool bhconst, 1674 TCGLabel *label_this, bool small) 1675{ 1676 TCGLabel *label_next = gen_new_label(); 1677 1678 switch (cond) { 1679 case TCG_COND_EQ: 1680 case TCG_COND_TSTEQ: 1681 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1682 al, bl, blconst, label_next, true); 1683 tcg_out_brcond(s, 0, cond, ah, bh, bhconst, label_this, small); 1684 break; 1685 1686 case TCG_COND_NE: 1687 case TCG_COND_TSTNE: 1688 tcg_out_brcond(s, 0, cond, al, bl, blconst, label_this, small); 1689 tcg_out_brcond(s, 0, cond, ah, bh, bhconst, label_this, small); 1690 break; 1691 1692 default: 1693 tcg_out_brcond(s, 0, tcg_high_cond(cond), 1694 ah, bh, bhconst, label_this, small); 1695 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1696 tcg_out_brcond(s, 0, tcg_unsigned_cond(cond), 1697 al, bl, blconst, label_this, small); 1698 break; 1699 } 1700 tcg_out_label(s, label_next); 1701} 1702 1703static void tgen_brcond2(TCGContext *s, TCGCond cond, TCGReg al, 1704 TCGReg ah, TCGArg bl, bool blconst, 1705 TCGArg bh, bool bhconst, TCGLabel *l) 1706{ 1707 tcg_out_brcond2(s, cond, al, ah, bl, blconst, bh, bhconst, l, false); 1708} 1709 1710#if TCG_TARGET_REG_BITS != 32 1711__attribute__((unused)) 1712#endif 1713static const TCGOutOpBrcond2 outop_brcond2 = { 1714 .base.static_constraint = C_O0_I4(r, r, ri, ri), 1715 .out = tgen_brcond2, 1716}; 1717 1718static void tcg_out_setcond(TCGContext *s, TCGType type, TCGCond cond, 1719 TCGReg dest, TCGReg arg1, TCGArg arg2, 1720 bool const_arg2, bool neg) 1721{ 1722 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1723 int cmp_rexw = rexw; 1724 bool inv = false; 1725 bool cleared; 1726 int jcc; 1727 1728 switch (cond) { 1729 case TCG_COND_NE: 1730 inv = true; 1731 /* fall through */ 1732 case TCG_COND_EQ: 1733 /* If arg2 is 0, convert to LTU/GEU vs 1. 
*/ 1734 if (const_arg2 && arg2 == 0) { 1735 arg2 = 1; 1736 goto do_ltu; 1737 } 1738 break; 1739 1740 case TCG_COND_TSTNE: 1741 inv = true; 1742 /* fall through */ 1743 case TCG_COND_TSTEQ: 1744 /* If arg2 is -1, convert to LTU/GEU vs 1. */ 1745 if (const_arg2 && arg2 == 0xffffffffu) { 1746 arg2 = 1; 1747 cmp_rexw = 0; 1748 goto do_ltu; 1749 } 1750 break; 1751 1752 case TCG_COND_LEU: 1753 inv = true; 1754 /* fall through */ 1755 case TCG_COND_GTU: 1756 /* If arg2 is a register, swap for LTU/GEU. */ 1757 if (!const_arg2) { 1758 TCGReg t = arg1; 1759 arg1 = arg2; 1760 arg2 = t; 1761 goto do_ltu; 1762 } 1763 break; 1764 1765 case TCG_COND_GEU: 1766 inv = true; 1767 /* fall through */ 1768 case TCG_COND_LTU: 1769 do_ltu: 1770 /* 1771 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1772 * We can then use NEG or INC to produce the desired result. 1773 * This is always smaller than the SETCC expansion. 1774 */ 1775 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1776 1777 /* X - X - C = -C = (C ? -1 : 0) */ 1778 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1779 if (inv && neg) { 1780 /* ~(C ? -1 : 0) = (C ? 0 : -1) */ 1781 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1782 } else if (inv) { 1783 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1784 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1785 } else if (!neg) { 1786 /* -(C ? -1 : 0) = (C ? 1 : 0) */ 1787 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1788 } 1789 return; 1790 1791 case TCG_COND_GE: 1792 inv = true; 1793 /* fall through */ 1794 case TCG_COND_LT: 1795 /* If arg2 is 0, extract the sign bit. */ 1796 if (const_arg2 && arg2 == 0) { 1797 tcg_out_mov(s, type, dest, arg1); 1798 if (inv) { 1799 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1800 } 1801 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1802 dest, rexw ? 63 : 31); 1803 return; 1804 } 1805 break; 1806 1807 default: 1808 break; 1809 } 1810 1811 /* 1812 * If dest does not overlap the inputs, clearing it first is preferred. 1813 * The XOR breaks any false dependency for the low-byte write to dest, 1814 * and is also one byte smaller than MOVZBL. 
1815 */ 1816 cleared = false; 1817 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1818 tgen_arithr(s, ARITH_XOR, dest, dest); 1819 cleared = true; 1820 } 1821 1822 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1823 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1824 1825 if (!cleared) { 1826 tcg_out_ext8u(s, dest, dest); 1827 } 1828 if (neg) { 1829 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1830 } 1831} 1832 1833static void tgen_setcond(TCGContext *s, TCGType type, TCGCond cond, 1834 TCGReg dest, TCGReg arg1, TCGReg arg2) 1835{ 1836 tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, false); 1837} 1838 1839static void tgen_setcondi(TCGContext *s, TCGType type, TCGCond cond, 1840 TCGReg dest, TCGReg arg1, tcg_target_long arg2) 1841{ 1842 tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, false); 1843} 1844 1845static const TCGOutOpSetcond outop_setcond = { 1846 .base.static_constraint = C_O1_I2(q, r, reT), 1847 .out_rrr = tgen_setcond, 1848 .out_rri = tgen_setcondi, 1849}; 1850 1851static void tgen_negsetcond(TCGContext *s, TCGType type, TCGCond cond, 1852 TCGReg dest, TCGReg arg1, TCGReg arg2) 1853{ 1854 tcg_out_setcond(s, type, cond, dest, arg1, arg2, false, true); 1855} 1856 1857static void tgen_negsetcondi(TCGContext *s, TCGType type, TCGCond cond, 1858 TCGReg dest, TCGReg arg1, tcg_target_long arg2) 1859{ 1860 tcg_out_setcond(s, type, cond, dest, arg1, arg2, true, true); 1861} 1862 1863static const TCGOutOpSetcond outop_negsetcond = { 1864 .base.static_constraint = C_O1_I2(q, r, reT), 1865 .out_rrr = tgen_negsetcond, 1866 .out_rri = tgen_negsetcondi, 1867}; 1868 1869static void tgen_setcond2(TCGContext *s, TCGCond cond, TCGReg ret, 1870 TCGReg al, TCGReg ah, 1871 TCGArg bl, bool const_bl, 1872 TCGArg bh, bool const_bh) 1873{ 1874 TCGLabel *label_over = gen_new_label(); 1875 1876 if (ret == al || ret == ah 1877 || (!const_bl && ret == bl) 1878 || (!const_bh && ret == bh)) { 1879 /* 1880 * When the destination overlaps with one of the argument 1881 * registers, don't do anything tricky. 1882 */ 1883 TCGLabel *label_true = gen_new_label(); 1884 1885 tcg_out_brcond2(s, cond, al, ah, bl, const_bl, 1886 bh, const_bh, label_true, true); 1887 1888 tcg_out_movi(s, TCG_TYPE_I32, ret, 0); 1889 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1890 tcg_out_label(s, label_true); 1891 1892 tcg_out_movi(s, TCG_TYPE_I32, ret, 1); 1893 } else { 1894 /* 1895 * When the destination does not overlap one of the arguments, 1896 * clear the destination first, jump if cond false, and emit an 1897 * increment in the true case. This results in smaller code. 1898 */ 1899 tcg_out_movi(s, TCG_TYPE_I32, ret, 0); 1900 1901 tcg_out_brcond2(s, tcg_invert_cond(cond), al, ah, bl, const_bl, 1902 bh, const_bh, label_over, true); 1903 1904 tgen_arithi(s, ARITH_ADD, ret, 1, 0); 1905 } 1906 tcg_out_label(s, label_over); 1907} 1908 1909#if TCG_TARGET_REG_BITS != 32 1910__attribute__((unused)) 1911#endif 1912static const TCGOutOpSetcond2 outop_setcond2 = { 1913 .base.static_constraint = C_O1_I4(r, r, r, ri, ri), 1914 .out = tgen_setcond2, 1915}; 1916 1917static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1918 TCGReg dest, TCGReg v1) 1919{ 1920 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1921} 1922 1923static void tgen_movcond(TCGContext *s, TCGType type, TCGCond cond, 1924 TCGReg dest, TCGReg c1, TCGArg c2, bool const_c2, 1925 TCGArg vt, bool const_vt, 1926 TCGArg vf, bool consf_vf) 1927{ 1928 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 1929 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1930 tcg_out_cmov(s, jcc, rexw, dest, vt); 1931} 1932 1933static const TCGOutOpMovcond outop_movcond = { 1934 .base.static_constraint = C_O1_I4(r, r, reT, r, 0), 1935 .out = tgen_movcond, 1936}; 1937 1938static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1939{ 1940 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1941 1942 if (disp == (int32_t)disp) { 1943 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1944 tcg_out32(s, disp); 1945 } else { 1946 /* rip-relative addressing into the constant pool. 1947 This is 6 + 8 = 14 bytes, as compared to using an 1948 immediate load 10 + 6 = 16 bytes, plus we may 1949 be able to re-use the pool constant for more calls. */ 1950 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1951 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1952 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1953 tcg_out32(s, 0); 1954 } 1955} 1956 1957static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1958 const TCGHelperInfo *info) 1959{ 1960 tcg_out_branch(s, 1, dest); 1961 1962#ifndef _WIN32 1963 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1964 /* 1965 * The sysv i386 abi for struct return places a reference as the 1966 * first argument on the stack, and pops that argument with the 1967 * return statement. Since we want to retain the aligned stack 1968 * pointer for the callee, we do not want to actually push that 1969 * argument before the call but rely on the normal store to the 1970 * stack slot. But we do need to compensate for the pop in order 1971 * to reset our correct stack pointer value. 1972 * Pushing a garbage value back onto the stack is quickest. 1973 */ 1974 tcg_out_push(s, TCG_REG_EAX); 1975 } 1976#endif 1977} 1978 1979static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1980{ 1981 tcg_out_branch(s, 0, dest); 1982} 1983 1984static void tcg_out_nopn(TCGContext *s, int n) 1985{ 1986 int i; 1987 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1988 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1989 * duplicate prefix, and all of the interesting recent cores can 1990 * decode and discard the duplicates in a single cycle. 1991 */ 1992 tcg_debug_assert(n >= 1); 1993 for (i = 1; i < n; ++i) { 1994 tcg_out8(s, 0x66); 1995 } 1996 tcg_out8(s, 0x90); 1997} 1998 1999typedef struct { 2000 TCGReg base; 2001 int index; 2002 int ofs; 2003 int seg; 2004 TCGAtomAlign aa; 2005} HostAddress; 2006 2007bool tcg_target_has_memory_bswap(MemOp memop) 2008{ 2009 TCGAtomAlign aa; 2010 2011 if (!have_movbe) { 2012 return false; 2013 } 2014 if ((memop & MO_SIZE) < MO_128) { 2015 return true; 2016 } 2017 2018 /* 2019 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 2020 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 2021 */ 2022 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 2023 return aa.atom < MO_128; 2024} 2025 2026/* 2027 * Because i686 has no register parameters and because x86_64 has xchg 2028 * to handle addr/data register overlap, we have placed all input arguments 2029 before we might need a scratch reg. 2030 * 2031 * Even then, a scratch is only needed for l->raddr. Rather than expose 2032 * a general-purpose scratch when we don't actually know it's available, 2033 * use the ra_gen hook to load into RAX if needed.
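 * (RAX is not used for argument passing in either the SysV or Win64
 * calling convention, so it is free to clobber once the arguments are
 * in place.)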
2034 */ 2035#if TCG_TARGET_REG_BITS == 64 2036static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 2037{ 2038 if (arg < 0) { 2039 arg = TCG_REG_RAX; 2040 } 2041 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 2042 return arg; 2043} 2044static const TCGLdstHelperParam ldst_helper_param = { 2045 .ra_gen = ldst_ra_gen 2046}; 2047#else 2048static const TCGLdstHelperParam ldst_helper_param = { }; 2049#endif 2050 2051static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 2052 TCGReg l, TCGReg h, TCGReg v) 2053{ 2054 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2055 2056 /* vpmov{d,q} %v, %l */ 2057 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2058 /* vpextr{d,q} $1, %v, %h */ 2059 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2060 tcg_out8(s, 1); 2061} 2062 2063static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2064 TCGReg v, TCGReg l, TCGReg h) 2065{ 2066 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2067 2068 /* vmov{d,q} %l, %v */ 2069 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2070 /* vpinsr{d,q} $1, %h, %v, %v */ 2071 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2072 tcg_out8(s, 1); 2073} 2074 2075/* 2076 * Generate code for the slow path for a load at the end of block 2077 */ 2078static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2079{ 2080 MemOp opc = get_memop(l->oi); 2081 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2082 2083 /* resolve label address */ 2084 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2085 if (label_ptr[1]) { 2086 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2087 } 2088 2089 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2090 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2091 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2092 2093 tcg_out_jmp(s, l->raddr); 2094 return true; 2095} 2096 2097/* 2098 * Generate code for the slow path for a store at the end of block 2099 */ 2100static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2101{ 2102 MemOp opc = get_memop(l->oi); 2103 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2104 2105 /* resolve label address */ 2106 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2107 if (label_ptr[1]) { 2108 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2109 } 2110 2111 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2112 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2113 2114 tcg_out_jmp(s, l->raddr); 2115 return true; 2116} 2117 2118#ifdef CONFIG_USER_ONLY 2119static HostAddress x86_guest_base = { 2120 .index = -1 2121}; 2122 2123#if defined(__x86_64__) && defined(__linux__) 2124# include <asm/prctl.h> 2125# include <sys/prctl.h> 2126int arch_prctl(int code, unsigned long addr); 2127static inline int setup_guest_base_seg(void) 2128{ 2129 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2130 return P_GS; 2131 } 2132 return 0; 2133} 2134#define setup_guest_base_seg setup_guest_base_seg 2135#elif defined(__x86_64__) && \ 2136 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2137# include <machine/sysarch.h> 2138static inline int setup_guest_base_seg(void) 2139{ 2140 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2141 return P_GS; 2142 } 2143 return 0; 2144} 2145#define setup_guest_base_seg setup_guest_base_seg 2146#endif 2147#else 2148# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2149#endif /* CONFIG_USER_ONLY */ 2150#ifndef setup_guest_base_seg 2151# define setup_guest_base_seg() 0 2152#endif 2153 
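/*
 * Note: when setup_guest_base_seg() succeeds above, the user-only fast path
 * addresses guest memory with a %gs segment override (P_GS), so the guest
 * address can be used directly without a separate add of guest_base.
 */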
2154#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2155 2156/* 2157 * For softmmu, perform the TLB load and compare. 2158 * For useronly, perform any required alignment tests. 2159 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2160 * is required and fill in @h with the host address for the fast path. 2161 */ 2162static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2163 TCGReg addr, MemOpIdx oi, bool is_ld) 2164{ 2165 TCGLabelQemuLdst *ldst = NULL; 2166 MemOp opc = get_memop(oi); 2167 MemOp s_bits = opc & MO_SIZE; 2168 unsigned a_mask; 2169 2170 if (tcg_use_softmmu) { 2171 h->index = TCG_REG_L0; 2172 h->ofs = 0; 2173 h->seg = 0; 2174 } else { 2175 *h = x86_guest_base; 2176 } 2177 h->base = addr; 2178 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2179 a_mask = (1 << h->aa.align) - 1; 2180 2181 if (tcg_use_softmmu) { 2182 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2183 : offsetof(CPUTLBEntry, addr_write); 2184 TCGType ttype = TCG_TYPE_I32; 2185 TCGType tlbtype = TCG_TYPE_I32; 2186 int trexw = 0, hrexw = 0, tlbrexw = 0; 2187 unsigned mem_index = get_mmuidx(oi); 2188 unsigned s_mask = (1 << s_bits) - 1; 2189 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2190 int tlb_mask; 2191 2192 ldst = new_ldst_label(s); 2193 ldst->is_ld = is_ld; 2194 ldst->oi = oi; 2195 ldst->addr_reg = addr; 2196 2197 if (TCG_TARGET_REG_BITS == 64) { 2198 ttype = s->addr_type; 2199 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2200 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2201 hrexw = P_REXW; 2202 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2203 tlbtype = TCG_TYPE_I64; 2204 tlbrexw = P_REXW; 2205 } 2206 } 2207 } 2208 2209 tcg_out_mov(s, tlbtype, TCG_REG_L0, addr); 2210 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2211 s->page_bits - CPU_TLB_ENTRY_BITS); 2212 2213 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2214 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2215 2216 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2217 fast_ofs + offsetof(CPUTLBDescFast, table)); 2218 2219 /* 2220 * If the required alignment is at least as large as the access, 2221 * simply copy the address and mask. For lesser alignments, 2222 * check that we don't cross pages for the complete access. 2223 */ 2224 if (a_mask >= s_mask) { 2225 tcg_out_mov(s, ttype, TCG_REG_L1, addr); 2226 } else { 2227 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2228 addr, s_mask - a_mask); 2229 } 2230 tlb_mask = s->page_mask | a_mask; 2231 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2232 2233 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2234 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2235 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2236 2237 /* jne slow_path */ 2238 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2239 ldst->label_ptr[0] = s->code_ptr; 2240 s->code_ptr += 4; 2241 2242 /* TLB Hit. 
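 * Load the addend; the fast path then addresses guest memory via the
 * base/index pair set up above, i.e. host address = addr + addend.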
*/ 2243 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2244 offsetof(CPUTLBEntry, addend)); 2245 } else if (a_mask) { 2246 int jcc; 2247 2248 ldst = new_ldst_label(s); 2249 ldst->is_ld = is_ld; 2250 ldst->oi = oi; 2251 ldst->addr_reg = addr; 2252 2253 /* jne slow_path */ 2254 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addr, a_mask, true, false); 2255 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2256 ldst->label_ptr[0] = s->code_ptr; 2257 s->code_ptr += 4; 2258 } 2259 2260 return ldst; 2261} 2262 2263static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2264 HostAddress h, TCGType type, MemOp memop) 2265{ 2266 bool use_movbe = false; 2267 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2268 int movop = OPC_MOVL_GvEv; 2269 2270 /* Do big-endian loads with movbe. */ 2271 if (memop & MO_BSWAP) { 2272 tcg_debug_assert(have_movbe); 2273 use_movbe = true; 2274 movop = OPC_MOVBE_GyMy; 2275 } 2276 2277 switch (memop & MO_SSIZE) { 2278 case MO_UB: 2279 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2280 h.base, h.index, 0, h.ofs); 2281 break; 2282 case MO_SB: 2283 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2284 h.base, h.index, 0, h.ofs); 2285 break; 2286 case MO_UW: 2287 if (use_movbe) { 2288 /* There is no extending movbe; only low 16-bits are modified. */ 2289 if (datalo != h.base && datalo != h.index) { 2290 /* XOR breaks dependency chains. */ 2291 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2292 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2293 datalo, h.base, h.index, 0, h.ofs); 2294 } else { 2295 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2296 datalo, h.base, h.index, 0, h.ofs); 2297 tcg_out_ext16u(s, datalo, datalo); 2298 } 2299 } else { 2300 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2301 h.base, h.index, 0, h.ofs); 2302 } 2303 break; 2304 case MO_SW: 2305 if (use_movbe) { 2306 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2307 datalo, h.base, h.index, 0, h.ofs); 2308 tcg_out_ext16s(s, type, datalo, datalo); 2309 } else { 2310 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2311 datalo, h.base, h.index, 0, h.ofs); 2312 } 2313 break; 2314 case MO_UL: 2315 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2316 h.base, h.index, 0, h.ofs); 2317 break; 2318#if TCG_TARGET_REG_BITS == 64 2319 case MO_SL: 2320 if (use_movbe) { 2321 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2322 h.base, h.index, 0, h.ofs); 2323 tcg_out_ext32s(s, datalo, datalo); 2324 } else { 2325 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2326 h.base, h.index, 0, h.ofs); 2327 } 2328 break; 2329#endif 2330 case MO_UQ: 2331 if (TCG_TARGET_REG_BITS == 64) { 2332 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2333 h.base, h.index, 0, h.ofs); 2334 break; 2335 } 2336 if (use_movbe) { 2337 TCGReg t = datalo; 2338 datalo = datahi; 2339 datahi = t; 2340 } 2341 if (h.base == datalo || h.index == datalo) { 2342 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2343 h.base, h.index, 0, h.ofs); 2344 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2345 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2346 } else { 2347 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2348 h.base, h.index, 0, h.ofs); 2349 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2350 h.base, h.index, 0, h.ofs + 4); 2351 } 2352 break; 2353 2354 case MO_128: 2355 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2356 2357 /* 2358 * Without 16-byte atomicity, use integer regs. 
* That is where we want the data, and it allows bswaps. 2360 */ 2361 if (h.aa.atom < MO_128) { 2362 if (use_movbe) { 2363 TCGReg t = datalo; 2364 datalo = datahi; 2365 datahi = t; 2366 } 2367 if (h.base == datalo || h.index == datalo) { 2368 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2369 h.base, h.index, 0, h.ofs); 2370 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2371 datalo, datahi, 0); 2372 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2373 datahi, datahi, 8); 2374 } else { 2375 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2376 h.base, h.index, 0, h.ofs); 2377 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2378 h.base, h.index, 0, h.ofs + 8); 2379 } 2380 break; 2381 } 2382 2383 /* 2384 * With 16-byte atomicity, a vector load is required. 2385 * If we already have 16-byte alignment, then VMOVDQA always works. 2386 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2387 * Else we require a runtime test for alignment for VMOVDQA; 2388 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2389 */ 2390 if (h.aa.align >= MO_128) { 2391 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2392 TCG_TMP_VEC, 0, 2393 h.base, h.index, 0, h.ofs); 2394 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2395 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2396 TCG_TMP_VEC, 0, 2397 h.base, h.index, 0, h.ofs); 2398 } else { 2399 TCGLabel *l1 = gen_new_label(); 2400 TCGLabel *l2 = gen_new_label(); 2401 int jcc; 2402 2403 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2404 tcg_out_jxx(s, jcc, l1, true); 2405 2406 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2407 TCG_TMP_VEC, 0, 2408 h.base, h.index, 0, h.ofs); 2409 tcg_out_jxx(s, JCC_JMP, l2, true); 2410 2411 tcg_out_label(s, l1); 2412 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2413 TCG_TMP_VEC, 0, 2414 h.base, h.index, 0, h.ofs); 2415 tcg_out_label(s, l2); 2416 } 2417 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2418 break; 2419 2420 default: 2421 g_assert_not_reached(); 2422 } 2423} 2424 2425static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2426 TCGReg addr, MemOpIdx oi, TCGType data_type) 2427{ 2428 TCGLabelQemuLdst *ldst; 2429 HostAddress h; 2430 2431 ldst = prepare_host_addr(s, &h, addr, oi, true); 2432 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2433 2434 if (ldst) { 2435 ldst->type = data_type; 2436 ldst->datalo_reg = datalo; 2437 ldst->datahi_reg = datahi; 2438 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2439 } 2440} 2441 2442static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2443 HostAddress h, MemOp memop) 2444{ 2445 bool use_movbe = false; 2446 int movop = OPC_MOVL_EvGv; 2447 2448 /* 2449 * Do big-endian stores with movbe or system-mode. 2450 * User-only without movbe will have its swapping done generically. 2451 */ 2452 if (memop & MO_BSWAP) { 2453 tcg_debug_assert(have_movbe); 2454 use_movbe = true; 2455 movop = OPC_MOVBE_MyGy; 2456 } 2457 2458 switch (memop & MO_SIZE) { 2459 case MO_8: 2460 /* This is handled with constraints on INDEX_op_qemu_st_i32.
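 * (On i386 only %eax/%ecx/%edx/%ebx have 8-bit subregisters, hence the
 * assert below.)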
*/ 2461 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2462 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2463 datalo, h.base, h.index, 0, h.ofs); 2464 break; 2465 case MO_16: 2466 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2467 h.base, h.index, 0, h.ofs); 2468 break; 2469 case MO_32: 2470 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2471 h.base, h.index, 0, h.ofs); 2472 break; 2473 case MO_64: 2474 if (TCG_TARGET_REG_BITS == 64) { 2475 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2476 h.base, h.index, 0, h.ofs); 2477 } else { 2478 if (use_movbe) { 2479 TCGReg t = datalo; 2480 datalo = datahi; 2481 datahi = t; 2482 } 2483 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2484 h.base, h.index, 0, h.ofs); 2485 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2486 h.base, h.index, 0, h.ofs + 4); 2487 } 2488 break; 2489 2490 case MO_128: 2491 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2492 2493 /* 2494 * Without 16-byte atomicity, use integer regs. 2495 * That is where we have the data, and it allows bswaps. 2496 */ 2497 if (h.aa.atom < MO_128) { 2498 if (use_movbe) { 2499 TCGReg t = datalo; 2500 datalo = datahi; 2501 datahi = t; 2502 } 2503 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2504 h.base, h.index, 0, h.ofs); 2505 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2506 h.base, h.index, 0, h.ofs + 8); 2507 break; 2508 } 2509 2510 /* 2511 * With 16-byte atomicity, a vector store is required. 2512 * If we already have 16-byte alignment, then VMOVDQA always works. 2513 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2514 * Else we require a runtime test for alignment for VMOVDQA; 2515 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2516 */ 2517 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2518 if (h.aa.align >= MO_128) { 2519 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2520 TCG_TMP_VEC, 0, 2521 h.base, h.index, 0, h.ofs); 2522 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2523 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2524 TCG_TMP_VEC, 0, 2525 h.base, h.index, 0, h.ofs); 2526 } else { 2527 TCGLabel *l1 = gen_new_label(); 2528 TCGLabel *l2 = gen_new_label(); 2529 int jcc; 2530 2531 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2532 tcg_out_jxx(s, jcc, l1, true); 2533 2534 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2535 TCG_TMP_VEC, 0, 2536 h.base, h.index, 0, h.ofs); 2537 tcg_out_jxx(s, JCC_JMP, l2, true); 2538 2539 tcg_out_label(s, l1); 2540 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2541 TCG_TMP_VEC, 0, 2542 h.base, h.index, 0, h.ofs); 2543 tcg_out_label(s, l2); 2544 } 2545 break; 2546 2547 default: 2548 g_assert_not_reached(); 2549 } 2550} 2551 2552static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2553 TCGReg addr, MemOpIdx oi, TCGType data_type) 2554{ 2555 TCGLabelQemuLdst *ldst; 2556 HostAddress h; 2557 2558 ldst = prepare_host_addr(s, &h, addr, oi, false); 2559 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2560 2561 if (ldst) { 2562 ldst->type = data_type; 2563 ldst->datalo_reg = datalo; 2564 ldst->datahi_reg = datahi; 2565 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2566 } 2567} 2568 2569static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2570{ 2571 /* Reuse the zeroing that exists for goto_ptr.
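 * That path loads 0 into %eax and falls into the epilogue; any other
 * return value is loaded into %eax explicitly before jumping to tb_ret_addr.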
*/ 2572 if (a0 == 0) { 2573 tcg_out_jmp(s, tcg_code_gen_epilogue); 2574 } else { 2575 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2576 tcg_out_jmp(s, tb_ret_addr); 2577 } 2578} 2579 2580static void tcg_out_goto_tb(TCGContext *s, int which) 2581{ 2582 /* 2583 * Jump displacement must be aligned for atomic patching; 2584 * see if we need to add extra nops before jump 2585 */ 2586 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2587 if (gap != 1) { 2588 tcg_out_nopn(s, gap - 1); 2589 } 2590 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2591 set_jmp_insn_offset(s, which); 2592 tcg_out32(s, 0); 2593 set_jmp_reset_offset(s, which); 2594} 2595 2596static void tcg_out_goto_ptr(TCGContext *s, TCGReg a0) 2597{ 2598 /* Jump to the given host address (could be epilogue) */ 2599 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2600} 2601 2602void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2603 uintptr_t jmp_rx, uintptr_t jmp_rw) 2604{ 2605 /* patch the branch destination */ 2606 uintptr_t addr = tb->jmp_target_addr[n]; 2607 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2608 /* no need to flush icache explicitly */ 2609} 2610 2611 2612static void tgen_add(TCGContext *s, TCGType type, 2613 TCGReg a0, TCGReg a1, TCGReg a2) 2614{ 2615 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2616 2617 if (a0 == a1) { 2618 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2619 } else if (a0 == a2) { 2620 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2621 } else { 2622 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, 0); 2623 } 2624} 2625 2626static void tgen_addi(TCGContext *s, TCGType type, 2627 TCGReg a0, TCGReg a1, tcg_target_long a2) 2628{ 2629 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2630 2631 if (a0 == a1) { 2632 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, false); 2633 } else { 2634 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2); 2635 } 2636} 2637 2638static const TCGOutOpBinary outop_add = { 2639 .base.static_constraint = C_O1_I2(r, r, re), 2640 .out_rrr = tgen_add, 2641 .out_rri = tgen_addi, 2642}; 2643 2644static void tgen_addco(TCGContext *s, TCGType type, 2645 TCGReg a0, TCGReg a1, TCGReg a2) 2646{ 2647 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2648 tgen_arithr(s, ARITH_ADD + rexw, a0, a2); 2649} 2650 2651static void tgen_addco_imm(TCGContext *s, TCGType type, 2652 TCGReg a0, TCGReg a1, tcg_target_long a2) 2653{ 2654 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2655 tgen_arithi(s, ARITH_ADD + rexw, a0, a2, true); 2656} 2657 2658static const TCGOutOpBinary outop_addco = { 2659 .base.static_constraint = C_O1_I2(r, 0, re), 2660 .out_rrr = tgen_addco, 2661 .out_rri = tgen_addco_imm, 2662}; 2663 2664static void tgen_addcio(TCGContext *s, TCGType type, 2665 TCGReg a0, TCGReg a1, TCGReg a2) 2666{ 2667 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2668 tgen_arithr(s, ARITH_ADC + rexw, a0, a2); 2669} 2670 2671static void tgen_addcio_imm(TCGContext *s, TCGType type, 2672 TCGReg a0, TCGReg a1, tcg_target_long a2) 2673{ 2674 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2675 tgen_arithi(s, ARITH_ADC + rexw, a0, a2, true); 2676} 2677 2678static const TCGOutOpBinary outop_addcio = { 2679 .base.static_constraint = C_O1_I2(r, 0, re), 2680 .out_rrr = tgen_addcio, 2681 .out_rri = tgen_addcio_imm, 2682}; 2683 2684static void tgen_addci_rrr(TCGContext *s, TCGType type, 2685 TCGReg a0, TCGReg a1, TCGReg a2) 2686{ 2687 /* Because "0O" is not a valid constraint, we must match ourselves. 
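 * If the output aliases the second input, commute the addition; otherwise
 * copy the first input into place and then add the second.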
*/ 2688 if (a0 == a2) { 2689 tgen_addcio(s, type, a0, a0, a1); 2690 } else { 2691 tcg_out_mov(s, type, a0, a1); 2692 tgen_addcio(s, type, a0, a0, a2); 2693 } 2694} 2695 2696static void tgen_addci_rri(TCGContext *s, TCGType type, 2697 TCGReg a0, TCGReg a1, tcg_target_long a2) 2698{ 2699 tcg_out_mov(s, type, a0, a1); 2700 tgen_addcio_imm(s, type, a0, a0, a2); 2701} 2702 2703static void tgen_addci_rir(TCGContext *s, TCGType type, 2704 TCGReg a0, tcg_target_long a1, TCGReg a2) 2705{ 2706 tgen_addci_rri(s, type, a0, a2, a1); 2707} 2708 2709static void tgen_addci_rii(TCGContext *s, TCGType type, TCGReg a0, 2710 tcg_target_long a1, tcg_target_long a2) 2711{ 2712 if (a2 == 0) { 2713 /* Implement 0 + 0 + C with -(x - x - c). */ 2714 tgen_arithr(s, ARITH_SBB, a0, a0); 2715 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, a0); 2716 } else { 2717 tcg_out_movi(s, type, a0, a2); 2718 tgen_addcio_imm(s, type, a0, a0, a1); 2719 } 2720} 2721 2722static const TCGOutOpAddSubCarry outop_addci = { 2723 .base.static_constraint = C_O1_I2(r, rO, re), 2724 .out_rrr = tgen_addci_rrr, 2725 .out_rri = tgen_addci_rri, 2726 .out_rir = tgen_addci_rir, 2727 .out_rii = tgen_addci_rii, 2728}; 2729 2730static void tcg_out_set_carry(TCGContext *s) 2731{ 2732 tcg_out8(s, OPC_STC); 2733} 2734 2735static void tgen_and(TCGContext *s, TCGType type, 2736 TCGReg a0, TCGReg a1, TCGReg a2) 2737{ 2738 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2739 tgen_arithr(s, ARITH_AND + rexw, a0, a2); 2740} 2741 2742static void tgen_andi(TCGContext *s, TCGType type, 2743 TCGReg a0, TCGReg a1, tcg_target_long a2) 2744{ 2745 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2746 tgen_arithi(s, ARITH_AND + rexw, a0, a2, false); 2747} 2748 2749static const TCGOutOpBinary outop_and = { 2750 .base.static_constraint = C_O1_I2(r, 0, reZ), 2751 .out_rrr = tgen_and, 2752 .out_rri = tgen_andi, 2753}; 2754 2755static void tgen_andc(TCGContext *s, TCGType type, 2756 TCGReg a0, TCGReg a1, TCGReg a2) 2757{ 2758 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2759 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2760} 2761 2762static TCGConstraintSetIndex cset_andc(TCGType type, unsigned flags) 2763{ 2764 return have_bmi1 ? C_O1_I2(r, r, r) : C_NotImplemented; 2765} 2766 2767static const TCGOutOpBinary outop_andc = { 2768 .base.static_constraint = C_Dynamic, 2769 .base.dynamic_constraint = cset_andc, 2770 .out_rrr = tgen_andc, 2771}; 2772 2773static void tgen_clz(TCGContext *s, TCGType type, 2774 TCGReg a0, TCGReg a1, TCGReg a2) 2775{ 2776 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2777 int jcc; 2778 2779 if (have_lzcnt) { 2780 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2781 jcc = JCC_JB; 2782 } else { 2783 /* Recall that the output of BSR is the index not the count. */ 2784 tcg_out_modrm(s, OPC_BSR + rexw, a0, a1); 2785 tgen_arithi(s, ARITH_XOR + rexw, a0, rexw ? 63 : 31, 0); 2786 2787 /* Since we have destroyed the flags from BSR, we have to re-test. */ 2788 jcc = tcg_out_cmp(s, TCG_COND_EQ, a1, 0, 1, rexw); 2789 } 2790 tcg_out_cmov(s, jcc, rexw, a0, a2); 2791} 2792 2793static void tgen_clzi(TCGContext *s, TCGType type, 2794 TCGReg a0, TCGReg a1, tcg_target_long a2) 2795{ 2796 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2797 tcg_out_modrm(s, OPC_LZCNT + rexw, a0, a1); 2798} 2799 2800static TCGConstraintSetIndex cset_clz(TCGType type, unsigned flags) 2801{ 2802 return have_lzcnt ? 
C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2803} 2804 2805static const TCGOutOpBinary outop_clz = { 2806 .base.static_constraint = C_Dynamic, 2807 .base.dynamic_constraint = cset_clz, 2808 .out_rrr = tgen_clz, 2809 .out_rri = tgen_clzi, 2810}; 2811 2812static void tgen_ctpop(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 2813{ 2814 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2815 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2816} 2817 2818static TCGConstraintSetIndex cset_ctpop(TCGType type, unsigned flags) 2819{ 2820 return have_popcnt ? C_O1_I1(r, r) : C_NotImplemented; 2821} 2822 2823static const TCGOutOpUnary outop_ctpop = { 2824 .base.static_constraint = C_Dynamic, 2825 .base.dynamic_constraint = cset_ctpop, 2826 .out_rr = tgen_ctpop, 2827}; 2828 2829static void tgen_ctz(TCGContext *s, TCGType type, 2830 TCGReg a0, TCGReg a1, TCGReg a2) 2831{ 2832 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2833 int jcc; 2834 2835 if (have_bmi1) { 2836 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2837 jcc = JCC_JB; 2838 } else { 2839 tcg_out_modrm(s, OPC_BSF + rexw, a0, a1); 2840 jcc = JCC_JE; 2841 } 2842 tcg_out_cmov(s, jcc, rexw, a0, a2); 2843} 2844 2845static void tgen_ctzi(TCGContext *s, TCGType type, 2846 TCGReg a0, TCGReg a1, tcg_target_long a2) 2847{ 2848 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2849 tcg_out_modrm(s, OPC_TZCNT + rexw, a0, a1); 2850} 2851 2852static TCGConstraintSetIndex cset_ctz(TCGType type, unsigned flags) 2853{ 2854 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 2855} 2856 2857static const TCGOutOpBinary outop_ctz = { 2858 .base.static_constraint = C_Dynamic, 2859 .base.dynamic_constraint = cset_ctz, 2860 .out_rrr = tgen_ctz, 2861 .out_rri = tgen_ctzi, 2862}; 2863 2864static const TCGOutOpBinary outop_divs = { 2865 .base.static_constraint = C_NotImplemented, 2866}; 2867 2868static void tgen_divs2(TCGContext *s, TCGType type, 2869 TCGReg a0, TCGReg a1, TCGReg a4) 2870{ 2871 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2872 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, a4); 2873} 2874 2875static const TCGOutOpDivRem outop_divs2 = { 2876 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2877 .out_rr01r = tgen_divs2, 2878}; 2879 2880static const TCGOutOpBinary outop_divu = { 2881 .base.static_constraint = C_NotImplemented, 2882}; 2883 2884static void tgen_divu2(TCGContext *s, TCGType type, 2885 TCGReg a0, TCGReg a1, TCGReg a4) 2886{ 2887 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2888 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, a4); 2889} 2890 2891static const TCGOutOpDivRem outop_divu2 = { 2892 .base.static_constraint = C_O2_I3(a, d, 0, 1, r), 2893 .out_rr01r = tgen_divu2, 2894}; 2895 2896static const TCGOutOpBinary outop_eqv = { 2897 .base.static_constraint = C_NotImplemented, 2898}; 2899 2900#if TCG_TARGET_REG_BITS == 64 2901static void tgen_extrh_i64_i32(TCGContext *s, TCGType t, TCGReg a0, TCGReg a1) 2902{ 2903 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2904} 2905 2906static const TCGOutOpUnary outop_extrh_i64_i32 = { 2907 .base.static_constraint = C_O1_I1(r, 0), 2908 .out_rr = tgen_extrh_i64_i32, 2909}; 2910#endif /* TCG_TARGET_REG_BITS == 64 */ 2911 2912static void tgen_mul(TCGContext *s, TCGType type, 2913 TCGReg a0, TCGReg a1, TCGReg a2) 2914{ 2915 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2916 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2917} 2918 2919static void tgen_muli(TCGContext *s, TCGType type, 2920 TCGReg a0, TCGReg a1, tcg_target_long a2) 2921{ 2922 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2923 2924 if (a2 == (int8_t)a2) { 2925 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2926 tcg_out8(s, a2); 2927 } else { 2928 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2929 tcg_out32(s, a2); 2930 } 2931} 2932 2933static const TCGOutOpBinary outop_mul = { 2934 .base.static_constraint = C_O1_I2(r, 0, re), 2935 .out_rrr = tgen_mul, 2936 .out_rri = tgen_muli, 2937}; 2938 2939static void tgen_muls2(TCGContext *s, TCGType type, 2940 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2941{ 2942 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2943 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, a3); 2944} 2945 2946static const TCGOutOpMul2 outop_muls2 = { 2947 .base.static_constraint = C_O2_I2(a, d, a, r), 2948 .out_rrrr = tgen_muls2, 2949}; 2950 2951static const TCGOutOpBinary outop_mulsh = { 2952 .base.static_constraint = C_NotImplemented, 2953}; 2954 2955static const TCGOutOpBinary outop_muluh = { 2956 .base.static_constraint = C_NotImplemented, 2957}; 2958 2959static void tgen_mulu2(TCGContext *s, TCGType type, 2960 TCGReg a0, TCGReg a1, TCGReg a2, TCGReg a3) 2961{ 2962 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2963 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, a3); 2964} 2965 2966static const TCGOutOpMul2 outop_mulu2 = { 2967 .base.static_constraint = C_O2_I2(a, d, a, r), 2968 .out_rrrr = tgen_mulu2, 2969}; 2970 2971static const TCGOutOpBinary outop_nand = { 2972 .base.static_constraint = C_NotImplemented, 2973}; 2974 2975static const TCGOutOpBinary outop_nor = { 2976 .base.static_constraint = C_NotImplemented, 2977}; 2978 2979static void tgen_or(TCGContext *s, TCGType type, 2980 TCGReg a0, TCGReg a1, TCGReg a2) 2981{ 2982 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2983 tgen_arithr(s, ARITH_OR + rexw, a0, a2); 2984} 2985 2986static void tgen_ori(TCGContext *s, TCGType type, 2987 TCGReg a0, TCGReg a1, tcg_target_long a2) 2988{ 2989 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2990 tgen_arithi(s, ARITH_OR + rexw, a0, a2, false); 2991} 2992 2993static const TCGOutOpBinary outop_or = { 2994 .base.static_constraint = C_O1_I2(r, 0, re), 2995 .out_rrr = tgen_or, 2996 .out_rri = tgen_ori, 2997}; 2998 2999static const TCGOutOpBinary outop_orc = { 3000 .base.static_constraint = C_NotImplemented, 3001}; 3002 3003static const TCGOutOpBinary outop_rems = { 3004 .base.static_constraint = C_NotImplemented, 3005}; 3006 3007static const TCGOutOpBinary outop_remu = { 3008 .base.static_constraint = C_NotImplemented, 3009}; 3010 3011static void tgen_rotl(TCGContext *s, TCGType type, 3012 TCGReg a0, TCGReg a1, TCGReg a2) 3013{ 3014 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3015 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROL, a0); 3016} 3017 3018static void tgen_rotli(TCGContext *s, TCGType type, 3019 TCGReg a0, TCGReg a1, tcg_target_long a2) 3020{ 3021 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3022 tcg_out_shifti(s, SHIFT_ROL + rexw, a0, a2); 3023} 3024 3025static const TCGOutOpBinary outop_rotl = { 3026 .base.static_constraint = C_O1_I2(r, 0, ci), 3027 .out_rrr = tgen_rotl, 3028 .out_rri = tgen_rotli, 3029}; 3030 3031static void tgen_rotr(TCGContext *s, TCGType type, 3032 TCGReg a0, TCGReg a1, TCGReg a2) 3033{ 3034 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3035 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_ROR, a0); 3036} 3037 3038static void tgen_rotri(TCGContext *s, TCGType type, 3039 TCGReg a0, TCGReg a1, tcg_target_long a2) 3040{ 3041 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 3042 tcg_out_shifti(s, SHIFT_ROR + rexw, a0, a2); 3043} 3044 3045static const TCGOutOpBinary outop_rotr = { 3046 .base.static_constraint = C_O1_I2(r, 0, ci), 3047 .out_rrr = tgen_rotr, 3048 .out_rri = tgen_rotri, 3049}; 3050 3051static TCGConstraintSetIndex cset_shift(TCGType type, unsigned flags) 3052{ 3053 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3054} 3055 3056static void tgen_sar(TCGContext *s, TCGType type, 3057 TCGReg a0, TCGReg a1, TCGReg a2) 3058{ 3059 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3060 if (have_bmi2) { 3061 tcg_out_vex_modrm(s, OPC_SARX + rexw, a0, a2, a1); 3062 } else { 3063 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SAR, a0); 3064 } 3065} 3066 3067static void tgen_sari(TCGContext *s, TCGType type, 3068 TCGReg a0, TCGReg a1, tcg_target_long a2) 3069{ 3070 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3071 3072 tcg_out_mov(s, type, a0, a1); 3073 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, a2); 3074} 3075 3076static const TCGOutOpBinary outop_sar = { 3077 .base.static_constraint = C_Dynamic, 3078 .base.dynamic_constraint = cset_shift, 3079 .out_rrr = tgen_sar, 3080 .out_rri = tgen_sari, 3081}; 3082 3083static void tgen_shl(TCGContext *s, TCGType type, 3084 TCGReg a0, TCGReg a1, TCGReg a2) 3085{ 3086 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3087 if (have_bmi2) { 3088 tcg_out_vex_modrm(s, OPC_SHLX + rexw, a0, a2, a1); 3089 } else { 3090 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHL, a0); 3091 } 3092} 3093 3094static void tgen_shli(TCGContext *s, TCGType type, 3095 TCGReg a0, TCGReg a1, tcg_target_long a2) 3096{ 3097 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3098 3099 /* For small constant 3-operand shift, use LEA. */ 3100 if (a0 != a1 && a2 >= 1 && a2 <= 3) { 3101 if (a2 == 1) { 3102 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 3103 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 3104 } else { 3105 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 3106 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 3107 } 3108 return; 3109 } 3110 tcg_out_mov(s, type, a0, a1); 3111 tcg_out_shifti(s, SHIFT_SHL + rexw, a0, a2); 3112} 3113 3114static const TCGOutOpBinary outop_shl = { 3115 .base.static_constraint = C_Dynamic, 3116 .base.dynamic_constraint = cset_shift, 3117 .out_rrr = tgen_shl, 3118 .out_rri = tgen_shli, 3119}; 3120 3121static void tgen_shr(TCGContext *s, TCGType type, 3122 TCGReg a0, TCGReg a1, TCGReg a2) 3123{ 3124 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3125 if (have_bmi2) { 3126 tcg_out_vex_modrm(s, OPC_SHRX + rexw, a0, a2, a1); 3127 } else { 3128 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, SHIFT_SHR, a0); 3129 } 3130} 3131 3132static void tgen_shri(TCGContext *s, TCGType type, 3133 TCGReg a0, TCGReg a1, tcg_target_long a2) 3134{ 3135 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3136 3137 tcg_out_mov(s, type, a0, a1); 3138 tcg_out_shifti(s, SHIFT_SHR + rexw, a0, a2); 3139} 3140 3141static const TCGOutOpBinary outop_shr = { 3142 .base.static_constraint = C_Dynamic, 3143 .base.dynamic_constraint = cset_shift, 3144 .out_rrr = tgen_shr, 3145 .out_rri = tgen_shri, 3146}; 3147 3148static void tgen_sub(TCGContext *s, TCGType type, 3149 TCGReg a0, TCGReg a1, TCGReg a2) 3150{ 3151 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 3152 tgen_arithr(s, ARITH_SUB + rexw, a0, a2); 3153} 3154 3155static const TCGOutOpSubtract outop_sub = { 3156 .base.static_constraint = C_O1_I2(r, 0, r), 3157 .out_rrr = tgen_sub, 3158}; 3159 3160static void tgen_subbo_rri(TCGContext *s, TCGType type, 3161 TCGReg a0, TCGReg a1, tcg_target_long a2) 3162{ 3163 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3164 tgen_arithi(s, ARITH_SUB + rexw, a0, a2, 1); 3165} 3166 3167static const TCGOutOpAddSubCarry outop_subbo = { 3168 .base.static_constraint = C_O1_I2(r, 0, re), 3169 .out_rrr = tgen_sub, 3170 .out_rri = tgen_subbo_rri, 3171}; 3172 3173static void tgen_subbio_rrr(TCGContext *s, TCGType type, 3174 TCGReg a0, TCGReg a1, TCGReg a2) 3175{ 3176 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3177 tgen_arithr(s, ARITH_SBB + rexw, a0, a2); 3178} 3179 3180static void tgen_subbio_rri(TCGContext *s, TCGType type, 3181 TCGReg a0, TCGReg a1, tcg_target_long a2) 3182{ 3183 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3184 tgen_arithi(s, ARITH_SBB + rexw, a0, a2, 1); 3185} 3186 3187static const TCGOutOpAddSubCarry outop_subbio = { 3188 .base.static_constraint = C_O1_I2(r, 0, re), 3189 .out_rrr = tgen_subbio_rrr, 3190 .out_rri = tgen_subbio_rri, 3191}; 3192 3193#define outop_subbi outop_subbio 3194 3195static void tcg_out_set_borrow(TCGContext *s) 3196{ 3197 tcg_out8(s, OPC_STC); 3198} 3199 3200static void tgen_xor(TCGContext *s, TCGType type, 3201 TCGReg a0, TCGReg a1, TCGReg a2) 3202{ 3203 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3204 tgen_arithr(s, ARITH_XOR + rexw, a0, a2); 3205} 3206 3207static void tgen_xori(TCGContext *s, TCGType type, 3208 TCGReg a0, TCGReg a1, tcg_target_long a2) 3209{ 3210 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3211 tgen_arithi(s, ARITH_XOR + rexw, a0, a2, false); 3212} 3213 3214static const TCGOutOpBinary outop_xor = { 3215 .base.static_constraint = C_O1_I2(r, 0, re), 3216 .out_rrr = tgen_xor, 3217 .out_rri = tgen_xori, 3218}; 3219 3220static void tgen_bswap16(TCGContext *s, TCGType type, 3221 TCGReg a0, TCGReg a1, unsigned flags) 3222{ 3223 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3224 3225 if (flags & TCG_BSWAP_OS) { 3226 /* Output must be sign-extended. */ 3227 if (rexw) { 3228 tcg_out_bswap64(s, a0); 3229 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 3230 } else { 3231 tcg_out_bswap32(s, a0); 3232 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 3233 } 3234 } else if ((flags & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 3235 /* Output must be zero-extended, but input isn't. 
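 * Swap all 32 bits, then shift the swapped value right by 16 so the
 * result is zero-extended.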
*/ 3236 tcg_out_bswap32(s, a0); 3237 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 3238 } else { 3239 tcg_out_rolw_8(s, a0); 3240 } 3241} 3242 3243static const TCGOutOpBswap outop_bswap16 = { 3244 .base.static_constraint = C_O1_I1(r, 0), 3245 .out_rr = tgen_bswap16, 3246}; 3247 3248static void tgen_bswap32(TCGContext *s, TCGType type, 3249 TCGReg a0, TCGReg a1, unsigned flags) 3250{ 3251 tcg_out_bswap32(s, a0); 3252 if (flags & TCG_BSWAP_OS) { 3253 tcg_out_ext32s(s, a0, a0); 3254 } 3255} 3256 3257static const TCGOutOpBswap outop_bswap32 = { 3258 .base.static_constraint = C_O1_I1(r, 0), 3259 .out_rr = tgen_bswap32, 3260}; 3261 3262#if TCG_TARGET_REG_BITS == 64 3263static void tgen_bswap64(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3264{ 3265 tcg_out_bswap64(s, a0); 3266} 3267 3268static const TCGOutOpUnary outop_bswap64 = { 3269 .base.static_constraint = C_O1_I1(r, 0), 3270 .out_rr = tgen_bswap64, 3271}; 3272#endif 3273 3274static void tgen_neg(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3275{ 3276 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3277 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 3278} 3279 3280static const TCGOutOpUnary outop_neg = { 3281 .base.static_constraint = C_O1_I1(r, 0), 3282 .out_rr = tgen_neg, 3283}; 3284 3285static void tgen_not(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1) 3286{ 3287 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3288 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 3289} 3290 3291static const TCGOutOpUnary outop_not = { 3292 .base.static_constraint = C_O1_I1(r, 0), 3293 .out_rr = tgen_not, 3294}; 3295 3296static void tgen_deposit(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1, 3297 TCGReg a2, unsigned ofs, unsigned len) 3298{ 3299 if (ofs == 0 && len == 8) { 3300 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 3301 } else if (ofs == 0 && len == 16) { 3302 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 3303 } else if (TCG_TARGET_REG_BITS == 32 && ofs == 8 && len == 8) { 3304 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 3305 } else { 3306 g_assert_not_reached(); 3307 } 3308} 3309 3310static void tgen_depositi(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1, 3311 tcg_target_long a2, unsigned ofs, unsigned len) 3312{ 3313 if (ofs == 0 && len == 8) { 3314 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 0, a0, 0); 3315 tcg_out8(s, a2); 3316 } else if (ofs == 0 && len == 16) { 3317 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 0, a0, 0); 3318 tcg_out16(s, a2); 3319 } else if (TCG_TARGET_REG_BITS == 32 && ofs == 8 && len == 8) { 3320 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 3321 tcg_out8(s, a2); 3322 } else { 3323 g_assert_not_reached(); 3324 } 3325} 3326 3327static const TCGOutOpDeposit outop_deposit = { 3328 .base.static_constraint = C_O1_I2(q, 0, qi), 3329 .out_rrr = tgen_deposit, 3330 .out_rri = tgen_depositi, 3331}; 3332 3333static void tgen_extract(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1, 3334 unsigned ofs, unsigned len) 3335{ 3336 if (ofs == 0) { 3337 switch (len) { 3338 case 8: 3339 tcg_out_ext8u(s, a0, a1); 3340 return; 3341 case 16: 3342 tcg_out_ext16u(s, a0, a1); 3343 return; 3344 case 32: 3345 tcg_out_ext32u(s, a0, a1); 3346 return; 3347 } 3348 } else if (TCG_TARGET_REG_BITS == 64 && ofs + len == 32) { 3349 /* This is a 32-bit zero-extending right shift. */ 3350 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3351 tcg_out_shifti(s, SHIFT_SHR, a0, ofs); 3352 return; 3353 } else if (ofs == 8 && len == 8) { 3354 /* 3355 * On the off-chance that we can use the high-byte registers. 
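 * In that case the extract is a single MOVZBL from %ah/%ch/%dh/%bh.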
3356 * Otherwise we emit the same ext16 + shift pattern that we 3357 * would have gotten from the normal tcg-op.c expansion. 3358 */ 3359 if (a1 < 4 && (TCG_TARGET_REG_BITS == 32 || a0 < 8)) { 3360 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3361 } else { 3362 tcg_out_ext16u(s, a0, a1); 3363 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3364 } 3365 return; 3366 } 3367 g_assert_not_reached(); 3368} 3369 3370static const TCGOutOpExtract outop_extract = { 3371 .base.static_constraint = C_O1_I1(r, r), 3372 .out_rr = tgen_extract, 3373}; 3374 3375static void tgen_sextract(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1, 3376 unsigned ofs, unsigned len) 3377{ 3378 if (ofs == 0) { 3379 switch (len) { 3380 case 8: 3381 tcg_out_ext8s(s, type, a0, a1); 3382 return; 3383 case 16: 3384 tcg_out_ext16s(s, type, a0, a1); 3385 return; 3386 case 32: 3387 tcg_out_ext32s(s, a0, a1); 3388 return; 3389 } 3390 } else if (ofs == 8 && len == 8) { 3391 if (type == TCG_TYPE_I32 && a1 < 4 && a0 < 8) { 3392 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3393 } else { 3394 tcg_out_ext16s(s, type, a0, a1); 3395 tgen_sari(s, type, a0, a0, 8); 3396 } 3397 return; 3398 } 3399 g_assert_not_reached(); 3400} 3401 3402static const TCGOutOpExtract outop_sextract = { 3403 .base.static_constraint = C_O1_I1(r, r), 3404 .out_rr = tgen_sextract, 3405}; 3406 3407static void tgen_extract2(TCGContext *s, TCGType type, TCGReg a0, 3408 TCGReg a1, TCGReg a2, unsigned shr) 3409{ 3410 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3411 3412 /* Note that SHRD outputs to the r/m operand. */ 3413 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3414 tcg_out8(s, shr); 3415} 3416 3417static const TCGOutOpExtract2 outop_extract2 = { 3418 .base.static_constraint = C_O1_I2(r, 0, r), 3419 .out_rrr = tgen_extract2, 3420}; 3421 3422static void tgen_ld8u(TCGContext *s, TCGType type, TCGReg dest, 3423 TCGReg base, ptrdiff_t offset) 3424{ 3425 tcg_out_modrm_offset(s, OPC_MOVZBL, dest, base, offset); 3426} 3427 3428static const TCGOutOpLoad outop_ld8u = { 3429 .base.static_constraint = C_O1_I1(r, r), 3430 .out = tgen_ld8u, 3431}; 3432 3433static void tgen_ld8s(TCGContext *s, TCGType type, TCGReg dest, 3434 TCGReg base, ptrdiff_t offset) 3435{ 3436 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 3437 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, dest, base, offset); 3438} 3439 3440static const TCGOutOpLoad outop_ld8s = { 3441 .base.static_constraint = C_O1_I1(r, r), 3442 .out = tgen_ld8s, 3443}; 3444 3445static void tgen_ld16u(TCGContext *s, TCGType type, TCGReg dest, 3446 TCGReg base, ptrdiff_t offset) 3447{ 3448 tcg_out_modrm_offset(s, OPC_MOVZWL, dest, base, offset); 3449} 3450 3451static const TCGOutOpLoad outop_ld16u = { 3452 .base.static_constraint = C_O1_I1(r, r), 3453 .out = tgen_ld16u, 3454}; 3455 3456static void tgen_ld16s(TCGContext *s, TCGType type, TCGReg dest, 3457 TCGReg base, ptrdiff_t offset) 3458{ 3459 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 3460 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, dest, base, offset); 3461} 3462 3463static const TCGOutOpLoad outop_ld16s = { 3464 .base.static_constraint = C_O1_I1(r, r), 3465 .out = tgen_ld16s, 3466}; 3467 3468#if TCG_TARGET_REG_BITS == 64 3469static void tgen_ld32u(TCGContext *s, TCGType type, TCGReg dest, 3470 TCGReg base, ptrdiff_t offset) 3471{ 3472 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, dest, base, offset); 3473} 3474 3475static const TCGOutOpLoad outop_ld32u = { 3476 .base.static_constraint = C_O1_I1(r, r), 3477 .out = tgen_ld32u, 3478}; 3479 3480static void tgen_ld32s(TCGContext *s, TCGType type, TCGReg dest, 3481 TCGReg base, ptrdiff_t offset) 3482{ 3483 tcg_out_modrm_offset(s, OPC_MOVSLQ, dest, base, offset); 3484} 3485 3486static const TCGOutOpLoad outop_ld32s = { 3487 .base.static_constraint = C_O1_I1(r, r), 3488 .out = tgen_ld32s, 3489}; 3490#endif 3491 3492static void tgen_st8_r(TCGContext *s, TCGType type, TCGReg data, 3493 TCGReg base, ptrdiff_t offset) 3494{ 3495 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, data, base, offset); 3496} 3497 3498static void tgen_st8_i(TCGContext *s, TCGType type, tcg_target_long data, 3499 TCGReg base, ptrdiff_t offset) 3500{ 3501 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, base, offset); 3502 tcg_out8(s, data); 3503} 3504 3505static const TCGOutOpStore outop_st8 = { 3506 .base.static_constraint = C_O0_I2(qi, r), 3507 .out_r = tgen_st8_r, 3508 .out_i = tgen_st8_i, 3509}; 3510 3511static void tgen_st16_r(TCGContext *s, TCGType type, TCGReg data, 3512 TCGReg base, ptrdiff_t offset) 3513{ 3514 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, data, base, offset); 3515} 3516 3517static void tgen_st16_i(TCGContext *s, TCGType type, tcg_target_long data, 3518 TCGReg base, ptrdiff_t offset) 3519{ 3520 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, base, offset); 3521 tcg_out16(s, data); 3522} 3523 3524static const TCGOutOpStore outop_st16 = { 3525 .base.static_constraint = C_O0_I2(ri, r), 3526 .out_r = tgen_st16_r, 3527 .out_i = tgen_st16_i, 3528}; 3529 3530static void tgen_st_i(TCGContext *s, TCGType type, tcg_target_long data, 3531 TCGReg base, ptrdiff_t offset) 3532{ 3533 bool ok = tcg_out_sti(s, type, data, base, offset); 3534 tcg_debug_assert(ok); 3535} 3536 3537static const TCGOutOpStore outop_st = { 3538 .base.static_constraint = C_O0_I2(re, r), 3539 .out_r = tcg_out_st, 3540 .out_i = tgen_st_i, 3541}; 3542 3543static void tcg_out_op(TCGContext *s, TCGOpcode opc, TCGType type, 3544 const TCGArg args[TCG_MAX_OP_ARGS], 3545 const int const_args[TCG_MAX_OP_ARGS]) 3546{ 3547 TCGArg a0, a1, a2; 3548 3549 /* Hoist the loads of the most common arguments. 
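 * (args[3] and beyond are only read by the cases that need them.)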
*/ 3550 a0 = args[0]; 3551 a1 = args[1]; 3552 a2 = args[2]; 3553 3554 switch (opc) { 3555 case INDEX_op_qemu_ld_i32: 3556 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32); 3557 break; 3558 case INDEX_op_qemu_ld_i64: 3559 if (TCG_TARGET_REG_BITS == 64) { 3560 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64); 3561 } else { 3562 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3563 } 3564 break; 3565 case INDEX_op_qemu_ld_i128: 3566 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3567 tcg_out_qemu_ld(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3568 break; 3569 3570 case INDEX_op_qemu_st_i32: 3571 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32); 3572 break; 3573 case INDEX_op_qemu_st_i64: 3574 if (TCG_TARGET_REG_BITS == 64) { 3575 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64); 3576 } else { 3577 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I64); 3578 } 3579 break; 3580 case INDEX_op_qemu_st_i128: 3581 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3582 tcg_out_qemu_st(s, a0, a1, a2, args[3], TCG_TYPE_I128); 3583 break; 3584 3585 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3586 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3587 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3588 default: 3589 g_assert_not_reached(); 3590 } 3591} 3592 3593static int const umin_insn[4] = { 3594 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3595}; 3596 3597static int const umax_insn[4] = { 3598 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3599}; 3600 3601static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3602 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3603{ 3604 static int const cmpeq_insn[4] = { 3605 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3606 }; 3607 static int const cmpgt_insn[4] = { 3608 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3609 }; 3610 3611 enum { 3612 NEED_INV = 1, 3613 NEED_SWAP = 2, 3614 NEED_UMIN = 4, 3615 NEED_UMAX = 8, 3616 INVALID = 16, 3617 }; 3618 static const uint8_t cond_fixup[16] = { 3619 [0 ... 15] = INVALID, 3620 [TCG_COND_EQ] = 0, 3621 [TCG_COND_GT] = 0, 3622 [TCG_COND_NE] = NEED_INV, 3623 [TCG_COND_LE] = NEED_INV, 3624 [TCG_COND_LT] = NEED_SWAP, 3625 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3626 [TCG_COND_LEU] = NEED_UMIN, 3627 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3628 [TCG_COND_GEU] = NEED_UMAX, 3629 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3630 }; 3631 int fixup = cond_fixup[cond]; 3632 3633 assert(!(fixup & INVALID)); 3634 3635 if (fixup & NEED_INV) { 3636 cond = tcg_invert_cond(cond); 3637 } 3638 3639 if (fixup & NEED_SWAP) { 3640 TCGReg swap = v1; 3641 v1 = v2; 3642 v2 = swap; 3643 cond = tcg_swap_cond(cond); 3644 } 3645 3646 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3647 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3648 3649 /* avx2 does not have 64-bit min/max; adjusted during expand. 
*/ 3650 assert(vece <= MO_32); 3651 3652 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3653 v2 = TCG_TMP_VEC; 3654 cond = TCG_COND_EQ; 3655 } 3656 3657 switch (cond) { 3658 case TCG_COND_EQ: 3659 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3660 break; 3661 case TCG_COND_GT: 3662 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3663 break; 3664 default: 3665 g_assert_not_reached(); 3666 } 3667 return fixup & NEED_INV; 3668} 3669 3670static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3671 TCGReg v1, TCGReg v2, TCGCond cond) 3672{ 3673 static const int cmpm_insn[2][4] = { 3674 { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ }, 3675 { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ } 3676 }; 3677 static const int testm_insn[4] = { 3678 OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ 3679 }; 3680 static const int testnm_insn[4] = { 3681 OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ 3682 }; 3683 3684 static const int cond_ext[16] = { 3685 [TCG_COND_EQ] = 0, 3686 [TCG_COND_NE] = 4, 3687 [TCG_COND_LT] = 1, 3688 [TCG_COND_LTU] = 1, 3689 [TCG_COND_LE] = 2, 3690 [TCG_COND_LEU] = 2, 3691 [TCG_COND_NEVER] = 3, 3692 [TCG_COND_GE] = 5, 3693 [TCG_COND_GEU] = 5, 3694 [TCG_COND_GT] = 6, 3695 [TCG_COND_GTU] = 6, 3696 [TCG_COND_ALWAYS] = 7, 3697 }; 3698 3699 switch (cond) { 3700 case TCG_COND_TSTNE: 3701 tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type); 3702 break; 3703 case TCG_COND_TSTEQ: 3704 tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type); 3705 break; 3706 default: 3707 tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece], 3708 /* k1 */ 1, v1, v2, type); 3709 tcg_out8(s, cond_ext[cond]); 3710 break; 3711 } 3712} 3713 3714static void tcg_out_k1_to_vec(TCGContext *s, TCGType type, 3715 unsigned vece, TCGReg dest) 3716{ 3717 static const int movm_insn[] = { 3718 OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q 3719 }; 3720 tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type); 3721} 3722 3723static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3724 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3725{ 3726 /* 3727 * With avx512, we have a complete set of comparisons into mask. 3728 * Unless there's a single insn expansion for the comparison, 3729 * expand via a mask in k1. 3730 */ 3731 if ((vece <= MO_16 ? have_avx512bw : have_avx512dq) 3732 && cond != TCG_COND_EQ 3733 && cond != TCG_COND_LT 3734 && cond != TCG_COND_GT) { 3735 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3736 tcg_out_k1_to_vec(s, type, vece, v0); 3737 return; 3738 } 3739 3740 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3741 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3742 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3743 } 3744} 3745 3746static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3747 TCGReg v0, TCGReg c1, TCGReg c2, 3748 TCGReg v3, TCGReg v4, TCGCond cond) 3749{ 3750 static const int vpblendm_insn[] = { 3751 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3752 }; 3753 bool z = false; 3754 3755 /* Swap to place constant in V4 to take advantage of zero-masking.
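 * That is, when the true operand is the constant zero, invert the condition
 * so the zero lanes can come from the {z} zero-masking form of VPBLENDM.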
*/ 3756 if (!v3) { 3757 z = true; 3758 v3 = v4; 3759 cond = tcg_invert_cond(cond); 3760 } 3761 3762 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3763 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3764 /* k1 */1, z, type); 3765} 3766 3767static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3768 TCGReg v0, TCGReg c1, TCGReg c2, 3769 TCGReg v3, TCGReg v4, TCGCond cond) 3770{ 3771 bool inv; 3772 3773 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3774 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3775 return; 3776 } 3777 3778 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3779 3780 /* 3781 * Since XMM0 is 16, the only way we get 0 into V3 3782 * is via the constant zero constraint. 3783 */ 3784 if (!v3) { 3785 if (inv) { 3786 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3787 } else { 3788 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3789 } 3790 } else { 3791 if (inv) { 3792 TCGReg swap = v3; 3793 v3 = v4; 3794 v4 = swap; 3795 } 3796 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3797 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3798 } 3799} 3800 3801static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3802 unsigned vecl, unsigned vece, 3803 const TCGArg args[TCG_MAX_OP_ARGS], 3804 const int const_args[TCG_MAX_OP_ARGS]) 3805{ 3806 static int const add_insn[4] = { 3807 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3808 }; 3809 static int const ssadd_insn[4] = { 3810 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3811 }; 3812 static int const usadd_insn[4] = { 3813 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3814 }; 3815 static int const sub_insn[4] = { 3816 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3817 }; 3818 static int const sssub_insn[4] = { 3819 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3820 }; 3821 static int const ussub_insn[4] = { 3822 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3823 }; 3824 static int const mul_insn[4] = { 3825 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3826 }; 3827 static int const shift_imm_insn[4] = { 3828 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3829 }; 3830 static int const punpckl_insn[4] = { 3831 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3832 }; 3833 static int const punpckh_insn[4] = { 3834 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3835 }; 3836 static int const packss_insn[4] = { 3837 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3838 }; 3839 static int const packus_insn[4] = { 3840 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3841 }; 3842 static int const smin_insn[4] = { 3843 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3844 }; 3845 static int const smax_insn[4] = { 3846 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3847 }; 3848 static int const rotlv_insn[4] = { 3849 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3850 }; 3851 static int const rotrv_insn[4] = { 3852 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3853 }; 3854 static int const shlv_insn[4] = { 3855 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3856 }; 3857 static int const shrv_insn[4] = { 3858 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3859 }; 3860 static int const sarv_insn[4] = { 3861 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3862 }; 3863 static int const shls_insn[4] = { 3864 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3865 }; 3866 static int const shrs_insn[4] = { 3867 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3868 }; 3869 static int const sars_insn[4] = { 3870 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3871 
}; 3872 static int const vpshldi_insn[4] = { 3873 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3874 }; 3875 static int const vpshldv_insn[4] = { 3876 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3877 }; 3878 static int const vpshrdv_insn[4] = { 3879 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3880 }; 3881 static int const abs_insn[4] = { 3882 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3883 }; 3884 3885 TCGType type = vecl + TCG_TYPE_V64; 3886 int insn, sub; 3887 TCGArg a0, a1, a2, a3; 3888 3889 a0 = args[0]; 3890 a1 = args[1]; 3891 a2 = args[2]; 3892 3893 switch (opc) { 3894 case INDEX_op_add_vec: 3895 insn = add_insn[vece]; 3896 goto gen_simd; 3897 case INDEX_op_ssadd_vec: 3898 insn = ssadd_insn[vece]; 3899 goto gen_simd; 3900 case INDEX_op_usadd_vec: 3901 insn = usadd_insn[vece]; 3902 goto gen_simd; 3903 case INDEX_op_sub_vec: 3904 insn = sub_insn[vece]; 3905 goto gen_simd; 3906 case INDEX_op_sssub_vec: 3907 insn = sssub_insn[vece]; 3908 goto gen_simd; 3909 case INDEX_op_ussub_vec: 3910 insn = ussub_insn[vece]; 3911 goto gen_simd; 3912 case INDEX_op_mul_vec: 3913 insn = mul_insn[vece]; 3914 goto gen_simd; 3915 case INDEX_op_and_vec: 3916 insn = OPC_PAND; 3917 goto gen_simd; 3918 case INDEX_op_or_vec: 3919 insn = OPC_POR; 3920 goto gen_simd; 3921 case INDEX_op_xor_vec: 3922 insn = OPC_PXOR; 3923 goto gen_simd; 3924 case INDEX_op_smin_vec: 3925 insn = smin_insn[vece]; 3926 goto gen_simd; 3927 case INDEX_op_umin_vec: 3928 insn = umin_insn[vece]; 3929 goto gen_simd; 3930 case INDEX_op_smax_vec: 3931 insn = smax_insn[vece]; 3932 goto gen_simd; 3933 case INDEX_op_umax_vec: 3934 insn = umax_insn[vece]; 3935 goto gen_simd; 3936 case INDEX_op_shlv_vec: 3937 insn = shlv_insn[vece]; 3938 goto gen_simd; 3939 case INDEX_op_shrv_vec: 3940 insn = shrv_insn[vece]; 3941 goto gen_simd; 3942 case INDEX_op_sarv_vec: 3943 insn = sarv_insn[vece]; 3944 goto gen_simd; 3945 case INDEX_op_rotlv_vec: 3946 insn = rotlv_insn[vece]; 3947 goto gen_simd; 3948 case INDEX_op_rotrv_vec: 3949 insn = rotrv_insn[vece]; 3950 goto gen_simd; 3951 case INDEX_op_shls_vec: 3952 insn = shls_insn[vece]; 3953 goto gen_simd; 3954 case INDEX_op_shrs_vec: 3955 insn = shrs_insn[vece]; 3956 goto gen_simd; 3957 case INDEX_op_sars_vec: 3958 insn = sars_insn[vece]; 3959 goto gen_simd; 3960 case INDEX_op_x86_punpckl_vec: 3961 insn = punpckl_insn[vece]; 3962 goto gen_simd; 3963 case INDEX_op_x86_punpckh_vec: 3964 insn = punpckh_insn[vece]; 3965 goto gen_simd; 3966 case INDEX_op_x86_packss_vec: 3967 insn = packss_insn[vece]; 3968 goto gen_simd; 3969 case INDEX_op_x86_packus_vec: 3970 insn = packus_insn[vece]; 3971 goto gen_simd; 3972 case INDEX_op_x86_vpshldv_vec: 3973 insn = vpshldv_insn[vece]; 3974 a1 = a2; 3975 a2 = args[3]; 3976 goto gen_simd; 3977 case INDEX_op_x86_vpshrdv_vec: 3978 insn = vpshrdv_insn[vece]; 3979 a1 = a2; 3980 a2 = args[3]; 3981 goto gen_simd; 3982#if TCG_TARGET_REG_BITS == 32 3983 case INDEX_op_dup2_vec: 3984 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3985 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3986 /* Then replicate the 64-bit elements across the rest of the vector. 
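         * (PUNPCKLDQ interleaves the low dwords of a1 and a2 into the low
         *  64-bit lane of a0; a V64 result is that single lane, while the
         *  larger types broadcast it with tcg_out_dup_vec below.)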
*/ 3987 if (type != TCG_TYPE_V64) { 3988 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3989 } 3990 break; 3991#endif 3992 case INDEX_op_abs_vec: 3993 insn = abs_insn[vece]; 3994 a2 = a1; 3995 a1 = 0; 3996 goto gen_simd; 3997 gen_simd: 3998 tcg_debug_assert(insn != OPC_UD2); 3999 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 4000 break; 4001 4002 case INDEX_op_cmp_vec: 4003 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 4004 break; 4005 4006 case INDEX_op_cmpsel_vec: 4007 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 4008 args[3], args[4], args[5]); 4009 break; 4010 4011 case INDEX_op_andc_vec: 4012 insn = OPC_PANDN; 4013 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 4014 break; 4015 4016 case INDEX_op_shli_vec: 4017 insn = shift_imm_insn[vece]; 4018 sub = 6; 4019 goto gen_shift; 4020 case INDEX_op_shri_vec: 4021 insn = shift_imm_insn[vece]; 4022 sub = 2; 4023 goto gen_shift; 4024 case INDEX_op_sari_vec: 4025 if (vece == MO_64) { 4026 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 4027 } else { 4028 insn = shift_imm_insn[vece]; 4029 } 4030 sub = 4; 4031 goto gen_shift; 4032 case INDEX_op_rotli_vec: 4033 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 4034 if (vece == MO_64) { 4035 insn |= P_VEXW; 4036 } 4037 sub = 1; 4038 goto gen_shift; 4039 gen_shift: 4040 tcg_debug_assert(vece != MO_8); 4041 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 4042 tcg_out8(s, a2); 4043 break; 4044 4045 case INDEX_op_ld_vec: 4046 tcg_out_ld(s, type, a0, a1, a2); 4047 break; 4048 case INDEX_op_st_vec: 4049 tcg_out_st(s, type, a0, a1, a2); 4050 break; 4051 case INDEX_op_dupm_vec: 4052 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 4053 break; 4054 4055 case INDEX_op_x86_shufps_vec: 4056 insn = OPC_SHUFPS; 4057 sub = args[3]; 4058 goto gen_simd_imm8; 4059 case INDEX_op_x86_blend_vec: 4060 if (vece == MO_16) { 4061 insn = OPC_PBLENDW; 4062 } else if (vece == MO_32) { 4063 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 4064 } else { 4065 g_assert_not_reached(); 4066 } 4067 sub = args[3]; 4068 goto gen_simd_imm8; 4069 case INDEX_op_x86_vperm2i128_vec: 4070 insn = OPC_VPERM2I128; 4071 sub = args[3]; 4072 goto gen_simd_imm8; 4073 case INDEX_op_x86_vpshldi_vec: 4074 insn = vpshldi_insn[vece]; 4075 sub = args[3]; 4076 goto gen_simd_imm8; 4077 4078 case INDEX_op_not_vec: 4079 insn = OPC_VPTERNLOGQ; 4080 a2 = a1; 4081 sub = 0x33; /* !B */ 4082 goto gen_simd_imm8; 4083 case INDEX_op_nor_vec: 4084 insn = OPC_VPTERNLOGQ; 4085 sub = 0x11; /* norCB */ 4086 goto gen_simd_imm8; 4087 case INDEX_op_nand_vec: 4088 insn = OPC_VPTERNLOGQ; 4089 sub = 0x77; /* nandCB */ 4090 goto gen_simd_imm8; 4091 case INDEX_op_eqv_vec: 4092 insn = OPC_VPTERNLOGQ; 4093 sub = 0x99; /* xnorCB */ 4094 goto gen_simd_imm8; 4095 case INDEX_op_orc_vec: 4096 insn = OPC_VPTERNLOGQ; 4097 sub = 0xdd; /* orB!C */ 4098 goto gen_simd_imm8; 4099 4100 case INDEX_op_bitsel_vec: 4101 insn = OPC_VPTERNLOGQ; 4102 a3 = args[3]; 4103 if (a0 == a1) { 4104 a1 = a2; 4105 a2 = a3; 4106 sub = 0xca; /* A?B:C */ 4107 } else if (a0 == a2) { 4108 a2 = a3; 4109 sub = 0xe2; /* B?A:C */ 4110 } else { 4111 tcg_out_mov(s, type, a0, a3); 4112 sub = 0xb8; /* B?C:A */ 4113 } 4114 goto gen_simd_imm8; 4115 4116 gen_simd_imm8: 4117 tcg_debug_assert(insn != OPC_UD2); 4118 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 4119 tcg_out8(s, sub); 4120 break; 4121 4122 case INDEX_op_x86_psrldq_vec: 4123 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 4124 tcg_out8(s, a2); 4125 break; 4126 4127 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. 
*/ 4128 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 4129 default: 4130 g_assert_not_reached(); 4131 } 4132} 4133 4134static TCGConstraintSetIndex 4135tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) 4136{ 4137 switch (op) { 4138 case INDEX_op_qemu_ld_i32: 4139 return C_O1_I1(r, L); 4140 4141 case INDEX_op_qemu_st_i32: 4142 return (TCG_TARGET_REG_BITS == 32 && flags == MO_8 4143 ? C_O0_I2(s, L) 4144 : C_O0_I2(L, L)); 4145 4146 case INDEX_op_qemu_ld_i64: 4147 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 4148 4149 case INDEX_op_qemu_st_i64: 4150 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 4151 4152 case INDEX_op_qemu_ld_i128: 4153 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 4154 return C_O2_I1(r, r, L); 4155 case INDEX_op_qemu_st_i128: 4156 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 4157 return C_O0_I3(L, L, L); 4158 4159 case INDEX_op_ld_vec: 4160 case INDEX_op_dupm_vec: 4161 return C_O1_I1(x, r); 4162 4163 case INDEX_op_st_vec: 4164 return C_O0_I2(x, r); 4165 4166 case INDEX_op_add_vec: 4167 case INDEX_op_sub_vec: 4168 case INDEX_op_mul_vec: 4169 case INDEX_op_and_vec: 4170 case INDEX_op_or_vec: 4171 case INDEX_op_xor_vec: 4172 case INDEX_op_andc_vec: 4173 case INDEX_op_orc_vec: 4174 case INDEX_op_nand_vec: 4175 case INDEX_op_nor_vec: 4176 case INDEX_op_eqv_vec: 4177 case INDEX_op_ssadd_vec: 4178 case INDEX_op_usadd_vec: 4179 case INDEX_op_sssub_vec: 4180 case INDEX_op_ussub_vec: 4181 case INDEX_op_smin_vec: 4182 case INDEX_op_umin_vec: 4183 case INDEX_op_smax_vec: 4184 case INDEX_op_umax_vec: 4185 case INDEX_op_shlv_vec: 4186 case INDEX_op_shrv_vec: 4187 case INDEX_op_sarv_vec: 4188 case INDEX_op_rotlv_vec: 4189 case INDEX_op_rotrv_vec: 4190 case INDEX_op_shls_vec: 4191 case INDEX_op_shrs_vec: 4192 case INDEX_op_sars_vec: 4193 case INDEX_op_cmp_vec: 4194 case INDEX_op_x86_shufps_vec: 4195 case INDEX_op_x86_blend_vec: 4196 case INDEX_op_x86_packss_vec: 4197 case INDEX_op_x86_packus_vec: 4198 case INDEX_op_x86_vperm2i128_vec: 4199 case INDEX_op_x86_punpckl_vec: 4200 case INDEX_op_x86_punpckh_vec: 4201 case INDEX_op_x86_vpshldi_vec: 4202#if TCG_TARGET_REG_BITS == 32 4203 case INDEX_op_dup2_vec: 4204#endif 4205 return C_O1_I2(x, x, x); 4206 4207 case INDEX_op_abs_vec: 4208 case INDEX_op_dup_vec: 4209 case INDEX_op_not_vec: 4210 case INDEX_op_shli_vec: 4211 case INDEX_op_shri_vec: 4212 case INDEX_op_sari_vec: 4213 case INDEX_op_rotli_vec: 4214 case INDEX_op_x86_psrldq_vec: 4215 return C_O1_I1(x, x); 4216 4217 case INDEX_op_x86_vpshldv_vec: 4218 case INDEX_op_x86_vpshrdv_vec: 4219 return C_O1_I3(x, 0, x, x); 4220 4221 case INDEX_op_bitsel_vec: 4222 return C_O1_I3(x, x, x, x); 4223 case INDEX_op_cmpsel_vec: 4224 return C_O1_I4(x, x, x, xO, x); 4225 4226 default: 4227 return C_NotImplemented; 4228 } 4229} 4230 4231int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 4232{ 4233 switch (opc) { 4234 case INDEX_op_add_vec: 4235 case INDEX_op_sub_vec: 4236 case INDEX_op_and_vec: 4237 case INDEX_op_or_vec: 4238 case INDEX_op_xor_vec: 4239 case INDEX_op_andc_vec: 4240 case INDEX_op_orc_vec: 4241 case INDEX_op_nand_vec: 4242 case INDEX_op_nor_vec: 4243 case INDEX_op_eqv_vec: 4244 case INDEX_op_not_vec: 4245 case INDEX_op_bitsel_vec: 4246 return 1; 4247 case INDEX_op_cmp_vec: 4248 case INDEX_op_cmpsel_vec: 4249 return -1; 4250 4251 case INDEX_op_rotli_vec: 4252 return have_avx512vl && vece >= MO_32 ? 
1 : -1; 4253 4254 case INDEX_op_shli_vec: 4255 case INDEX_op_shri_vec: 4256 /* We must expand the operation for MO_8. */ 4257 return vece == MO_8 ? -1 : 1; 4258 4259 case INDEX_op_sari_vec: 4260 switch (vece) { 4261 case MO_8: 4262 return -1; 4263 case MO_16: 4264 case MO_32: 4265 return 1; 4266 case MO_64: 4267 if (have_avx512vl) { 4268 return 1; 4269 } 4270 /* 4271 * We can emulate this for MO_64, but it does not pay off 4272 * unless we're producing at least 4 values. 4273 */ 4274 return type >= TCG_TYPE_V256 ? -1 : 0; 4275 } 4276 return 0; 4277 4278 case INDEX_op_shls_vec: 4279 case INDEX_op_shrs_vec: 4280 return vece >= MO_16; 4281 case INDEX_op_sars_vec: 4282 switch (vece) { 4283 case MO_16: 4284 case MO_32: 4285 return 1; 4286 case MO_64: 4287 return have_avx512vl; 4288 } 4289 return 0; 4290 case INDEX_op_rotls_vec: 4291 return vece >= MO_16 ? -1 : 0; 4292 4293 case INDEX_op_shlv_vec: 4294 case INDEX_op_shrv_vec: 4295 switch (vece) { 4296 case MO_16: 4297 return have_avx512bw; 4298 case MO_32: 4299 case MO_64: 4300 return have_avx2; 4301 } 4302 return 0; 4303 case INDEX_op_sarv_vec: 4304 switch (vece) { 4305 case MO_16: 4306 return have_avx512bw; 4307 case MO_32: 4308 return have_avx2; 4309 case MO_64: 4310 return have_avx512vl; 4311 } 4312 return 0; 4313 case INDEX_op_rotlv_vec: 4314 case INDEX_op_rotrv_vec: 4315 switch (vece) { 4316 case MO_16: 4317 return have_avx512vbmi2 ? -1 : 0; 4318 case MO_32: 4319 case MO_64: 4320 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 4321 } 4322 return 0; 4323 4324 case INDEX_op_mul_vec: 4325 switch (vece) { 4326 case MO_8: 4327 return -1; 4328 case MO_64: 4329 return have_avx512dq; 4330 } 4331 return 1; 4332 4333 case INDEX_op_ssadd_vec: 4334 case INDEX_op_usadd_vec: 4335 case INDEX_op_sssub_vec: 4336 case INDEX_op_ussub_vec: 4337 return vece <= MO_16; 4338 case INDEX_op_smin_vec: 4339 case INDEX_op_smax_vec: 4340 case INDEX_op_umin_vec: 4341 case INDEX_op_umax_vec: 4342 case INDEX_op_abs_vec: 4343 return vece <= MO_32 || have_avx512vl; 4344 4345 default: 4346 return 0; 4347 } 4348} 4349 4350static void expand_vec_shi(TCGType type, unsigned vece, bool right, 4351 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4352{ 4353 uint8_t mask; 4354 4355 tcg_debug_assert(vece == MO_8); 4356 if (right) { 4357 mask = 0xff >> imm; 4358 tcg_gen_shri_vec(MO_16, v0, v1, imm); 4359 } else { 4360 mask = 0xff << imm; 4361 tcg_gen_shli_vec(MO_16, v0, v1, imm); 4362 } 4363 tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask)); 4364} 4365 4366static void expand_vec_sari(TCGType type, unsigned vece, 4367 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 4368{ 4369 TCGv_vec t1, t2; 4370 4371 switch (vece) { 4372 case MO_8: 4373 /* Unpack to 16-bit, shift, and repack. */ 4374 t1 = tcg_temp_new_vec(type); 4375 t2 = tcg_temp_new_vec(type); 4376 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4377 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4378 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4379 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 4380 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 4381 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 4382 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 4383 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 4384 tcg_temp_free_vec(t1); 4385 tcg_temp_free_vec(t2); 4386 break; 4387 4388 case MO_64: 4389 t1 = tcg_temp_new_vec(type); 4390 if (imm <= 32) { 4391 /* 4392 * We can emulate a small sign extend by performing an arithmetic 4393 * 32-bit shift and overwriting the high half of a 64-bit logical 4394 * shift. 
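             *
             * A worked example with assumed values: for imm = 8 and one
             * element v1 = 0xffff0000_12345678,
             *   sari_32 by 8 turns the high dword 0xffff0000 into 0xffffff00,
             *   shri_64 by 8 gives 0x00ffff00_00123456,
             * and the MO_32 blend with immediate 0xaa keeps the odd (high)
             * dwords from the arithmetic shift, yielding 0xffffff00_00123456,
             * the true 64-bit arithmetic shift result.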
             * Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
        }
        tcg_temp_free_vec(t1);
        break;

    default:
        g_assert_not_reached();
    }
}

static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t;

    if (vece != MO_8 && have_avx512vbmi2) {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_shli_vec(vece, t, v1, imm);
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec t;

    if (have_avx512vbmi2) {
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, t, 8 << vece);
    tcg_gen_sub_vec(vece, t, t, sh);
    if (right) {
        tcg_gen_shlv_vec(vece, t, v1, t);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shrv_vec(vece, t, v1, t);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, t, lsh);
        if (vece >= MO_32) {
            tcg_gen_rotlv_vec(vece, v0, v1, t);
        } else {
            expand_vec_rotv(type, vece, v0, v1, t, false);
        }
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
        tcg_gen_shls_vec(vece, t, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, t);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(t);
}

static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
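     *
     * A worked example for one lane, with assumed values x = 0x7f, y = 0x03:
     * the unpacked words are 0x007f and 0x0300, their 16-bit product is
     * 0x7d00 (x * y << 8, truncated), and the shift right by 8 leaves
     * 0x007d -- the low 8 bits of x * y with a clear high byte, so the
     * saturating pack copies the byte through unchanged.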
4510 */ 4511 switch (type) { 4512 case TCG_TYPE_V64: 4513 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 4514 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 4515 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4516 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4517 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4518 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 4519 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4520 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4521 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4522 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 4523 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 4524 tcg_temp_free_vec(t1); 4525 tcg_temp_free_vec(t2); 4526 break; 4527 4528 case TCG_TYPE_V128: 4529 case TCG_TYPE_V256: 4530 t1 = tcg_temp_new_vec(type); 4531 t2 = tcg_temp_new_vec(type); 4532 t3 = tcg_temp_new_vec(type); 4533 t4 = tcg_temp_new_vec(type); 4534 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 4535 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4536 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4537 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 4538 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4539 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4540 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 4541 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 4542 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 4543 tcg_gen_mul_vec(MO_16, t1, t1, t2); 4544 tcg_gen_mul_vec(MO_16, t3, t3, t4); 4545 tcg_gen_shri_vec(MO_16, t1, t1, 8); 4546 tcg_gen_shri_vec(MO_16, t3, t3, 8); 4547 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 4548 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 4549 tcg_temp_free_vec(t1); 4550 tcg_temp_free_vec(t2); 4551 tcg_temp_free_vec(t3); 4552 tcg_temp_free_vec(t4); 4553 break; 4554 4555 default: 4556 g_assert_not_reached(); 4557 } 4558} 4559 4560static TCGCond expand_vec_cond(TCGType type, unsigned vece, 4561 TCGArg *a1, TCGArg *a2, TCGCond cond) 4562{ 4563 /* 4564 * Without AVX512, there are no 64-bit unsigned comparisons. 4565 * We must bias the inputs so that they become signed. 4566 * All other swapping and inversion are handled during code generation. 4567 */ 4568 if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) { 4569 TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1)); 4570 TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2)); 4571 TCGv_vec t1 = tcg_temp_new_vec(type); 4572 TCGv_vec t2 = tcg_temp_new_vec(type); 4573 TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 4574 4575 tcg_gen_sub_vec(vece, t1, v1, t3); 4576 tcg_gen_sub_vec(vece, t2, v2, t3); 4577 *a1 = tcgv_vec_arg(t1); 4578 *a2 = tcgv_vec_arg(t2); 4579 cond = tcg_signed_cond(cond); 4580 } 4581 return cond; 4582} 4583 4584static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0, 4585 TCGArg a1, TCGArg a2, TCGCond cond) 4586{ 4587 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4588 /* Expand directly; do not recurse. */ 4589 vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); 4590} 4591 4592static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0, 4593 TCGArg a1, TCGArg a2, 4594 TCGArg a3, TCGArg a4, TCGCond cond) 4595{ 4596 cond = expand_vec_cond(type, vece, &a1, &a2, cond); 4597 /* Expand directly; do not recurse. */ 4598 vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond); 4599} 4600 4601void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 4602 TCGArg a0, ...) 
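/*
 * A sketch of the variadic argument layout consumed below: a1 and a2 are
 * always fetched; INDEX_op_cmp_vec additionally passes the TCGCond in a3,
 * while INDEX_op_cmpsel_vec passes the two selected operands in a3/a4 and
 * the TCGCond in a5.
 */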
4603{ 4604 va_list va; 4605 TCGArg a1, a2, a3, a4, a5; 4606 TCGv_vec v0, v1, v2; 4607 4608 va_start(va, a0); 4609 a1 = va_arg(va, TCGArg); 4610 a2 = va_arg(va, TCGArg); 4611 v0 = temp_tcgv_vec(arg_temp(a0)); 4612 v1 = temp_tcgv_vec(arg_temp(a1)); 4613 4614 switch (opc) { 4615 case INDEX_op_shli_vec: 4616 expand_vec_shi(type, vece, false, v0, v1, a2); 4617 break; 4618 case INDEX_op_shri_vec: 4619 expand_vec_shi(type, vece, true, v0, v1, a2); 4620 break; 4621 case INDEX_op_sari_vec: 4622 expand_vec_sari(type, vece, v0, v1, a2); 4623 break; 4624 4625 case INDEX_op_rotli_vec: 4626 expand_vec_rotli(type, vece, v0, v1, a2); 4627 break; 4628 4629 case INDEX_op_rotls_vec: 4630 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 4631 break; 4632 4633 case INDEX_op_rotlv_vec: 4634 v2 = temp_tcgv_vec(arg_temp(a2)); 4635 expand_vec_rotv(type, vece, v0, v1, v2, false); 4636 break; 4637 case INDEX_op_rotrv_vec: 4638 v2 = temp_tcgv_vec(arg_temp(a2)); 4639 expand_vec_rotv(type, vece, v0, v1, v2, true); 4640 break; 4641 4642 case INDEX_op_mul_vec: 4643 v2 = temp_tcgv_vec(arg_temp(a2)); 4644 expand_vec_mul(type, vece, v0, v1, v2); 4645 break; 4646 4647 case INDEX_op_cmp_vec: 4648 a3 = va_arg(va, TCGArg); 4649 expand_vec_cmp(type, vece, a0, a1, a2, a3); 4650 break; 4651 4652 case INDEX_op_cmpsel_vec: 4653 a3 = va_arg(va, TCGArg); 4654 a4 = va_arg(va, TCGArg); 4655 a5 = va_arg(va, TCGArg); 4656 expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5); 4657 break; 4658 4659 default: 4660 break; 4661 } 4662 4663 va_end(va); 4664} 4665 4666static const int tcg_target_callee_save_regs[] = { 4667#if TCG_TARGET_REG_BITS == 64 4668 TCG_REG_RBP, 4669 TCG_REG_RBX, 4670#if defined(_WIN64) 4671 TCG_REG_RDI, 4672 TCG_REG_RSI, 4673#endif 4674 TCG_REG_R12, 4675 TCG_REG_R13, 4676 TCG_REG_R14, /* Currently used for the global env. */ 4677 TCG_REG_R15, 4678#else 4679 TCG_REG_EBP, /* Currently used for the global env. */ 4680 TCG_REG_EBX, 4681 TCG_REG_ESI, 4682 TCG_REG_EDI, 4683#endif 4684}; 4685 4686/* Compute frame size via macros, to share between tcg_target_qemu_prologue 4687 and tcg_register_jit. */ 4688 4689#define PUSH_SIZE \ 4690 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 4691 * (TCG_TARGET_REG_BITS / 8)) 4692 4693#define FRAME_SIZE \ 4694 ((PUSH_SIZE \ 4695 + TCG_STATIC_CALL_ARGS_SIZE \ 4696 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 4697 + TCG_TARGET_STACK_ALIGN - 1) \ 4698 & ~(TCG_TARGET_STACK_ALIGN - 1)) 4699 4700/* Generate global QEMU prologue and epilogue code */ 4701static void tcg_target_qemu_prologue(TCGContext *s) 4702{ 4703 int i, stack_addend; 4704 4705 /* TB prologue */ 4706 4707 /* Reserve some stack space, also for TCG temps. */ 4708 stack_addend = FRAME_SIZE - PUSH_SIZE; 4709 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 4710 CPU_TEMP_BUF_NLONGS * sizeof(long)); 4711 4712 /* Save all callee saved registers. */ 4713 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 4714 tcg_out_push(s, tcg_target_callee_save_regs[i]); 4715 } 4716 4717 if (!tcg_use_softmmu && guest_base) { 4718 int seg = setup_guest_base_seg(); 4719 if (seg != 0) { 4720 x86_guest_base.seg = seg; 4721 } else if (guest_base == (int32_t)guest_base) { 4722 x86_guest_base.ofs = guest_base; 4723 } else { 4724 assert(TCG_TARGET_REG_BITS == 64); 4725 /* Choose R12 because, as a base, it requires a SIB byte. 
*/ 4726 x86_guest_base.index = TCG_REG_R12; 4727 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base); 4728 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index); 4729 } 4730 } 4731 4732 if (TCG_TARGET_REG_BITS == 32) { 4733 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 4734 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 4735 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4736 /* jmp *tb. */ 4737 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, 4738 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 4739 + stack_addend); 4740 } else { 4741 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); 4742 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 4743 /* jmp *tb. */ 4744 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); 4745 } 4746 4747 /* 4748 * Return path for goto_ptr. Set return value to 0, a-la exit_tb, 4749 * and fall through to the rest of the epilogue. 4750 */ 4751 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); 4752 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 4753 4754 /* TB epilogue */ 4755 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 4756 4757 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 4758 4759 if (have_avx2) { 4760 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 4761 } 4762 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 4763 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 4764 } 4765 tcg_out_opc(s, OPC_RET, 0, 0, 0); 4766} 4767 4768static void tcg_out_tb_start(TCGContext *s) 4769{ 4770 /* nothing to do */ 4771} 4772 4773static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 4774{ 4775 memset(p, 0x90, count); 4776} 4777 4778static void tcg_target_init(TCGContext *s) 4779{ 4780 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; 4781 if (TCG_TARGET_REG_BITS == 64) { 4782 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; 4783 } 4784 if (have_avx1) { 4785 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; 4786 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; 4787 } 4788 if (have_avx2) { 4789 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; 4790 } 4791 4792 tcg_target_call_clobber_regs = ALL_VECTOR_REGS; 4793 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); 4794 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX); 4795 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX); 4796 if (TCG_TARGET_REG_BITS == 64) { 4797#if !defined(_WIN64) 4798 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI); 4799 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI); 4800#endif 4801 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8); 4802 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9); 4803 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10); 4804 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11); 4805 } 4806 4807 s->reserved_regs = 0; 4808 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); 4809 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC); 4810#ifdef _WIN64 4811 /* These are call saved, and we don't save them, so don't use them. 
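     * (Under the Win64 calling convention xmm6-xmm15 are callee-saved, and
     *  the prologue above does not spill them, so generated code must leave
     *  them untouched.)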
*/ 4812 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6); 4813 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7); 4814 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8); 4815 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9); 4816 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10); 4817 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11); 4818 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12); 4819 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13); 4820 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14); 4821 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15); 4822#endif 4823} 4824 4825typedef struct { 4826 DebugFrameHeader h; 4827 uint8_t fde_def_cfa[4]; 4828 uint8_t fde_reg_ofs[14]; 4829} DebugFrame; 4830 4831/* We're expecting a 2 byte uleb128 encoded value. */ 4832QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14)); 4833 4834#if !defined(__ELF__) 4835 /* Host machine without ELF. */ 4836#elif TCG_TARGET_REG_BITS == 64 4837#define ELF_HOST_MACHINE EM_X86_64 4838static const DebugFrame debug_frame = { 4839 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4840 .h.cie.id = -1, 4841 .h.cie.version = 1, 4842 .h.cie.code_align = 1, 4843 .h.cie.data_align = 0x78, /* sleb128 -8 */ 4844 .h.cie.return_column = 16, 4845 4846 /* Total FDE size does not include the "len" member. */ 4847 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4848 4849 .fde_def_cfa = { 4850 12, 7, /* DW_CFA_def_cfa %rsp, ... */ 4851 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4852 (FRAME_SIZE >> 7) 4853 }, 4854 .fde_reg_ofs = { 4855 0x90, 1, /* DW_CFA_offset, %rip, -8 */ 4856 /* The following ordering must match tcg_target_callee_save_regs. */ 4857 0x86, 2, /* DW_CFA_offset, %rbp, -16 */ 4858 0x83, 3, /* DW_CFA_offset, %rbx, -24 */ 4859 0x8c, 4, /* DW_CFA_offset, %r12, -32 */ 4860 0x8d, 5, /* DW_CFA_offset, %r13, -40 */ 4861 0x8e, 6, /* DW_CFA_offset, %r14, -48 */ 4862 0x8f, 7, /* DW_CFA_offset, %r15, -56 */ 4863 } 4864}; 4865#else 4866#define ELF_HOST_MACHINE EM_386 4867static const DebugFrame debug_frame = { 4868 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */ 4869 .h.cie.id = -1, 4870 .h.cie.version = 1, 4871 .h.cie.code_align = 1, 4872 .h.cie.data_align = 0x7c, /* sleb128 -4 */ 4873 .h.cie.return_column = 8, 4874 4875 /* Total FDE size does not include the "len" member. */ 4876 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset), 4877 4878 .fde_def_cfa = { 4879 12, 4, /* DW_CFA_def_cfa %esp, ... */ 4880 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */ 4881 (FRAME_SIZE >> 7) 4882 }, 4883 .fde_reg_ofs = { 4884 0x88, 1, /* DW_CFA_offset, %eip, -4 */ 4885 /* The following ordering must match tcg_target_callee_save_regs. */ 4886 0x85, 2, /* DW_CFA_offset, %ebp, -8 */ 4887 0x83, 3, /* DW_CFA_offset, %ebx, -12 */ 4888 0x86, 4, /* DW_CFA_offset, %esi, -16 */ 4889 0x87, 5, /* DW_CFA_offset, %edi, -20 */ 4890 } 4891}; 4892#endif 4893 4894#if defined(ELF_HOST_MACHINE) 4895void tcg_register_jit(const void *buf, size_t buf_size) 4896{ 4897 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame)); 4898} 4899#endif 4900
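
/*
 * Worked example of the uleb128 encoding in fde_def_cfa above, with an
 * assumed value: if FRAME_SIZE were 0x1a8, the two bytes would be
 * (0x1a8 & 0x7f) | 0x80 == 0xa8 followed by 0x1a8 >> 7 == 0x03.
 */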