/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

#define TCG_TMP_VEC  TCG_REG_XMM5

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ?
TCG_REG_EDX : TCG_REG_EAX; 120#ifdef _WIN64 121 case TCG_CALL_RET_BY_VEC: 122 tcg_debug_assert(slot == 0); 123 return TCG_REG_XMM0; 124#endif 125 default: 126 g_assert_not_reached(); 127 } 128} 129 130/* Constants we accept. */ 131#define TCG_CT_CONST_S32 0x100 132#define TCG_CT_CONST_U32 0x200 133#define TCG_CT_CONST_I32 0x400 134#define TCG_CT_CONST_WSZ 0x800 135#define TCG_CT_CONST_TST 0x1000 136 137/* Registers used with L constraint, which are the first argument 138 registers on x86_64, and two random call clobbered registers on 139 i386. */ 140#if TCG_TARGET_REG_BITS == 64 141# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 142# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 143#else 144# define TCG_REG_L0 TCG_REG_EAX 145# define TCG_REG_L1 TCG_REG_EDX 146#endif 147 148#if TCG_TARGET_REG_BITS == 64 149# define ALL_GENERAL_REGS 0x0000ffffu 150# define ALL_VECTOR_REGS 0xffff0000u 151# define ALL_BYTEL_REGS ALL_GENERAL_REGS 152#else 153# define ALL_GENERAL_REGS 0x000000ffu 154# define ALL_VECTOR_REGS 0x00ff0000u 155# define ALL_BYTEL_REGS 0x0000000fu 156#endif 157#define SOFTMMU_RESERVE_REGS \ 158 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 159 160#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 161#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 162 163static const tcg_insn_unit *tb_ret_addr; 164 165static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 166 intptr_t value, intptr_t addend) 167{ 168 value += addend; 169 switch(type) { 170 case R_386_PC32: 171 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 172 if (value != (int32_t)value) { 173 return false; 174 } 175 /* FALLTHRU */ 176 case R_386_32: 177 tcg_patch32(code_ptr, value); 178 break; 179 case R_386_PC8: 180 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 181 if (value != (int8_t)value) { 182 return false; 183 } 184 tcg_patch8(code_ptr, value); 185 break; 186 default: 187 g_assert_not_reached(); 188 } 189 return true; 190} 191 192/* test if a constant matches the constraint */ 193static bool tcg_target_const_match(int64_t val, int ct, 194 TCGType type, TCGCond cond, int vece) 195{ 196 if (ct & TCG_CT_CONST) { 197 return 1; 198 } 199 if (type == TCG_TYPE_I32) { 200 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 201 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 202 return 1; 203 } 204 } else { 205 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 206 return 1; 207 } 208 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 209 return 1; 210 } 211 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 212 return 1; 213 } 214 /* 215 * This will be used in combination with TCG_CT_CONST_S32, 216 * so "normal" TESTQ is already matched. Also accept: 217 * TESTQ -> TESTL (uint32_t) 218 * TESTQ -> BT (is_power_of_2) 219 */ 220 if ((ct & TCG_CT_CONST_TST) 221 && is_tst_cond(cond) 222 && (val == (uint32_t)val || is_power_of_2(val))) { 223 return 1; 224 } 225 } 226 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 
32 : 64)) { 227 return 1; 228 } 229 return 0; 230} 231 232# define LOWREGMASK(x) ((x) & 7) 233 234#define P_EXT 0x100 /* 0x0f opcode prefix */ 235#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 236#define P_DATA16 0x400 /* 0x66 opcode prefix */ 237#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 238#if TCG_TARGET_REG_BITS == 64 239# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 240# define P_REXB_R 0x2000 /* REG field as byte register */ 241# define P_REXB_RM 0x4000 /* R/M field as byte register */ 242# define P_GS 0x8000 /* gs segment override */ 243#else 244# define P_REXW 0 245# define P_REXB_R 0 246# define P_REXB_RM 0 247# define P_GS 0 248#endif 249#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 250#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 251#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 252#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 253#define P_EVEX 0x100000 /* Requires EVEX encoding */ 254 255#define OPC_ARITH_EbIb (0x80) 256#define OPC_ARITH_EvIz (0x81) 257#define OPC_ARITH_EvIb (0x83) 258#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 259#define OPC_ANDN (0xf2 | P_EXT38) 260#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 261#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 262#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 263#define OPC_BSF (0xbc | P_EXT) 264#define OPC_BSR (0xbd | P_EXT) 265#define OPC_BSWAP (0xc8 | P_EXT) 266#define OPC_CALL_Jz (0xe8) 267#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 268#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 269#define OPC_DEC_r32 (0x48) 270#define OPC_IMUL_GvEv (0xaf | P_EXT) 271#define OPC_IMUL_GvEvIb (0x6b) 272#define OPC_IMUL_GvEvIz (0x69) 273#define OPC_INC_r32 (0x40) 274#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 275#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 276#define OPC_JMP_long (0xe9) 277#define OPC_JMP_short (0xeb) 278#define OPC_LEA (0x8d) 279#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 280#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 281#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 282#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 283#define OPC_MOVB_EvIz (0xc6) 284#define OPC_MOVL_EvIz (0xc7) 285#define OPC_MOVB_Ib (0xb0) 286#define OPC_MOVL_Iv (0xb8) 287#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 288#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 289#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 290#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 291#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 292#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 293#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 294#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 295#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 296#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 297#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 298#define OPC_MOVSBL (0xbe | P_EXT) 299#define OPC_MOVSWL (0xbf | P_EXT) 300#define OPC_MOVSLQ (0x63 | P_REXW) 301#define OPC_MOVZBL (0xb6 | P_EXT) 302#define OPC_MOVZWL (0xb7 | P_EXT) 303#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 304#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 305#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 306#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 307#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 308#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 309#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 310#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 311#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 312#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 313#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 314#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 315#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 316#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 317#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 318#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 319#define OPC_PAND (0xdb | P_EXT | P_DATA16) 320#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 321#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 322#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 323#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 324#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 325#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 326#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 327#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 328#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 329#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 330#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 331#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 332#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 333#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 334#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 335#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 336#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 337#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 338#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 339#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 340#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 341#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 342#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 343#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 344#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 345#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 346#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 347#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 348#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
349#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 350#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 351#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 352#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 353#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 354#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 355#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 356#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 357#define OPC_POR (0xeb | P_EXT | P_DATA16) 358#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 359#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 360#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 361#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 362#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 363#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 364#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 365#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 366#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 367#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 368#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 369#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 370#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 371#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 372#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 373#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 374#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 375#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 376#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 377#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 378#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 379#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 380#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 381#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 382#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 383#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 384#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 385#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 386#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 387#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 388#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 389#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 390#define OPC_PXOR (0xef | P_EXT | P_DATA16) 391#define OPC_POP_r32 (0x58) 392#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 393#define OPC_PUSH_r32 (0x50) 394#define OPC_PUSH_Iv (0x68) 395#define OPC_PUSH_Ib (0x6a) 396#define OPC_RET (0xc3) 397#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 398#define OPC_SHIFT_1 (0xd1) 399#define OPC_SHIFT_Ib (0xc1) 400#define OPC_SHIFT_cl (0xd3) 401#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 402#define OPC_SHUFPS (0xc6 | P_EXT) 403#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 404#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 405#define OPC_SHRD_Ib (0xac | P_EXT) 406#define OPC_TESTB (0x84) 407#define OPC_TESTL (0x85) 408#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 409#define OPC_UD2 (0x0b | P_EXT) 410#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 411#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 412#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 413#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 414#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 415#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 416#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 417#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 418#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 419#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 420#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 421#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 422#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 423#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 424#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 425#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 426#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 427#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 428#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 429#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 430#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 431#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 432#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 433#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 434#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 435#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 436#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 437#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 438#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 439#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 440#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 442#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 443#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW) 444#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 445#define OPC_VZEROUPPER (0x77 | P_EXT) 446#define OPC_XCHG_ax_r32 (0x90) 447#define OPC_XCHG_EvGv (0x87) 448 449#define OPC_GRP3_Eb (0xf6) 450#define OPC_GRP3_Ev (0xf7) 451#define OPC_GRP5 (0xff) 452#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 453#define OPC_GRPBT (0xba | P_EXT) 454 455#define OPC_GRPBT_BT 4 456#define OPC_GRPBT_BTS 5 457#define OPC_GRPBT_BTR 6 458#define OPC_GRPBT_BTC 7 459 460/* Group 1 opcode extensions for 0x80-0x83. 461 These are also used as modifiers for OPC_ARITH. */ 462#define ARITH_ADD 0 463#define ARITH_OR 1 464#define ARITH_ADC 2 465#define ARITH_SBB 3 466#define ARITH_AND 4 467#define ARITH_SUB 5 468#define ARITH_XOR 6 469#define ARITH_CMP 7 470 471/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 472#define SHIFT_ROL 0 473#define SHIFT_ROR 1 474#define SHIFT_SHL 4 475#define SHIFT_SHR 5 476#define SHIFT_SAR 7 477 478/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. 
*/ 479#define EXT3_TESTi 0 480#define EXT3_NOT 2 481#define EXT3_NEG 3 482#define EXT3_MUL 4 483#define EXT3_IMUL 5 484#define EXT3_DIV 6 485#define EXT3_IDIV 7 486 487/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 488#define EXT5_INC_Ev 0 489#define EXT5_DEC_Ev 1 490#define EXT5_CALLN_Ev 2 491#define EXT5_JMPN_Ev 4 492 493/* Condition codes to be added to OPC_JCC_{long,short}. */ 494#define JCC_JMP (-1) 495#define JCC_JO 0x0 496#define JCC_JNO 0x1 497#define JCC_JB 0x2 498#define JCC_JAE 0x3 499#define JCC_JE 0x4 500#define JCC_JNE 0x5 501#define JCC_JBE 0x6 502#define JCC_JA 0x7 503#define JCC_JS 0x8 504#define JCC_JNS 0x9 505#define JCC_JP 0xa 506#define JCC_JNP 0xb 507#define JCC_JL 0xc 508#define JCC_JGE 0xd 509#define JCC_JLE 0xe 510#define JCC_JG 0xf 511 512static const uint8_t tcg_cond_to_jcc[] = { 513 [TCG_COND_EQ] = JCC_JE, 514 [TCG_COND_NE] = JCC_JNE, 515 [TCG_COND_LT] = JCC_JL, 516 [TCG_COND_GE] = JCC_JGE, 517 [TCG_COND_LE] = JCC_JLE, 518 [TCG_COND_GT] = JCC_JG, 519 [TCG_COND_LTU] = JCC_JB, 520 [TCG_COND_GEU] = JCC_JAE, 521 [TCG_COND_LEU] = JCC_JBE, 522 [TCG_COND_GTU] = JCC_JA, 523 [TCG_COND_TSTEQ] = JCC_JE, 524 [TCG_COND_TSTNE] = JCC_JNE, 525}; 526 527#if TCG_TARGET_REG_BITS == 64 528static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 529{ 530 int rex; 531 532 if (opc & P_GS) { 533 tcg_out8(s, 0x65); 534 } 535 if (opc & P_DATA16) { 536 /* We should never be asking for both 16 and 64-bit operation. */ 537 tcg_debug_assert((opc & P_REXW) == 0); 538 tcg_out8(s, 0x66); 539 } 540 if (opc & P_SIMDF3) { 541 tcg_out8(s, 0xf3); 542 } else if (opc & P_SIMDF2) { 543 tcg_out8(s, 0xf2); 544 } 545 546 rex = 0; 547 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 548 rex |= (r & 8) >> 1; /* REX.R */ 549 rex |= (x & 8) >> 2; /* REX.X */ 550 rex |= (rm & 8) >> 3; /* REX.B */ 551 552 /* P_REXB_{R,RM} indicates that the given register is the low byte. 553 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 554 as otherwise the encoding indicates %[abcd]h. Note that the values 555 that are ORed in merely indicate that the REX byte must be present; 556 those bits get discarded in output. */ 557 rex |= opc & (r >= 4 ? P_REXB_R : 0); 558 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 559 560 if (rex) { 561 tcg_out8(s, (uint8_t)(rex | 0x40)); 562 } 563 564 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 565 tcg_out8(s, 0x0f); 566 if (opc & P_EXT38) { 567 tcg_out8(s, 0x38); 568 } else if (opc & P_EXT3A) { 569 tcg_out8(s, 0x3a); 570 } 571 } 572 573 tcg_out8(s, opc); 574} 575#else 576static void tcg_out_opc(TCGContext *s, int opc) 577{ 578 if (opc & P_DATA16) { 579 tcg_out8(s, 0x66); 580 } 581 if (opc & P_SIMDF3) { 582 tcg_out8(s, 0xf3); 583 } else if (opc & P_SIMDF2) { 584 tcg_out8(s, 0xf2); 585 } 586 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 587 tcg_out8(s, 0x0f); 588 if (opc & P_EXT38) { 589 tcg_out8(s, 0x38); 590 } else if (opc & P_EXT3A) { 591 tcg_out8(s, 0x3a); 592 } 593 } 594 tcg_out8(s, opc); 595} 596/* Discard the register arguments to tcg_out_opc early, so as not to penalize 597 the 32-bit compilation paths. This method works with all versions of gcc, 598 whereas relying on optimization may not be able to exclude them. 
*/ 599#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 600#endif 601 602static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 603{ 604 tcg_out_opc(s, opc, r, rm, 0); 605 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 606} 607 608static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 609 int rm, int index) 610{ 611 int tmp; 612 613 if (opc & P_GS) { 614 tcg_out8(s, 0x65); 615 } 616 /* Use the two byte form if possible, which cannot encode 617 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 618 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 619 && ((rm | index) & 8) == 0) { 620 /* Two byte VEX prefix. */ 621 tcg_out8(s, 0xc5); 622 623 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 624 } else { 625 /* Three byte VEX prefix. */ 626 tcg_out8(s, 0xc4); 627 628 /* VEX.m-mmmm */ 629 if (opc & P_EXT3A) { 630 tmp = 3; 631 } else if (opc & P_EXT38) { 632 tmp = 2; 633 } else if (opc & P_EXT) { 634 tmp = 1; 635 } else { 636 g_assert_not_reached(); 637 } 638 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 639 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 640 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 641 tcg_out8(s, tmp); 642 643 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 644 } 645 646 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 647 /* VEX.pp */ 648 if (opc & P_DATA16) { 649 tmp |= 1; /* 0x66 */ 650 } else if (opc & P_SIMDF3) { 651 tmp |= 2; /* 0xf3 */ 652 } else if (opc & P_SIMDF2) { 653 tmp |= 3; /* 0xf2 */ 654 } 655 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 656 tcg_out8(s, tmp); 657 tcg_out8(s, opc); 658} 659 660static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 661 int rm, int index) 662{ 663 /* The entire 4-byte evex prefix; with R' and V' set. */ 664 uint32_t p = 0x08041062; 665 int mm, pp; 666 667 tcg_debug_assert(have_avx512vl); 668 669 /* EVEX.mm */ 670 if (opc & P_EXT3A) { 671 mm = 3; 672 } else if (opc & P_EXT38) { 673 mm = 2; 674 } else if (opc & P_EXT) { 675 mm = 1; 676 } else { 677 g_assert_not_reached(); 678 } 679 680 /* EVEX.pp */ 681 if (opc & P_DATA16) { 682 pp = 1; /* 0x66 */ 683 } else if (opc & P_SIMDF3) { 684 pp = 2; /* 0xf3 */ 685 } else if (opc & P_SIMDF2) { 686 pp = 3; /* 0xf2 */ 687 } else { 688 pp = 0; 689 } 690 691 p = deposit32(p, 8, 2, mm); 692 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 693 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 694 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 695 p = deposit32(p, 16, 2, pp); 696 p = deposit32(p, 19, 4, ~v); 697 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 698 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 699 700 tcg_out32(s, p); 701 tcg_out8(s, opc); 702} 703 704static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 705{ 706 if (opc & P_EVEX) { 707 tcg_out_evex_opc(s, opc, r, v, rm, 0); 708 } else { 709 tcg_out_vex_opc(s, opc, r, v, rm, 0); 710 } 711 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 712} 713 714static void tcg_out_vex_modrm_type(TCGContext *s, int opc, 715 int r, int v, int rm, TCGType type) 716{ 717 if (type == TCG_TYPE_V256) { 718 opc |= P_VEXL; 719 } 720 tcg_out_vex_modrm(s, opc, r, v, rm); 721} 722 723/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 724 We handle either RM and INDEX missing with a negative value. In 64-bit 725 mode for absolute addresses, ~RM is the size of the immediate operand 726 that will follow the instruction. 
*/ 727 728static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 729 int shift, intptr_t offset) 730{ 731 int mod, len; 732 733 if (index < 0 && rm < 0) { 734 if (TCG_TARGET_REG_BITS == 64) { 735 /* Try for a rip-relative addressing mode. This has replaced 736 the 32-bit-mode absolute addressing encoding. */ 737 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 738 intptr_t disp = offset - pc; 739 if (disp == (int32_t)disp) { 740 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 741 tcg_out32(s, disp); 742 return; 743 } 744 745 /* Try for an absolute address encoding. This requires the 746 use of the MODRM+SIB encoding and is therefore larger than 747 rip-relative addressing. */ 748 if (offset == (int32_t)offset) { 749 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 750 tcg_out8(s, (4 << 3) | 5); 751 tcg_out32(s, offset); 752 return; 753 } 754 755 /* ??? The memory isn't directly addressable. */ 756 g_assert_not_reached(); 757 } else { 758 /* Absolute address. */ 759 tcg_out8(s, (r << 3) | 5); 760 tcg_out32(s, offset); 761 return; 762 } 763 } 764 765 /* Find the length of the immediate addend. Note that the encoding 766 that would be used for (%ebp) indicates absolute addressing. */ 767 if (rm < 0) { 768 mod = 0, len = 4, rm = 5; 769 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 770 mod = 0, len = 0; 771 } else if (offset == (int8_t)offset) { 772 mod = 0x40, len = 1; 773 } else { 774 mod = 0x80, len = 4; 775 } 776 777 /* Use a single byte MODRM format if possible. Note that the encoding 778 that would be used for %esp is the escape to the two byte form. */ 779 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 780 /* Single byte MODRM format. */ 781 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 782 } else { 783 /* Two byte MODRM+SIB format. */ 784 785 /* Note that the encoding that would place %esp into the index 786 field indicates no index register. In 64-bit mode, the REX.X 787 bit counts, so %r12 can be used as the index. */ 788 if (index < 0) { 789 index = 4; 790 } else { 791 tcg_debug_assert(index != TCG_REG_ESP); 792 } 793 794 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 795 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 796 } 797 798 if (len == 1) { 799 tcg_out8(s, offset); 800 } else if (len == 4) { 801 tcg_out32(s, offset); 802 } 803} 804 805static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 806 int index, int shift, intptr_t offset) 807{ 808 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 809 tcg_out_sib_offset(s, r, rm, index, shift, offset); 810} 811 812static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 813 int rm, int index, int shift, 814 intptr_t offset) 815{ 816 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 817 tcg_out_sib_offset(s, r, rm, index, shift, offset); 818} 819 820/* A simplification of the above with no index or shift. */ 821static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 822 int rm, intptr_t offset) 823{ 824 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 825} 826 827static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 828 int v, int rm, intptr_t offset) 829{ 830 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 831} 832 833/* Output an opcode with an expected reference to the constant pool. */ 834static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 835{ 836 tcg_out_opc(s, opc, r, 0, 0); 837 /* Absolute for 32-bit, pc-relative for 64-bit. 
*/ 838 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 839 tcg_out32(s, 0); 840} 841 842/* Output an opcode with an expected reference to the constant pool. */ 843static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 844{ 845 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 846 /* Absolute for 32-bit, pc-relative for 64-bit. */ 847 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 848 tcg_out32(s, 0); 849} 850 851/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 852static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 853{ 854 /* Propagate an opcode prefix, such as P_REXW. */ 855 int ext = subop & ~0x7; 856 subop &= 0x7; 857 858 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 859} 860 861static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 862{ 863 int rexw = 0; 864 865 if (arg == ret) { 866 return true; 867 } 868 switch (type) { 869 case TCG_TYPE_I64: 870 rexw = P_REXW; 871 /* fallthru */ 872 case TCG_TYPE_I32: 873 if (ret < 16) { 874 if (arg < 16) { 875 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 876 } else { 877 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 878 } 879 } else { 880 if (arg < 16) { 881 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 882 } else { 883 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 884 } 885 } 886 break; 887 888 case TCG_TYPE_V64: 889 tcg_debug_assert(ret >= 16 && arg >= 16); 890 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 891 break; 892 case TCG_TYPE_V128: 893 tcg_debug_assert(ret >= 16 && arg >= 16); 894 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 895 break; 896 case TCG_TYPE_V256: 897 tcg_debug_assert(ret >= 16 && arg >= 16); 898 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 899 break; 900 901 default: 902 g_assert_not_reached(); 903 } 904 return true; 905} 906 907static const int avx2_dup_insn[4] = { 908 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 909 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 910}; 911 912static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 913 TCGReg r, TCGReg a) 914{ 915 if (have_avx2) { 916 tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type); 917 } else { 918 switch (vece) { 919 case MO_8: 920 /* ??? With zero in a register, use PSHUFB. */ 921 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 922 a = r; 923 /* FALLTHRU */ 924 case MO_16: 925 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 926 a = r; 927 /* FALLTHRU */ 928 case MO_32: 929 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 930 /* imm8 operand: all output lanes selected from input lane 0. */ 931 tcg_out8(s, 0); 932 break; 933 case MO_64: 934 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 935 break; 936 default: 937 g_assert_not_reached(); 938 } 939 } 940 return true; 941} 942 943static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 944 TCGReg r, TCGReg base, intptr_t offset) 945{ 946 if (have_avx2) { 947 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 948 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 949 r, 0, base, offset); 950 } else { 951 switch (vece) { 952 case MO_64: 953 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 954 break; 955 case MO_32: 956 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 957 break; 958 case MO_16: 959 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 960 tcg_out8(s, 0); /* imm8 */ 961 tcg_out_dup_vec(s, type, vece, r, r); 962 break; 963 case MO_8: 964 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 965 tcg_out8(s, 0); /* imm8 */ 966 tcg_out_dup_vec(s, type, vece, r, r); 967 break; 968 default: 969 g_assert_not_reached(); 970 } 971 } 972 return true; 973} 974 975static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 976 TCGReg ret, int64_t arg) 977{ 978 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 979 980 if (arg == 0) { 981 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 982 return; 983 } 984 if (arg == -1) { 985 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 986 return; 987 } 988 989 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 990 if (have_avx2) { 991 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 992 } else { 993 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 994 } 995 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 996 } else { 997 if (type == TCG_TYPE_V64) { 998 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 999 } else if (have_avx2) { 1000 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 1001 } else { 1002 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1003 } 1004 if (TCG_TARGET_REG_BITS == 64) { 1005 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1006 } else { 1007 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1008 } 1009 } 1010} 1011 1012static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1013 TCGReg ret, tcg_target_long arg) 1014{ 1015 if (arg == 0) { 1016 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1017 return; 1018 } 1019 if (arg == -1) { 1020 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1021 return; 1022 } 1023 1024 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1025 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1026 if (TCG_TARGET_REG_BITS == 64) { 1027 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1028 } else { 1029 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1030 } 1031} 1032 1033static void tcg_out_movi_int(TCGContext *s, TCGType type, 1034 TCGReg ret, tcg_target_long arg) 1035{ 1036 tcg_target_long diff; 1037 1038 if (arg == 0) { 1039 tgen_arithr(s, ARITH_XOR, ret, ret); 1040 return; 1041 } 1042 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1043 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1044 tcg_out32(s, arg); 1045 return; 1046 } 1047 if (arg == (int32_t)arg) { 1048 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1049 tcg_out32(s, arg); 1050 return; 1051 } 1052 1053 /* Try a 7 byte pc-relative lea before the 10 byte movq. 
*/
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

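/*
 * Store the constant VAL directly to BASE + OFS with a MOV immediate.
 * Returns false when the constant cannot be encoded (a 64-bit value
 * that is not sign-extended from 32 bits, or an unsupported type),
 * in which case the caller must materialise the value in a register.
 */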
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16. */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ?
0 : P_REXW; 1283 /* movsbl */ 1284 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1285 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1286} 1287 1288static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1289{ 1290 /* movzwl */ 1291 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1292} 1293 1294static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1295{ 1296 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1297 /* movsw[lq] */ 1298 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1299} 1300 1301static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1302{ 1303 /* 32-bit mov zero extends. */ 1304 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1305} 1306 1307static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1308{ 1309 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1310 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1311} 1312 1313static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1314{ 1315 tcg_out_ext32s(s, dest, src); 1316} 1317 1318static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1319{ 1320 if (dest != src) { 1321 tcg_out_ext32u(s, dest, src); 1322 } 1323} 1324 1325static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1326{ 1327 tcg_out_ext32u(s, dest, src); 1328} 1329 1330static inline void tcg_out_bswap64(TCGContext *s, int reg) 1331{ 1332 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1333} 1334 1335static void tgen_arithi(TCGContext *s, int c, int r0, 1336 tcg_target_long val, int cf) 1337{ 1338 int rexw = 0; 1339 1340 if (TCG_TARGET_REG_BITS == 64) { 1341 rexw = c & -8; 1342 c &= 7; 1343 } 1344 1345 switch (c) { 1346 case ARITH_ADD: 1347 case ARITH_SUB: 1348 if (!cf) { 1349 /* 1350 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1351 * partial flags update stalls on Pentium4 and are not recommended 1352 * by current Intel optimization manuals. 1353 */ 1354 if (val == 1 || val == -1) { 1355 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1356 if (TCG_TARGET_REG_BITS == 64) { 1357 /* 1358 * The single-byte increment encodings are re-tasked 1359 * as the REX prefixes. Use the MODRM encoding. 1360 */ 1361 tcg_out_modrm(s, OPC_GRP5 + rexw, 1362 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1363 } else { 1364 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1365 } 1366 return; 1367 } 1368 if (val == 128) { 1369 /* 1370 * Facilitate using an 8-bit immediate. Carry is inverted 1371 * by this transformation, so do it only if cf == 0. 1372 */ 1373 c ^= ARITH_ADD ^ ARITH_SUB; 1374 val = -128; 1375 } 1376 } 1377 break; 1378 1379 case ARITH_AND: 1380 if (TCG_TARGET_REG_BITS == 64) { 1381 if (val == 0xffffffffu) { 1382 tcg_out_ext32u(s, r0, r0); 1383 return; 1384 } 1385 if (val == (uint32_t)val) { 1386 /* AND with no high bits set can use a 32-bit operation. 
*/ 1387 rexw = 0; 1388 } 1389 } 1390 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1391 tcg_out_ext8u(s, r0, r0); 1392 return; 1393 } 1394 if (val == 0xffffu) { 1395 tcg_out_ext16u(s, r0, r0); 1396 return; 1397 } 1398 break; 1399 1400 case ARITH_OR: 1401 case ARITH_XOR: 1402 if (val >= 0x80 && val <= 0xff 1403 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1404 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1405 tcg_out8(s, val); 1406 return; 1407 } 1408 break; 1409 } 1410 1411 if (val == (int8_t)val) { 1412 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1413 tcg_out8(s, val); 1414 return; 1415 } 1416 if (rexw == 0 || val == (int32_t)val) { 1417 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1418 tcg_out32(s, val); 1419 return; 1420 } 1421 1422 g_assert_not_reached(); 1423} 1424 1425static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1426{ 1427 if (val != 0) { 1428 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1429 } 1430} 1431 1432/* Set SMALL to force a short forward branch. */ 1433static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1434{ 1435 int32_t val, val1; 1436 1437 if (l->has_value) { 1438 val = tcg_pcrel_diff(s, l->u.value_ptr); 1439 val1 = val - 2; 1440 if ((int8_t)val1 == val1) { 1441 if (opc == -1) { 1442 tcg_out8(s, OPC_JMP_short); 1443 } else { 1444 tcg_out8(s, OPC_JCC_short + opc); 1445 } 1446 tcg_out8(s, val1); 1447 } else { 1448 tcg_debug_assert(!small); 1449 if (opc == -1) { 1450 tcg_out8(s, OPC_JMP_long); 1451 tcg_out32(s, val - 5); 1452 } else { 1453 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1454 tcg_out32(s, val - 6); 1455 } 1456 } 1457 } else if (small) { 1458 if (opc == -1) { 1459 tcg_out8(s, OPC_JMP_short); 1460 } else { 1461 tcg_out8(s, OPC_JCC_short + opc); 1462 } 1463 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1464 s->code_ptr += 1; 1465 } else { 1466 if (opc == -1) { 1467 tcg_out8(s, OPC_JMP_long); 1468 } else { 1469 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1470 } 1471 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1472 s->code_ptr += 4; 1473 } 1474} 1475 1476static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1477 TCGArg arg2, int const_arg2, int rexw) 1478{ 1479 int jz, js; 1480 1481 if (!is_tst_cond(cond)) { 1482 if (!const_arg2) { 1483 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1484 } else if (arg2 == 0) { 1485 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1486 } else { 1487 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1488 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1489 } 1490 return tcg_cond_to_jcc[cond]; 1491 } 1492 1493 jz = tcg_cond_to_jcc[cond]; 1494 js = (cond == TCG_COND_TSTNE ? 
JCC_JS : JCC_JNS); 1495 1496 if (!const_arg2) { 1497 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1498 return jz; 1499 } 1500 1501 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1502 if (arg2 == 0x80) { 1503 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1504 return js; 1505 } 1506 if (arg2 == 0xff) { 1507 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1508 return jz; 1509 } 1510 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1511 tcg_out8(s, arg2); 1512 return jz; 1513 } 1514 1515 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1516 if (arg2 == 0x8000) { 1517 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1518 return js; 1519 } 1520 if (arg2 == 0xff00) { 1521 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1522 return jz; 1523 } 1524 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1525 tcg_out8(s, arg2 >> 8); 1526 return jz; 1527 } 1528 1529 if (arg2 == 0xffff) { 1530 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1531 return jz; 1532 } 1533 if (arg2 == 0xffffffffu) { 1534 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1535 return jz; 1536 } 1537 1538 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1539 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1540 int sh = ctz64(arg2); 1541 1542 rexw = (sh & 32 ? P_REXW : 0); 1543 if ((sh & 31) == 31) { 1544 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1545 return js; 1546 } else { 1547 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1548 tcg_out8(s, sh); 1549 return jc; 1550 } 1551 } 1552 1553 if (rexw) { 1554 if (arg2 == (uint32_t)arg2) { 1555 rexw = 0; 1556 } else { 1557 tcg_debug_assert(arg2 == (int32_t)arg2); 1558 } 1559 } 1560 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1561 tcg_out32(s, arg2); 1562 return jz; 1563} 1564 1565static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1566 TCGArg arg1, TCGArg arg2, int const_arg2, 1567 TCGLabel *label, bool small) 1568{ 1569 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1570 tcg_out_jxx(s, jcc, label, small); 1571} 1572 1573#if TCG_TARGET_REG_BITS == 32 1574static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1575 const int *const_args, bool small) 1576{ 1577 TCGLabel *label_next = gen_new_label(); 1578 TCGLabel *label_this = arg_label(args[5]); 1579 TCGCond cond = args[4]; 1580 1581 switch (cond) { 1582 case TCG_COND_EQ: 1583 case TCG_COND_TSTEQ: 1584 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1585 args[0], args[2], const_args[2], label_next, 1); 1586 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1587 label_this, small); 1588 break; 1589 case TCG_COND_NE: 1590 case TCG_COND_TSTNE: 1591 tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2], 1592 label_this, small); 1593 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1594 label_this, small); 1595 break; 1596 case TCG_COND_LT: 1597 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1598 label_this, small); 1599 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1600 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1601 label_this, small); 1602 break; 1603 case TCG_COND_LE: 1604 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1605 label_this, small); 1606 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1607 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1608 label_this, small); 1609 break; 1610 case TCG_COND_GT: 1611 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1612 label_this, small); 1613 tcg_out_jxx(s, JCC_JNE, label_next, 1); 
1614 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1615 label_this, small); 1616 break; 1617 case TCG_COND_GE: 1618 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1619 label_this, small); 1620 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1621 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1622 label_this, small); 1623 break; 1624 case TCG_COND_LTU: 1625 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1626 label_this, small); 1627 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1628 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1629 label_this, small); 1630 break; 1631 case TCG_COND_LEU: 1632 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1633 label_this, small); 1634 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1635 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1636 label_this, small); 1637 break; 1638 case TCG_COND_GTU: 1639 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1640 label_this, small); 1641 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1642 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1643 label_this, small); 1644 break; 1645 case TCG_COND_GEU: 1646 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1647 label_this, small); 1648 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1649 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1650 label_this, small); 1651 break; 1652 default: 1653 g_assert_not_reached(); 1654 } 1655 tcg_out_label(s, label_next); 1656} 1657#endif 1658 1659static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond, 1660 TCGArg dest, TCGArg arg1, TCGArg arg2, 1661 int const_arg2, bool neg) 1662{ 1663 int cmp_rexw = rexw; 1664 bool inv = false; 1665 bool cleared; 1666 int jcc; 1667 1668 switch (cond) { 1669 case TCG_COND_NE: 1670 inv = true; 1671 /* fall through */ 1672 case TCG_COND_EQ: 1673 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1674 if (const_arg2 && arg2 == 0) { 1675 arg2 = 1; 1676 goto do_ltu; 1677 } 1678 break; 1679 1680 case TCG_COND_TSTNE: 1681 inv = true; 1682 /* fall through */ 1683 case TCG_COND_TSTEQ: 1684 /* If arg2 is -1, convert to LTU/GEU vs 1. */ 1685 if (const_arg2 && arg2 == 0xffffffffu) { 1686 arg2 = 1; 1687 cmp_rexw = 0; 1688 goto do_ltu; 1689 } 1690 break; 1691 1692 case TCG_COND_LEU: 1693 inv = true; 1694 /* fall through */ 1695 case TCG_COND_GTU: 1696 /* If arg2 is a register, swap for LTU/GEU. */ 1697 if (!const_arg2) { 1698 TCGReg t = arg1; 1699 arg1 = arg2; 1700 arg2 = t; 1701 goto do_ltu; 1702 } 1703 break; 1704 1705 case TCG_COND_GEU: 1706 inv = true; 1707 /* fall through */ 1708 case TCG_COND_LTU: 1709 do_ltu: 1710 /* 1711 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1712 * We can then use NEG or INC to produce the desired result. 1713 * This is always smaller than the SETCC expansion. 1714 */ 1715 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1716 1717 /* X - X - C = -C = (C ? -1 : 0) */ 1718 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1719 if (inv && neg) { 1720 /* ~(C ? -1 : 0) = (C ? 0 : -1) */ 1721 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1722 } else if (inv) { 1723 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1724 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1725 } else if (!neg) { 1726 /* -(C ? -1 : 0) = (C ? 
1 : 0) */ 1727 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1728 } 1729 return; 1730 1731 case TCG_COND_GE: 1732 inv = true; 1733 /* fall through */ 1734 case TCG_COND_LT: 1735 /* If arg2 is 0, extract the sign bit. */ 1736 if (const_arg2 && arg2 == 0) { 1737 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1); 1738 if (inv) { 1739 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1740 } 1741 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1742 dest, rexw ? 63 : 31); 1743 return; 1744 } 1745 break; 1746 1747 default: 1748 break; 1749 } 1750 1751 /* 1752 * If dest does not overlap the inputs, clearing it first is preferred. 1753 * The XOR breaks any false dependency for the low-byte write to dest, 1754 * and is also one byte smaller than MOVZBL. 1755 */ 1756 cleared = false; 1757 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1758 tgen_arithr(s, ARITH_XOR, dest, dest); 1759 cleared = true; 1760 } 1761 1762 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1763 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1764 1765 if (!cleared) { 1766 tcg_out_ext8u(s, dest, dest); 1767 } 1768 if (neg) { 1769 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1770 } 1771} 1772 1773#if TCG_TARGET_REG_BITS == 32 1774static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1775 const int *const_args) 1776{ 1777 TCGArg new_args[6]; 1778 TCGLabel *label_true, *label_over; 1779 1780 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1781 1782 if (args[0] == args[1] || args[0] == args[2] 1783 || (!const_args[3] && args[0] == args[3]) 1784 || (!const_args[4] && args[0] == args[4])) { 1785 /* When the destination overlaps with one of the argument 1786 registers, don't do anything tricky. */ 1787 label_true = gen_new_label(); 1788 label_over = gen_new_label(); 1789 1790 new_args[5] = label_arg(label_true); 1791 tcg_out_brcond2(s, new_args, const_args+1, 1); 1792 1793 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1794 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1795 tcg_out_label(s, label_true); 1796 1797 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1798 tcg_out_label(s, label_over); 1799 } else { 1800 /* When the destination does not overlap one of the arguments, 1801 clear the destination first, jump if cond false, and emit an 1802 increment in the true case. This results in smaller code. */ 1803 1804 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1805 1806 label_over = gen_new_label(); 1807 new_args[4] = tcg_invert_cond(new_args[4]); 1808 new_args[5] = label_arg(label_over); 1809 tcg_out_brcond2(s, new_args, const_args+1, 1); 1810 1811 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1812 tcg_out_label(s, label_over); 1813 } 1814} 1815#endif 1816 1817static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1818 TCGReg dest, TCGReg v1) 1819{ 1820 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1821} 1822 1823static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1824 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1825 TCGReg v1) 1826{ 1827 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1828 tcg_out_cmov(s, jcc, rexw, dest, v1); 1829} 1830 1831static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1832 TCGArg arg2, bool const_a2) 1833{ 1834 if (have_bmi1) { 1835 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1); 1836 if (const_a2) { 1837 tcg_debug_assert(arg2 == (rexw ? 
64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count. */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test. */
        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
        tcg_out_cmov(s, jcc, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument on the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
#endif
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

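/*
 * Decomposed host addressing mode filled in by prepare_host_addr() for
 * the qemu_ld/st fast path: base register, optional index register
 * (negative when absent), constant displacement, optional segment
 * override prefix (e.g. P_GS for the user-only guest base), and the
 * atomicity/alignment requirements of the access.
 */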
1928 */ 1929 tcg_debug_assert(n >= 1); 1930 for (i = 1; i < n; ++i) { 1931 tcg_out8(s, 0x66); 1932 } 1933 tcg_out8(s, 0x90); 1934} 1935 1936typedef struct { 1937 TCGReg base; 1938 int index; 1939 int ofs; 1940 int seg; 1941 TCGAtomAlign aa; 1942} HostAddress; 1943 1944bool tcg_target_has_memory_bswap(MemOp memop) 1945{ 1946 TCGAtomAlign aa; 1947 1948 if (!have_movbe) { 1949 return false; 1950 } 1951 if ((memop & MO_SIZE) < MO_128) { 1952 return true; 1953 } 1954 1955 /* 1956 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 1957 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 1958 */ 1959 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 1960 return aa.atom < MO_128; 1961} 1962 1963/* 1964 * Because i686 has no register parameters and because x86_64 has xchg 1965 * to handle addr/data register overlap, we have placed all input arguments 1966 * before we need might need a scratch reg. 1967 * 1968 * Even then, a scratch is only needed for l->raddr. Rather than expose 1969 * a general-purpose scratch when we don't actually know it's available, 1970 * use the ra_gen hook to load into RAX if needed. 1971 */ 1972#if TCG_TARGET_REG_BITS == 64 1973static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 1974{ 1975 if (arg < 0) { 1976 arg = TCG_REG_RAX; 1977 } 1978 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 1979 return arg; 1980} 1981static const TCGLdstHelperParam ldst_helper_param = { 1982 .ra_gen = ldst_ra_gen 1983}; 1984#else 1985static const TCGLdstHelperParam ldst_helper_param = { }; 1986#endif 1987 1988static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 1989 TCGReg l, TCGReg h, TCGReg v) 1990{ 1991 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1992 1993 /* vpmov{d,q} %v, %l */ 1994 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 1995 /* vpextr{d,q} $1, %v, %h */ 1996 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 1997 tcg_out8(s, 1); 1998} 1999 2000static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2001 TCGReg v, TCGReg l, TCGReg h) 2002{ 2003 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2004 2005 /* vmov{d,q} %l, %v */ 2006 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2007 /* vpinsr{d,q} $1, %h, %v, %v */ 2008 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2009 tcg_out8(s, 1); 2010} 2011 2012/* 2013 * Generate code for the slow path for a load at the end of block 2014 */ 2015static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2016{ 2017 MemOp opc = get_memop(l->oi); 2018 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2019 2020 /* resolve label address */ 2021 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2022 if (label_ptr[1]) { 2023 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2024 } 2025 2026 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2027 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2028 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2029 2030 tcg_out_jmp(s, l->raddr); 2031 return true; 2032} 2033 2034/* 2035 * Generate code for the slow path for a store at the end of block 2036 */ 2037static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2038{ 2039 MemOp opc = get_memop(l->oi); 2040 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2041 2042 /* resolve label address */ 2043 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2044 if (label_ptr[1]) { 2045 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2046 } 2047 2048 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2049 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2050 2051 tcg_out_jmp(s, l->raddr); 2052 return true; 2053} 2054 2055#ifdef CONFIG_USER_ONLY 2056static HostAddress x86_guest_base = { 2057 .index = -1 2058}; 2059 2060#if defined(__x86_64__) && defined(__linux__) 2061# include <asm/prctl.h> 2062# include <sys/prctl.h> 2063int arch_prctl(int code, unsigned long addr); 2064static inline int setup_guest_base_seg(void) 2065{ 2066 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2067 return P_GS; 2068 } 2069 return 0; 2070} 2071#define setup_guest_base_seg setup_guest_base_seg 2072#elif defined(__x86_64__) && \ 2073 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2074# include <machine/sysarch.h> 2075static inline int setup_guest_base_seg(void) 2076{ 2077 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2078 return P_GS; 2079 } 2080 return 0; 2081} 2082#define setup_guest_base_seg setup_guest_base_seg 2083#endif 2084#else 2085# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2086#endif /* CONFIG_USER_ONLY */ 2087#ifndef setup_guest_base_seg 2088# define setup_guest_base_seg() 0 2089#endif 2090 2091#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2092 2093/* 2094 * For softmmu, perform the TLB load and compare. 2095 * For useronly, perform any required alignment tests. 2096 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2097 * is required and fill in @h with the host address for the fast path. 2098 */ 2099static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2100 TCGReg addrlo, TCGReg addrhi, 2101 MemOpIdx oi, bool is_ld) 2102{ 2103 TCGLabelQemuLdst *ldst = NULL; 2104 MemOp opc = get_memop(oi); 2105 MemOp s_bits = opc & MO_SIZE; 2106 unsigned a_mask; 2107 2108 if (tcg_use_softmmu) { 2109 h->index = TCG_REG_L0; 2110 h->ofs = 0; 2111 h->seg = 0; 2112 } else { 2113 *h = x86_guest_base; 2114 } 2115 h->base = addrlo; 2116 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2117 a_mask = (1 << h->aa.align) - 1; 2118 2119 if (tcg_use_softmmu) { 2120 int cmp_ofs = is_ld ? 
offsetof(CPUTLBEntry, addr_read) 2121 : offsetof(CPUTLBEntry, addr_write); 2122 TCGType ttype = TCG_TYPE_I32; 2123 TCGType tlbtype = TCG_TYPE_I32; 2124 int trexw = 0, hrexw = 0, tlbrexw = 0; 2125 unsigned mem_index = get_mmuidx(oi); 2126 unsigned s_mask = (1 << s_bits) - 1; 2127 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2128 int tlb_mask; 2129 2130 ldst = new_ldst_label(s); 2131 ldst->is_ld = is_ld; 2132 ldst->oi = oi; 2133 ldst->addrlo_reg = addrlo; 2134 ldst->addrhi_reg = addrhi; 2135 2136 if (TCG_TARGET_REG_BITS == 64) { 2137 ttype = s->addr_type; 2138 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2139 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2140 hrexw = P_REXW; 2141 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2142 tlbtype = TCG_TYPE_I64; 2143 tlbrexw = P_REXW; 2144 } 2145 } 2146 } 2147 2148 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 2149 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2150 s->page_bits - CPU_TLB_ENTRY_BITS); 2151 2152 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2153 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2154 2155 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2156 fast_ofs + offsetof(CPUTLBDescFast, table)); 2157 2158 /* 2159 * If the required alignment is at least as large as the access, 2160 * simply copy the address and mask. For lesser alignments, 2161 * check that we don't cross pages for the complete access. 2162 */ 2163 if (a_mask >= s_mask) { 2164 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 2165 } else { 2166 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2167 addrlo, s_mask - a_mask); 2168 } 2169 tlb_mask = s->page_mask | a_mask; 2170 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2171 2172 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2173 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2174 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2175 2176 /* jne slow_path */ 2177 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2178 ldst->label_ptr[0] = s->code_ptr; 2179 s->code_ptr += 4; 2180 2181 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { 2182 /* cmp 4(TCG_REG_L0), addrhi */ 2183 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, 2184 TCG_REG_L0, cmp_ofs + 4); 2185 2186 /* jne slow_path */ 2187 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2188 ldst->label_ptr[1] = s->code_ptr; 2189 s->code_ptr += 4; 2190 } 2191 2192 /* TLB Hit. */ 2193 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2194 offsetof(CPUTLBEntry, addend)); 2195 } else if (a_mask) { 2196 int jcc; 2197 2198 ldst = new_ldst_label(s); 2199 ldst->is_ld = is_ld; 2200 ldst->oi = oi; 2201 ldst->addrlo_reg = addrlo; 2202 ldst->addrhi_reg = addrhi; 2203 2204 /* jne slow_path */ 2205 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false); 2206 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2207 ldst->label_ptr[0] = s->code_ptr; 2208 s->code_ptr += 4; 2209 } 2210 2211 return ldst; 2212} 2213 2214static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2215 HostAddress h, TCGType type, MemOp memop) 2216{ 2217 bool use_movbe = false; 2218 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2219 int movop = OPC_MOVL_GvEv; 2220 2221 /* Do big-endian loads with movbe. 
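       MOVBE only loads or stores with a byte swap; it has no sign- or
       zero-extending forms, so the MO_UW/MO_SW/MO_SL cases below either
       pre-clear the destination or extend the result separately.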
*/ 2222 if (memop & MO_BSWAP) { 2223 tcg_debug_assert(have_movbe); 2224 use_movbe = true; 2225 movop = OPC_MOVBE_GyMy; 2226 } 2227 2228 switch (memop & MO_SSIZE) { 2229 case MO_UB: 2230 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2231 h.base, h.index, 0, h.ofs); 2232 break; 2233 case MO_SB: 2234 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2235 h.base, h.index, 0, h.ofs); 2236 break; 2237 case MO_UW: 2238 if (use_movbe) { 2239 /* There is no extending movbe; only low 16-bits are modified. */ 2240 if (datalo != h.base && datalo != h.index) { 2241 /* XOR breaks dependency chains. */ 2242 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2243 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2244 datalo, h.base, h.index, 0, h.ofs); 2245 } else { 2246 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2247 datalo, h.base, h.index, 0, h.ofs); 2248 tcg_out_ext16u(s, datalo, datalo); 2249 } 2250 } else { 2251 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2252 h.base, h.index, 0, h.ofs); 2253 } 2254 break; 2255 case MO_SW: 2256 if (use_movbe) { 2257 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2258 datalo, h.base, h.index, 0, h.ofs); 2259 tcg_out_ext16s(s, type, datalo, datalo); 2260 } else { 2261 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2262 datalo, h.base, h.index, 0, h.ofs); 2263 } 2264 break; 2265 case MO_UL: 2266 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2267 h.base, h.index, 0, h.ofs); 2268 break; 2269#if TCG_TARGET_REG_BITS == 64 2270 case MO_SL: 2271 if (use_movbe) { 2272 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2273 h.base, h.index, 0, h.ofs); 2274 tcg_out_ext32s(s, datalo, datalo); 2275 } else { 2276 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2277 h.base, h.index, 0, h.ofs); 2278 } 2279 break; 2280#endif 2281 case MO_UQ: 2282 if (TCG_TARGET_REG_BITS == 64) { 2283 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2284 h.base, h.index, 0, h.ofs); 2285 break; 2286 } 2287 if (use_movbe) { 2288 TCGReg t = datalo; 2289 datalo = datahi; 2290 datahi = t; 2291 } 2292 if (h.base == datalo || h.index == datalo) { 2293 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2294 h.base, h.index, 0, h.ofs); 2295 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2296 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2297 } else { 2298 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2299 h.base, h.index, 0, h.ofs); 2300 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2301 h.base, h.index, 0, h.ofs + 4); 2302 } 2303 break; 2304 2305 case MO_128: 2306 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2307 2308 /* 2309 * Without 16-byte atomicity, use integer regs. 2310 * That is where we want the data, and it allows bswaps. 2311 */ 2312 if (h.aa.atom < MO_128) { 2313 if (use_movbe) { 2314 TCGReg t = datalo; 2315 datalo = datahi; 2316 datahi = t; 2317 } 2318 if (h.base == datalo || h.index == datalo) { 2319 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2320 h.base, h.index, 0, h.ofs); 2321 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2322 datalo, datahi, 0); 2323 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2324 datahi, datahi, 8); 2325 } else { 2326 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2327 h.base, h.index, 0, h.ofs); 2328 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2329 h.base, h.index, 0, h.ofs + 8); 2330 } 2331 break; 2332 } 2333 2334 /* 2335 * With 16-byte atomicity, a vector load is required. 
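         * The value is loaded into TCG_TMP_VEC and then split into the
         * integer register pair via tcg_out_vec_to_pair below.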
2336 * If we already have 16-byte alignment, then VMOVDQA always works. 2337 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2338 * Else use we require a runtime test for alignment for VMOVDQA; 2339 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2340 */ 2341 if (h.aa.align >= MO_128) { 2342 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2343 TCG_TMP_VEC, 0, 2344 h.base, h.index, 0, h.ofs); 2345 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2346 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2347 TCG_TMP_VEC, 0, 2348 h.base, h.index, 0, h.ofs); 2349 } else { 2350 TCGLabel *l1 = gen_new_label(); 2351 TCGLabel *l2 = gen_new_label(); 2352 int jcc; 2353 2354 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2355 tcg_out_jxx(s, jcc, l1, true); 2356 2357 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2358 TCG_TMP_VEC, 0, 2359 h.base, h.index, 0, h.ofs); 2360 tcg_out_jxx(s, JCC_JMP, l2, true); 2361 2362 tcg_out_label(s, l1); 2363 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2364 TCG_TMP_VEC, 0, 2365 h.base, h.index, 0, h.ofs); 2366 tcg_out_label(s, l2); 2367 } 2368 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2369 break; 2370 2371 default: 2372 g_assert_not_reached(); 2373 } 2374} 2375 2376static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2377 TCGReg addrlo, TCGReg addrhi, 2378 MemOpIdx oi, TCGType data_type) 2379{ 2380 TCGLabelQemuLdst *ldst; 2381 HostAddress h; 2382 2383 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); 2384 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2385 2386 if (ldst) { 2387 ldst->type = data_type; 2388 ldst->datalo_reg = datalo; 2389 ldst->datahi_reg = datahi; 2390 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2391 } 2392} 2393 2394static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2395 HostAddress h, MemOp memop) 2396{ 2397 bool use_movbe = false; 2398 int movop = OPC_MOVL_EvGv; 2399 2400 /* 2401 * Do big-endian stores with movbe or system-mode. 2402 * User-only without movbe will have its swapping done generically. 2403 */ 2404 if (memop & MO_BSWAP) { 2405 tcg_debug_assert(have_movbe); 2406 use_movbe = true; 2407 movop = OPC_MOVBE_MyGy; 2408 } 2409 2410 switch (memop & MO_SIZE) { 2411 case MO_8: 2412 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2413 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2414 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2415 datalo, h.base, h.index, 0, h.ofs); 2416 break; 2417 case MO_16: 2418 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2419 h.base, h.index, 0, h.ofs); 2420 break; 2421 case MO_32: 2422 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2423 h.base, h.index, 0, h.ofs); 2424 break; 2425 case MO_64: 2426 if (TCG_TARGET_REG_BITS == 64) { 2427 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2428 h.base, h.index, 0, h.ofs); 2429 } else { 2430 if (use_movbe) { 2431 TCGReg t = datalo; 2432 datalo = datahi; 2433 datahi = t; 2434 } 2435 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2436 h.base, h.index, 0, h.ofs); 2437 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2438 h.base, h.index, 0, h.ofs + 4); 2439 } 2440 break; 2441 2442 case MO_128: 2443 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2444 2445 /* 2446 * Without 16-byte atomicity, use integer regs. 2447 * That is where we have the data, and it allows bswaps. 
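         * For MO_128 this becomes a pair of 8-byte stores at ofs and
         * ofs + 8; for bswap the datalo/datahi pair is exchanged first.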
2448 */ 2449 if (h.aa.atom < MO_128) { 2450 if (use_movbe) { 2451 TCGReg t = datalo; 2452 datalo = datahi; 2453 datahi = t; 2454 } 2455 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2456 h.base, h.index, 0, h.ofs); 2457 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2458 h.base, h.index, 0, h.ofs + 8); 2459 break; 2460 } 2461 2462 /* 2463 * With 16-byte atomicity, a vector store is required. 2464 * If we already have 16-byte alignment, then VMOVDQA always works. 2465 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2466 * Else use we require a runtime test for alignment for VMOVDQA; 2467 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2468 */ 2469 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2470 if (h.aa.align >= MO_128) { 2471 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2472 TCG_TMP_VEC, 0, 2473 h.base, h.index, 0, h.ofs); 2474 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2475 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2476 TCG_TMP_VEC, 0, 2477 h.base, h.index, 0, h.ofs); 2478 } else { 2479 TCGLabel *l1 = gen_new_label(); 2480 TCGLabel *l2 = gen_new_label(); 2481 int jcc; 2482 2483 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2484 tcg_out_jxx(s, jcc, l1, true); 2485 2486 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2487 TCG_TMP_VEC, 0, 2488 h.base, h.index, 0, h.ofs); 2489 tcg_out_jxx(s, JCC_JMP, l2, true); 2490 2491 tcg_out_label(s, l1); 2492 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2493 TCG_TMP_VEC, 0, 2494 h.base, h.index, 0, h.ofs); 2495 tcg_out_label(s, l2); 2496 } 2497 break; 2498 2499 default: 2500 g_assert_not_reached(); 2501 } 2502} 2503 2504static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2505 TCGReg addrlo, TCGReg addrhi, 2506 MemOpIdx oi, TCGType data_type) 2507{ 2508 TCGLabelQemuLdst *ldst; 2509 HostAddress h; 2510 2511 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); 2512 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2513 2514 if (ldst) { 2515 ldst->type = data_type; 2516 ldst->datalo_reg = datalo; 2517 ldst->datahi_reg = datahi; 2518 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2519 } 2520} 2521 2522static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2523{ 2524 /* Reuse the zeroing that exists for goto_ptr. 
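       The epilogue entry (tcg_code_gen_epilogue, set up in
       tcg_target_qemu_prologue) zeroes EAX and falls through to
       tb_ret_addr, so a return value of 0 needs no explicit movi.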
*/ 2525 if (a0 == 0) { 2526 tcg_out_jmp(s, tcg_code_gen_epilogue); 2527 } else { 2528 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2529 tcg_out_jmp(s, tb_ret_addr); 2530 } 2531} 2532 2533static void tcg_out_goto_tb(TCGContext *s, int which) 2534{ 2535 /* 2536 * Jump displacement must be aligned for atomic patching; 2537 * see if we need to add extra nops before jump 2538 */ 2539 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2540 if (gap != 1) { 2541 tcg_out_nopn(s, gap - 1); 2542 } 2543 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2544 set_jmp_insn_offset(s, which); 2545 tcg_out32(s, 0); 2546 set_jmp_reset_offset(s, which); 2547} 2548 2549void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2550 uintptr_t jmp_rx, uintptr_t jmp_rw) 2551{ 2552 /* patch the branch destination */ 2553 uintptr_t addr = tb->jmp_target_addr[n]; 2554 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2555 /* no need to flush icache explicitly */ 2556} 2557 2558static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2559 const TCGArg args[TCG_MAX_OP_ARGS], 2560 const int const_args[TCG_MAX_OP_ARGS]) 2561{ 2562 TCGArg a0, a1, a2; 2563 int c, const_a2, vexop, rexw = 0; 2564 2565#if TCG_TARGET_REG_BITS == 64 2566# define OP_32_64(x) \ 2567 case glue(glue(INDEX_op_, x), _i64): \ 2568 rexw = P_REXW; /* FALLTHRU */ \ 2569 case glue(glue(INDEX_op_, x), _i32) 2570#else 2571# define OP_32_64(x) \ 2572 case glue(glue(INDEX_op_, x), _i32) 2573#endif 2574 2575 /* Hoist the loads of the most common arguments. */ 2576 a0 = args[0]; 2577 a1 = args[1]; 2578 a2 = args[2]; 2579 const_a2 = const_args[2]; 2580 2581 switch (opc) { 2582 case INDEX_op_goto_ptr: 2583 /* jmp to the given host address (could be epilogue) */ 2584 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2585 break; 2586 case INDEX_op_br: 2587 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2588 break; 2589 OP_32_64(ld8u): 2590 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2591 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2592 break; 2593 OP_32_64(ld8s): 2594 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2595 break; 2596 OP_32_64(ld16u): 2597 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2598 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2599 break; 2600 OP_32_64(ld16s): 2601 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2602 break; 2603#if TCG_TARGET_REG_BITS == 64 2604 case INDEX_op_ld32u_i64: 2605#endif 2606 case INDEX_op_ld_i32: 2607 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2608 break; 2609 2610 OP_32_64(st8): 2611 if (const_args[0]) { 2612 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2613 tcg_out8(s, a0); 2614 } else { 2615 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2616 } 2617 break; 2618 OP_32_64(st16): 2619 if (const_args[0]) { 2620 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2621 tcg_out16(s, a0); 2622 } else { 2623 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2624 } 2625 break; 2626#if TCG_TARGET_REG_BITS == 64 2627 case INDEX_op_st32_i64: 2628#endif 2629 case INDEX_op_st_i32: 2630 if (const_args[0]) { 2631 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2632 tcg_out32(s, a0); 2633 } else { 2634 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2635 } 2636 break; 2637 2638 OP_32_64(add): 2639 /* For 3-operand addition, use LEA. 
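       e.g.  add a0, a1, $imm  ->  lea imm(%a1), %a0
             add a0, a1, a2    ->  lea (%a1,%a2), %a0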
*/ 2640 if (a0 != a1) { 2641 TCGArg c3 = 0; 2642 if (const_a2) { 2643 c3 = a2, a2 = -1; 2644 } else if (a0 == a2) { 2645 /* Watch out for dest = src + dest, since we've removed 2646 the matching constraint on the add. */ 2647 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2648 break; 2649 } 2650 2651 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2652 break; 2653 } 2654 c = ARITH_ADD; 2655 goto gen_arith; 2656 OP_32_64(sub): 2657 c = ARITH_SUB; 2658 goto gen_arith; 2659 OP_32_64(and): 2660 c = ARITH_AND; 2661 goto gen_arith; 2662 OP_32_64(or): 2663 c = ARITH_OR; 2664 goto gen_arith; 2665 OP_32_64(xor): 2666 c = ARITH_XOR; 2667 goto gen_arith; 2668 gen_arith: 2669 if (const_a2) { 2670 tgen_arithi(s, c + rexw, a0, a2, 0); 2671 } else { 2672 tgen_arithr(s, c + rexw, a0, a2); 2673 } 2674 break; 2675 2676 OP_32_64(andc): 2677 if (const_a2) { 2678 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2679 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2680 } else { 2681 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2682 } 2683 break; 2684 2685 OP_32_64(mul): 2686 if (const_a2) { 2687 int32_t val; 2688 val = a2; 2689 if (val == (int8_t)val) { 2690 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2691 tcg_out8(s, val); 2692 } else { 2693 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2694 tcg_out32(s, val); 2695 } 2696 } else { 2697 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2698 } 2699 break; 2700 2701 OP_32_64(div2): 2702 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2703 break; 2704 OP_32_64(divu2): 2705 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2706 break; 2707 2708 OP_32_64(shl): 2709 /* For small constant 3-operand shift, use LEA. */ 2710 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2711 if (a2 - 1 == 0) { 2712 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2713 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2714 } else { 2715 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2716 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2717 } 2718 break; 2719 } 2720 c = SHIFT_SHL; 2721 vexop = OPC_SHLX; 2722 goto gen_shift_maybe_vex; 2723 OP_32_64(shr): 2724 c = SHIFT_SHR; 2725 vexop = OPC_SHRX; 2726 goto gen_shift_maybe_vex; 2727 OP_32_64(sar): 2728 c = SHIFT_SAR; 2729 vexop = OPC_SARX; 2730 goto gen_shift_maybe_vex; 2731 OP_32_64(rotl): 2732 c = SHIFT_ROL; 2733 goto gen_shift; 2734 OP_32_64(rotr): 2735 c = SHIFT_ROR; 2736 goto gen_shift; 2737 gen_shift_maybe_vex: 2738 if (have_bmi2) { 2739 if (!const_a2) { 2740 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2741 break; 2742 } 2743 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2744 } 2745 /* FALLTHRU */ 2746 gen_shift: 2747 if (const_a2) { 2748 tcg_out_shifti(s, c + rexw, a0, a2); 2749 } else { 2750 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2751 } 2752 break; 2753 2754 OP_32_64(ctz): 2755 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2756 break; 2757 OP_32_64(clz): 2758 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2759 break; 2760 OP_32_64(ctpop): 2761 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2762 break; 2763 2764 OP_32_64(brcond): 2765 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 2766 arg_label(args[3]), 0); 2767 break; 2768 OP_32_64(setcond): 2769 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false); 2770 break; 2771 OP_32_64(negsetcond): 2772 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true); 2773 break; 2774 OP_32_64(movcond): 2775 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 2776 break; 2777 2778 OP_32_64(bswap16): 2779 if (a2 & TCG_BSWAP_OS) { 2780 /* Output must be sign-extended. */ 2781 if (rexw) { 2782 tcg_out_bswap64(s, a0); 2783 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2784 } else { 2785 tcg_out_bswap32(s, a0); 2786 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2787 } 2788 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2789 /* Output must be zero-extended, but input isn't. */ 2790 tcg_out_bswap32(s, a0); 2791 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2792 } else { 2793 tcg_out_rolw_8(s, a0); 2794 } 2795 break; 2796 OP_32_64(bswap32): 2797 tcg_out_bswap32(s, a0); 2798 if (rexw && (a2 & TCG_BSWAP_OS)) { 2799 tcg_out_ext32s(s, a0, a0); 2800 } 2801 break; 2802 2803 OP_32_64(neg): 2804 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2805 break; 2806 OP_32_64(not): 2807 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2808 break; 2809 2810 case INDEX_op_qemu_ld_a64_i32: 2811 if (TCG_TARGET_REG_BITS == 32) { 2812 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2813 break; 2814 } 2815 /* fall through */ 2816 case INDEX_op_qemu_ld_a32_i32: 2817 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2818 break; 2819 case INDEX_op_qemu_ld_a32_i64: 2820 if (TCG_TARGET_REG_BITS == 64) { 2821 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2822 } else { 2823 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2824 } 2825 break; 2826 case INDEX_op_qemu_ld_a64_i64: 2827 if (TCG_TARGET_REG_BITS == 64) { 2828 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2829 } else { 2830 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2831 } 2832 break; 2833 case INDEX_op_qemu_ld_a32_i128: 2834 case INDEX_op_qemu_ld_a64_i128: 2835 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2836 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2837 break; 2838 2839 case INDEX_op_qemu_st_a64_i32: 2840 case INDEX_op_qemu_st8_a64_i32: 2841 if (TCG_TARGET_REG_BITS == 32) { 2842 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2843 break; 2844 } 2845 /* fall through */ 2846 case INDEX_op_qemu_st_a32_i32: 2847 case INDEX_op_qemu_st8_a32_i32: 2848 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2849 break; 2850 case INDEX_op_qemu_st_a32_i64: 2851 if (TCG_TARGET_REG_BITS == 64) { 2852 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2853 } else { 2854 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2855 } 2856 break; 2857 case INDEX_op_qemu_st_a64_i64: 2858 if (TCG_TARGET_REG_BITS == 64) { 2859 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2860 } else { 2861 
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2862 } 2863 break; 2864 case INDEX_op_qemu_st_a32_i128: 2865 case INDEX_op_qemu_st_a64_i128: 2866 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2867 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2868 break; 2869 2870 OP_32_64(mulu2): 2871 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2872 break; 2873 OP_32_64(muls2): 2874 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2875 break; 2876 OP_32_64(add2): 2877 if (const_args[4]) { 2878 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2879 } else { 2880 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2881 } 2882 if (const_args[5]) { 2883 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2884 } else { 2885 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2886 } 2887 break; 2888 OP_32_64(sub2): 2889 if (const_args[4]) { 2890 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2891 } else { 2892 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2893 } 2894 if (const_args[5]) { 2895 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2896 } else { 2897 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2898 } 2899 break; 2900 2901#if TCG_TARGET_REG_BITS == 32 2902 case INDEX_op_brcond2_i32: 2903 tcg_out_brcond2(s, args, const_args, 0); 2904 break; 2905 case INDEX_op_setcond2_i32: 2906 tcg_out_setcond2(s, args, const_args); 2907 break; 2908#else /* TCG_TARGET_REG_BITS == 64 */ 2909 case INDEX_op_ld32s_i64: 2910 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2911 break; 2912 case INDEX_op_ld_i64: 2913 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2914 break; 2915 case INDEX_op_st_i64: 2916 if (const_args[0]) { 2917 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2918 tcg_out32(s, a0); 2919 } else { 2920 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2921 } 2922 break; 2923 2924 case INDEX_op_bswap64_i64: 2925 tcg_out_bswap64(s, a0); 2926 break; 2927 case INDEX_op_extrh_i64_i32: 2928 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2929 break; 2930#endif 2931 2932 OP_32_64(deposit): 2933 if (args[3] == 0 && args[4] == 8) { 2934 /* load bits 0..7 */ 2935 if (const_a2) { 2936 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 2937 0, a0, 0); 2938 tcg_out8(s, a2); 2939 } else { 2940 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2941 } 2942 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 2943 /* load bits 8..15 */ 2944 if (const_a2) { 2945 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 2946 tcg_out8(s, a2); 2947 } else { 2948 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2949 } 2950 } else if (args[3] == 0 && args[4] == 16) { 2951 /* load bits 0..15 */ 2952 if (const_a2) { 2953 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 2954 0, a0, 0); 2955 tcg_out16(s, a2); 2956 } else { 2957 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2958 } 2959 } else { 2960 g_assert_not_reached(); 2961 } 2962 break; 2963 2964 case INDEX_op_extract_i64: 2965 if (a2 + args[3] == 32) { 2966 /* This is a 32-bit zero-extending right shift. */ 2967 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2968 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2969 break; 2970 } 2971 /* FALLTHRU */ 2972 case INDEX_op_extract_i32: 2973 /* On the off-chance that we can use the high-byte registers. 2974 Otherwise we emit the same ext16 + shift pattern that we 2975 would have gotten from the normal tcg-op.c expansion. 
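           E.g. extract a0, a1, 8, 8 becomes movzbl %ah, %a0 when a1 is
           one of EAX..EBX (so a high-byte register exists) and a0 does
           not require a REX prefix.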
*/ 2976 tcg_debug_assert(a2 == 8 && args[3] == 8); 2977 if (a1 < 4 && a0 < 8) { 2978 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2979 } else { 2980 tcg_out_ext16u(s, a0, a1); 2981 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2982 } 2983 break; 2984 2985 case INDEX_op_sextract_i32: 2986 /* We don't implement sextract_i64, as we cannot sign-extend to 2987 64-bits without using the REX prefix that explicitly excludes 2988 access to the high-byte registers. */ 2989 tcg_debug_assert(a2 == 8 && args[3] == 8); 2990 if (a1 < 4 && a0 < 8) { 2991 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2992 } else { 2993 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 2994 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2995 } 2996 break; 2997 2998 OP_32_64(extract2): 2999 /* Note that SHRD outputs to the r/m operand. */ 3000 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3001 tcg_out8(s, args[3]); 3002 break; 3003 3004 case INDEX_op_mb: 3005 tcg_out_mb(s, a0); 3006 break; 3007 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 3008 case INDEX_op_mov_i64: 3009 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3010 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3011 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3012 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */ 3013 case INDEX_op_ext8s_i64: 3014 case INDEX_op_ext8u_i32: 3015 case INDEX_op_ext8u_i64: 3016 case INDEX_op_ext16s_i32: 3017 case INDEX_op_ext16s_i64: 3018 case INDEX_op_ext16u_i32: 3019 case INDEX_op_ext16u_i64: 3020 case INDEX_op_ext32s_i64: 3021 case INDEX_op_ext32u_i64: 3022 case INDEX_op_ext_i32_i64: 3023 case INDEX_op_extu_i32_i64: 3024 case INDEX_op_extrl_i64_i32: 3025 default: 3026 g_assert_not_reached(); 3027 } 3028 3029#undef OP_32_64 3030} 3031 3032static int const umin_insn[4] = { 3033 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3034}; 3035 3036static int const umax_insn[4] = { 3037 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3038}; 3039 3040static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3041 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3042{ 3043 static int const cmpeq_insn[4] = { 3044 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3045 }; 3046 static int const cmpgt_insn[4] = { 3047 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3048 }; 3049 3050 enum { 3051 NEED_INV = 1, 3052 NEED_SWAP = 2, 3053 NEED_UMIN = 4, 3054 NEED_UMAX = 8, 3055 INVALID = 16, 3056 }; 3057 static const uint8_t cond_fixup[16] = { 3058 [0 ... 15] = INVALID, 3059 [TCG_COND_EQ] = 0, 3060 [TCG_COND_GT] = 0, 3061 [TCG_COND_NE] = NEED_INV, 3062 [TCG_COND_LE] = NEED_INV, 3063 [TCG_COND_LT] = NEED_SWAP, 3064 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3065 [TCG_COND_LEU] = NEED_UMIN, 3066 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3067 [TCG_COND_GEU] = NEED_UMAX, 3068 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3069 }; 3070 int fixup = cond_fixup[cond]; 3071 3072 assert(!(fixup & INVALID)); 3073 3074 if (fixup & NEED_INV) { 3075 cond = tcg_invert_cond(cond); 3076 } 3077 3078 if (fixup & NEED_SWAP) { 3079 TCGReg swap = v1; 3080 v1 = v2; 3081 v2 = swap; 3082 cond = tcg_swap_cond(cond); 3083 } 3084 3085 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3086 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3087 3088 /* avx2 does not have 64-bit min/max; adjusted during expand. 
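           MO_64 unsigned comparisons are biased to signed in
           expand_vec_cond before they reach this point.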
*/ 3089 assert(vece <= MO_32); 3090 3091 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3092 v2 = TCG_TMP_VEC; 3093 cond = TCG_COND_EQ; 3094 } 3095 3096 switch (cond) { 3097 case TCG_COND_EQ: 3098 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3099 break; 3100 case TCG_COND_GT: 3101 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3102 break; 3103 default: 3104 g_assert_not_reached(); 3105 } 3106 return fixup & NEED_INV; 3107} 3108 3109static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3110 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3111{ 3112 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3113 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3114 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3115 } 3116} 3117 3118static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3119 TCGReg v0, TCGReg c1, TCGReg c2, 3120 TCGReg v3, TCGReg v4, TCGCond cond) 3121{ 3122 if (tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond)) { 3123 TCGReg swap = v3; 3124 v3 = v4; 3125 v4 = swap; 3126 } 3127 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3128 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3129} 3130 3131static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3132 unsigned vecl, unsigned vece, 3133 const TCGArg args[TCG_MAX_OP_ARGS], 3134 const int const_args[TCG_MAX_OP_ARGS]) 3135{ 3136 static int const add_insn[4] = { 3137 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3138 }; 3139 static int const ssadd_insn[4] = { 3140 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3141 }; 3142 static int const usadd_insn[4] = { 3143 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3144 }; 3145 static int const sub_insn[4] = { 3146 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3147 }; 3148 static int const sssub_insn[4] = { 3149 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3150 }; 3151 static int const ussub_insn[4] = { 3152 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3153 }; 3154 static int const mul_insn[4] = { 3155 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3156 }; 3157 static int const shift_imm_insn[4] = { 3158 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3159 }; 3160 static int const punpckl_insn[4] = { 3161 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3162 }; 3163 static int const punpckh_insn[4] = { 3164 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3165 }; 3166 static int const packss_insn[4] = { 3167 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3168 }; 3169 static int const packus_insn[4] = { 3170 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3171 }; 3172 static int const smin_insn[4] = { 3173 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3174 }; 3175 static int const smax_insn[4] = { 3176 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3177 }; 3178 static int const rotlv_insn[4] = { 3179 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3180 }; 3181 static int const rotrv_insn[4] = { 3182 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3183 }; 3184 static int const shlv_insn[4] = { 3185 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3186 }; 3187 static int const shrv_insn[4] = { 3188 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3189 }; 3190 static int const sarv_insn[4] = { 3191 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3192 }; 3193 static int const shls_insn[4] = { 3194 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3195 }; 3196 static int const shrs_insn[4] = { 3197 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3198 }; 3199 static int const sars_insn[4] = { 
3200 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3201 }; 3202 static int const vpshldi_insn[4] = { 3203 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3204 }; 3205 static int const vpshldv_insn[4] = { 3206 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3207 }; 3208 static int const vpshrdv_insn[4] = { 3209 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3210 }; 3211 static int const abs_insn[4] = { 3212 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3213 }; 3214 3215 TCGType type = vecl + TCG_TYPE_V64; 3216 int insn, sub; 3217 TCGArg a0, a1, a2, a3; 3218 3219 a0 = args[0]; 3220 a1 = args[1]; 3221 a2 = args[2]; 3222 3223 switch (opc) { 3224 case INDEX_op_add_vec: 3225 insn = add_insn[vece]; 3226 goto gen_simd; 3227 case INDEX_op_ssadd_vec: 3228 insn = ssadd_insn[vece]; 3229 goto gen_simd; 3230 case INDEX_op_usadd_vec: 3231 insn = usadd_insn[vece]; 3232 goto gen_simd; 3233 case INDEX_op_sub_vec: 3234 insn = sub_insn[vece]; 3235 goto gen_simd; 3236 case INDEX_op_sssub_vec: 3237 insn = sssub_insn[vece]; 3238 goto gen_simd; 3239 case INDEX_op_ussub_vec: 3240 insn = ussub_insn[vece]; 3241 goto gen_simd; 3242 case INDEX_op_mul_vec: 3243 insn = mul_insn[vece]; 3244 goto gen_simd; 3245 case INDEX_op_and_vec: 3246 insn = OPC_PAND; 3247 goto gen_simd; 3248 case INDEX_op_or_vec: 3249 insn = OPC_POR; 3250 goto gen_simd; 3251 case INDEX_op_xor_vec: 3252 insn = OPC_PXOR; 3253 goto gen_simd; 3254 case INDEX_op_smin_vec: 3255 insn = smin_insn[vece]; 3256 goto gen_simd; 3257 case INDEX_op_umin_vec: 3258 insn = umin_insn[vece]; 3259 goto gen_simd; 3260 case INDEX_op_smax_vec: 3261 insn = smax_insn[vece]; 3262 goto gen_simd; 3263 case INDEX_op_umax_vec: 3264 insn = umax_insn[vece]; 3265 goto gen_simd; 3266 case INDEX_op_shlv_vec: 3267 insn = shlv_insn[vece]; 3268 goto gen_simd; 3269 case INDEX_op_shrv_vec: 3270 insn = shrv_insn[vece]; 3271 goto gen_simd; 3272 case INDEX_op_sarv_vec: 3273 insn = sarv_insn[vece]; 3274 goto gen_simd; 3275 case INDEX_op_rotlv_vec: 3276 insn = rotlv_insn[vece]; 3277 goto gen_simd; 3278 case INDEX_op_rotrv_vec: 3279 insn = rotrv_insn[vece]; 3280 goto gen_simd; 3281 case INDEX_op_shls_vec: 3282 insn = shls_insn[vece]; 3283 goto gen_simd; 3284 case INDEX_op_shrs_vec: 3285 insn = shrs_insn[vece]; 3286 goto gen_simd; 3287 case INDEX_op_sars_vec: 3288 insn = sars_insn[vece]; 3289 goto gen_simd; 3290 case INDEX_op_x86_punpckl_vec: 3291 insn = punpckl_insn[vece]; 3292 goto gen_simd; 3293 case INDEX_op_x86_punpckh_vec: 3294 insn = punpckh_insn[vece]; 3295 goto gen_simd; 3296 case INDEX_op_x86_packss_vec: 3297 insn = packss_insn[vece]; 3298 goto gen_simd; 3299 case INDEX_op_x86_packus_vec: 3300 insn = packus_insn[vece]; 3301 goto gen_simd; 3302 case INDEX_op_x86_vpshldv_vec: 3303 insn = vpshldv_insn[vece]; 3304 a1 = a2; 3305 a2 = args[3]; 3306 goto gen_simd; 3307 case INDEX_op_x86_vpshrdv_vec: 3308 insn = vpshrdv_insn[vece]; 3309 a1 = a2; 3310 a2 = args[3]; 3311 goto gen_simd; 3312#if TCG_TARGET_REG_BITS == 32 3313 case INDEX_op_dup2_vec: 3314 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3315 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3316 /* Then replicate the 64-bit elements across the rest of the vector. 
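           For TCG_TYPE_V64 the single 64-bit element already fills the
           vector, so the dup below is skipped.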
*/ 3317 if (type != TCG_TYPE_V64) { 3318 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3319 } 3320 break; 3321#endif 3322 case INDEX_op_abs_vec: 3323 insn = abs_insn[vece]; 3324 a2 = a1; 3325 a1 = 0; 3326 goto gen_simd; 3327 gen_simd: 3328 tcg_debug_assert(insn != OPC_UD2); 3329 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3330 break; 3331 3332 case INDEX_op_cmp_vec: 3333 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3334 break; 3335 3336 case INDEX_op_cmpsel_vec: 3337 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3338 args[3], args[4], args[5]); 3339 break; 3340 3341 case INDEX_op_andc_vec: 3342 insn = OPC_PANDN; 3343 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3344 break; 3345 3346 case INDEX_op_shli_vec: 3347 insn = shift_imm_insn[vece]; 3348 sub = 6; 3349 goto gen_shift; 3350 case INDEX_op_shri_vec: 3351 insn = shift_imm_insn[vece]; 3352 sub = 2; 3353 goto gen_shift; 3354 case INDEX_op_sari_vec: 3355 if (vece == MO_64) { 3356 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3357 } else { 3358 insn = shift_imm_insn[vece]; 3359 } 3360 sub = 4; 3361 goto gen_shift; 3362 case INDEX_op_rotli_vec: 3363 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3364 if (vece == MO_64) { 3365 insn |= P_VEXW; 3366 } 3367 sub = 1; 3368 goto gen_shift; 3369 gen_shift: 3370 tcg_debug_assert(vece != MO_8); 3371 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3372 tcg_out8(s, a2); 3373 break; 3374 3375 case INDEX_op_ld_vec: 3376 tcg_out_ld(s, type, a0, a1, a2); 3377 break; 3378 case INDEX_op_st_vec: 3379 tcg_out_st(s, type, a0, a1, a2); 3380 break; 3381 case INDEX_op_dupm_vec: 3382 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3383 break; 3384 3385 case INDEX_op_x86_shufps_vec: 3386 insn = OPC_SHUFPS; 3387 sub = args[3]; 3388 goto gen_simd_imm8; 3389 case INDEX_op_x86_blend_vec: 3390 if (vece == MO_16) { 3391 insn = OPC_PBLENDW; 3392 } else if (vece == MO_32) { 3393 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 3394 } else { 3395 g_assert_not_reached(); 3396 } 3397 sub = args[3]; 3398 goto gen_simd_imm8; 3399 case INDEX_op_x86_vperm2i128_vec: 3400 insn = OPC_VPERM2I128; 3401 sub = args[3]; 3402 goto gen_simd_imm8; 3403 case INDEX_op_x86_vpshldi_vec: 3404 insn = vpshldi_insn[vece]; 3405 sub = args[3]; 3406 goto gen_simd_imm8; 3407 3408 case INDEX_op_not_vec: 3409 insn = OPC_VPTERNLOGQ; 3410 a2 = a1; 3411 sub = 0x33; /* !B */ 3412 goto gen_simd_imm8; 3413 case INDEX_op_nor_vec: 3414 insn = OPC_VPTERNLOGQ; 3415 sub = 0x11; /* norCB */ 3416 goto gen_simd_imm8; 3417 case INDEX_op_nand_vec: 3418 insn = OPC_VPTERNLOGQ; 3419 sub = 0x77; /* nandCB */ 3420 goto gen_simd_imm8; 3421 case INDEX_op_eqv_vec: 3422 insn = OPC_VPTERNLOGQ; 3423 sub = 0x99; /* xnorCB */ 3424 goto gen_simd_imm8; 3425 case INDEX_op_orc_vec: 3426 insn = OPC_VPTERNLOGQ; 3427 sub = 0xdd; /* orB!C */ 3428 goto gen_simd_imm8; 3429 3430 case INDEX_op_bitsel_vec: 3431 insn = OPC_VPTERNLOGQ; 3432 a3 = args[3]; 3433 if (a0 == a1) { 3434 a1 = a2; 3435 a2 = a3; 3436 sub = 0xca; /* A?B:C */ 3437 } else if (a0 == a2) { 3438 a2 = a3; 3439 sub = 0xe2; /* B?A:C */ 3440 } else { 3441 tcg_out_mov(s, type, a0, a3); 3442 sub = 0xb8; /* B?C:A */ 3443 } 3444 goto gen_simd_imm8; 3445 3446 gen_simd_imm8: 3447 tcg_debug_assert(insn != OPC_UD2); 3448 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3449 tcg_out8(s, sub); 3450 break; 3451 3452 case INDEX_op_x86_psrldq_vec: 3453 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3454 tcg_out8(s, a2); 3455 break; 3456 3457 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. 
*/ 3458 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3459 default: 3460 g_assert_not_reached(); 3461 } 3462} 3463 3464static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3465{ 3466 switch (op) { 3467 case INDEX_op_goto_ptr: 3468 return C_O0_I1(r); 3469 3470 case INDEX_op_ld8u_i32: 3471 case INDEX_op_ld8u_i64: 3472 case INDEX_op_ld8s_i32: 3473 case INDEX_op_ld8s_i64: 3474 case INDEX_op_ld16u_i32: 3475 case INDEX_op_ld16u_i64: 3476 case INDEX_op_ld16s_i32: 3477 case INDEX_op_ld16s_i64: 3478 case INDEX_op_ld_i32: 3479 case INDEX_op_ld32u_i64: 3480 case INDEX_op_ld32s_i64: 3481 case INDEX_op_ld_i64: 3482 return C_O1_I1(r, r); 3483 3484 case INDEX_op_st8_i32: 3485 case INDEX_op_st8_i64: 3486 return C_O0_I2(qi, r); 3487 3488 case INDEX_op_st16_i32: 3489 case INDEX_op_st16_i64: 3490 case INDEX_op_st_i32: 3491 case INDEX_op_st32_i64: 3492 return C_O0_I2(ri, r); 3493 3494 case INDEX_op_st_i64: 3495 return C_O0_I2(re, r); 3496 3497 case INDEX_op_add_i32: 3498 case INDEX_op_add_i64: 3499 return C_O1_I2(r, r, re); 3500 3501 case INDEX_op_sub_i32: 3502 case INDEX_op_sub_i64: 3503 case INDEX_op_mul_i32: 3504 case INDEX_op_mul_i64: 3505 case INDEX_op_or_i32: 3506 case INDEX_op_or_i64: 3507 case INDEX_op_xor_i32: 3508 case INDEX_op_xor_i64: 3509 return C_O1_I2(r, 0, re); 3510 3511 case INDEX_op_and_i32: 3512 case INDEX_op_and_i64: 3513 return C_O1_I2(r, 0, reZ); 3514 3515 case INDEX_op_andc_i32: 3516 case INDEX_op_andc_i64: 3517 return C_O1_I2(r, r, rI); 3518 3519 case INDEX_op_shl_i32: 3520 case INDEX_op_shl_i64: 3521 case INDEX_op_shr_i32: 3522 case INDEX_op_shr_i64: 3523 case INDEX_op_sar_i32: 3524 case INDEX_op_sar_i64: 3525 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3526 3527 case INDEX_op_rotl_i32: 3528 case INDEX_op_rotl_i64: 3529 case INDEX_op_rotr_i32: 3530 case INDEX_op_rotr_i64: 3531 return C_O1_I2(r, 0, ci); 3532 3533 case INDEX_op_brcond_i32: 3534 case INDEX_op_brcond_i64: 3535 return C_O0_I2(r, reT); 3536 3537 case INDEX_op_bswap16_i32: 3538 case INDEX_op_bswap16_i64: 3539 case INDEX_op_bswap32_i32: 3540 case INDEX_op_bswap32_i64: 3541 case INDEX_op_bswap64_i64: 3542 case INDEX_op_neg_i32: 3543 case INDEX_op_neg_i64: 3544 case INDEX_op_not_i32: 3545 case INDEX_op_not_i64: 3546 case INDEX_op_extrh_i64_i32: 3547 return C_O1_I1(r, 0); 3548 3549 case INDEX_op_ext8s_i32: 3550 case INDEX_op_ext8s_i64: 3551 case INDEX_op_ext8u_i32: 3552 case INDEX_op_ext8u_i64: 3553 return C_O1_I1(r, q); 3554 3555 case INDEX_op_ext16s_i32: 3556 case INDEX_op_ext16s_i64: 3557 case INDEX_op_ext16u_i32: 3558 case INDEX_op_ext16u_i64: 3559 case INDEX_op_ext32s_i64: 3560 case INDEX_op_ext32u_i64: 3561 case INDEX_op_ext_i32_i64: 3562 case INDEX_op_extu_i32_i64: 3563 case INDEX_op_extrl_i64_i32: 3564 case INDEX_op_extract_i32: 3565 case INDEX_op_extract_i64: 3566 case INDEX_op_sextract_i32: 3567 case INDEX_op_ctpop_i32: 3568 case INDEX_op_ctpop_i64: 3569 return C_O1_I1(r, r); 3570 3571 case INDEX_op_extract2_i32: 3572 case INDEX_op_extract2_i64: 3573 return C_O1_I2(r, 0, r); 3574 3575 case INDEX_op_deposit_i32: 3576 case INDEX_op_deposit_i64: 3577 return C_O1_I2(q, 0, qi); 3578 3579 case INDEX_op_setcond_i32: 3580 case INDEX_op_setcond_i64: 3581 case INDEX_op_negsetcond_i32: 3582 case INDEX_op_negsetcond_i64: 3583 return C_O1_I2(q, r, reT); 3584 3585 case INDEX_op_movcond_i32: 3586 case INDEX_op_movcond_i64: 3587 return C_O1_I4(r, r, reT, r, 0); 3588 3589 case INDEX_op_div2_i32: 3590 case INDEX_op_div2_i64: 3591 case INDEX_op_divu2_i32: 3592 case INDEX_op_divu2_i64: 3593 return 
C_O2_I3(a, d, 0, 1, r); 3594 3595 case INDEX_op_mulu2_i32: 3596 case INDEX_op_mulu2_i64: 3597 case INDEX_op_muls2_i32: 3598 case INDEX_op_muls2_i64: 3599 return C_O2_I2(a, d, a, r); 3600 3601 case INDEX_op_add2_i32: 3602 case INDEX_op_add2_i64: 3603 case INDEX_op_sub2_i32: 3604 case INDEX_op_sub2_i64: 3605 return C_N1_O1_I4(r, r, 0, 1, re, re); 3606 3607 case INDEX_op_ctz_i32: 3608 case INDEX_op_ctz_i64: 3609 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3610 3611 case INDEX_op_clz_i32: 3612 case INDEX_op_clz_i64: 3613 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3614 3615 case INDEX_op_qemu_ld_a32_i32: 3616 return C_O1_I1(r, L); 3617 case INDEX_op_qemu_ld_a64_i32: 3618 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L); 3619 3620 case INDEX_op_qemu_st_a32_i32: 3621 return C_O0_I2(L, L); 3622 case INDEX_op_qemu_st_a64_i32: 3623 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3624 case INDEX_op_qemu_st8_a32_i32: 3625 return C_O0_I2(s, L); 3626 case INDEX_op_qemu_st8_a64_i32: 3627 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); 3628 3629 case INDEX_op_qemu_ld_a32_i64: 3630 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3631 case INDEX_op_qemu_ld_a64_i64: 3632 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); 3633 3634 case INDEX_op_qemu_st_a32_i64: 3635 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3636 case INDEX_op_qemu_st_a64_i64: 3637 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); 3638 3639 case INDEX_op_qemu_ld_a32_i128: 3640 case INDEX_op_qemu_ld_a64_i128: 3641 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3642 return C_O2_I1(r, r, L); 3643 case INDEX_op_qemu_st_a32_i128: 3644 case INDEX_op_qemu_st_a64_i128: 3645 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3646 return C_O0_I3(L, L, L); 3647 3648 case INDEX_op_brcond2_i32: 3649 return C_O0_I4(r, r, ri, ri); 3650 3651 case INDEX_op_setcond2_i32: 3652 return C_O1_I4(r, r, r, ri, ri); 3653 3654 case INDEX_op_ld_vec: 3655 case INDEX_op_dupm_vec: 3656 return C_O1_I1(x, r); 3657 3658 case INDEX_op_st_vec: 3659 return C_O0_I2(x, r); 3660 3661 case INDEX_op_add_vec: 3662 case INDEX_op_sub_vec: 3663 case INDEX_op_mul_vec: 3664 case INDEX_op_and_vec: 3665 case INDEX_op_or_vec: 3666 case INDEX_op_xor_vec: 3667 case INDEX_op_andc_vec: 3668 case INDEX_op_orc_vec: 3669 case INDEX_op_nand_vec: 3670 case INDEX_op_nor_vec: 3671 case INDEX_op_eqv_vec: 3672 case INDEX_op_ssadd_vec: 3673 case INDEX_op_usadd_vec: 3674 case INDEX_op_sssub_vec: 3675 case INDEX_op_ussub_vec: 3676 case INDEX_op_smin_vec: 3677 case INDEX_op_umin_vec: 3678 case INDEX_op_smax_vec: 3679 case INDEX_op_umax_vec: 3680 case INDEX_op_shlv_vec: 3681 case INDEX_op_shrv_vec: 3682 case INDEX_op_sarv_vec: 3683 case INDEX_op_rotlv_vec: 3684 case INDEX_op_rotrv_vec: 3685 case INDEX_op_shls_vec: 3686 case INDEX_op_shrs_vec: 3687 case INDEX_op_sars_vec: 3688 case INDEX_op_cmp_vec: 3689 case INDEX_op_x86_shufps_vec: 3690 case INDEX_op_x86_blend_vec: 3691 case INDEX_op_x86_packss_vec: 3692 case INDEX_op_x86_packus_vec: 3693 case INDEX_op_x86_vperm2i128_vec: 3694 case INDEX_op_x86_punpckl_vec: 3695 case INDEX_op_x86_punpckh_vec: 3696 case INDEX_op_x86_vpshldi_vec: 3697#if TCG_TARGET_REG_BITS == 32 3698 case INDEX_op_dup2_vec: 3699#endif 3700 return C_O1_I2(x, x, x); 3701 3702 case INDEX_op_abs_vec: 3703 case INDEX_op_dup_vec: 3704 case INDEX_op_not_vec: 3705 case INDEX_op_shli_vec: 3706 case INDEX_op_shri_vec: 3707 case 
INDEX_op_sari_vec: 3708 case INDEX_op_rotli_vec: 3709 case INDEX_op_x86_psrldq_vec: 3710 return C_O1_I1(x, x); 3711 3712 case INDEX_op_x86_vpshldv_vec: 3713 case INDEX_op_x86_vpshrdv_vec: 3714 return C_O1_I3(x, 0, x, x); 3715 3716 case INDEX_op_bitsel_vec: 3717 return C_O1_I3(x, x, x, x); 3718 case INDEX_op_cmpsel_vec: 3719 return C_O1_I4(x, x, x, x, x); 3720 3721 default: 3722 g_assert_not_reached(); 3723 } 3724} 3725 3726int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3727{ 3728 switch (opc) { 3729 case INDEX_op_add_vec: 3730 case INDEX_op_sub_vec: 3731 case INDEX_op_and_vec: 3732 case INDEX_op_or_vec: 3733 case INDEX_op_xor_vec: 3734 case INDEX_op_andc_vec: 3735 case INDEX_op_orc_vec: 3736 case INDEX_op_nand_vec: 3737 case INDEX_op_nor_vec: 3738 case INDEX_op_eqv_vec: 3739 case INDEX_op_not_vec: 3740 case INDEX_op_bitsel_vec: 3741 return 1; 3742 case INDEX_op_cmp_vec: 3743 case INDEX_op_cmpsel_vec: 3744 return -1; 3745 3746 case INDEX_op_rotli_vec: 3747 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3748 3749 case INDEX_op_shli_vec: 3750 case INDEX_op_shri_vec: 3751 /* We must expand the operation for MO_8. */ 3752 return vece == MO_8 ? -1 : 1; 3753 3754 case INDEX_op_sari_vec: 3755 switch (vece) { 3756 case MO_8: 3757 return -1; 3758 case MO_16: 3759 case MO_32: 3760 return 1; 3761 case MO_64: 3762 if (have_avx512vl) { 3763 return 1; 3764 } 3765 /* 3766 * We can emulate this for MO_64, but it does not pay off 3767 * unless we're producing at least 4 values. 3768 */ 3769 return type >= TCG_TYPE_V256 ? -1 : 0; 3770 } 3771 return 0; 3772 3773 case INDEX_op_shls_vec: 3774 case INDEX_op_shrs_vec: 3775 return vece >= MO_16; 3776 case INDEX_op_sars_vec: 3777 switch (vece) { 3778 case MO_16: 3779 case MO_32: 3780 return 1; 3781 case MO_64: 3782 return have_avx512vl; 3783 } 3784 return 0; 3785 case INDEX_op_rotls_vec: 3786 return vece >= MO_16 ? -1 : 0; 3787 3788 case INDEX_op_shlv_vec: 3789 case INDEX_op_shrv_vec: 3790 switch (vece) { 3791 case MO_16: 3792 return have_avx512bw; 3793 case MO_32: 3794 case MO_64: 3795 return have_avx2; 3796 } 3797 return 0; 3798 case INDEX_op_sarv_vec: 3799 switch (vece) { 3800 case MO_16: 3801 return have_avx512bw; 3802 case MO_32: 3803 return have_avx2; 3804 case MO_64: 3805 return have_avx512vl; 3806 } 3807 return 0; 3808 case INDEX_op_rotlv_vec: 3809 case INDEX_op_rotrv_vec: 3810 switch (vece) { 3811 case MO_16: 3812 return have_avx512vbmi2 ? -1 : 0; 3813 case MO_32: 3814 case MO_64: 3815 return have_avx512vl ? 1 : have_avx2 ? 
-1 : 0; 3816 } 3817 return 0; 3818 3819 case INDEX_op_mul_vec: 3820 switch (vece) { 3821 case MO_8: 3822 return -1; 3823 case MO_64: 3824 return have_avx512dq; 3825 } 3826 return 1; 3827 3828 case INDEX_op_ssadd_vec: 3829 case INDEX_op_usadd_vec: 3830 case INDEX_op_sssub_vec: 3831 case INDEX_op_ussub_vec: 3832 return vece <= MO_16; 3833 case INDEX_op_smin_vec: 3834 case INDEX_op_smax_vec: 3835 case INDEX_op_umin_vec: 3836 case INDEX_op_umax_vec: 3837 case INDEX_op_abs_vec: 3838 return vece <= MO_32 || have_avx512vl; 3839 3840 default: 3841 return 0; 3842 } 3843} 3844 3845static void expand_vec_shi(TCGType type, unsigned vece, bool right, 3846 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3847{ 3848 uint8_t mask; 3849 3850 tcg_debug_assert(vece == MO_8); 3851 if (right) { 3852 mask = 0xff >> imm; 3853 tcg_gen_shri_vec(MO_16, v0, v1, imm); 3854 } else { 3855 mask = 0xff << imm; 3856 tcg_gen_shli_vec(MO_16, v0, v1, imm); 3857 } 3858 tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask)); 3859} 3860 3861static void expand_vec_sari(TCGType type, unsigned vece, 3862 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3863{ 3864 TCGv_vec t1, t2; 3865 3866 switch (vece) { 3867 case MO_8: 3868 /* Unpack to 16-bit, shift, and repack. */ 3869 t1 = tcg_temp_new_vec(type); 3870 t2 = tcg_temp_new_vec(type); 3871 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3872 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3873 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3874 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3875 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3876 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3877 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3878 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3879 tcg_temp_free_vec(t1); 3880 tcg_temp_free_vec(t2); 3881 break; 3882 3883 case MO_64: 3884 t1 = tcg_temp_new_vec(type); 3885 if (imm <= 32) { 3886 /* 3887 * We can emulate a small sign extend by performing an arithmetic 3888 * 32-bit shift and overwriting the high half of a 64-bit logical 3889 * shift. Note that the ISA says shift of 32 is valid, but TCG 3890 * does not, so we have to bound the smaller shift -- we get the 3891 * same result in the high half either way. 3892 */ 3893 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3894 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3895 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3896 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3897 tcgv_vec_arg(t1), 0xaa); 3898 } else { 3899 /* Otherwise we will need to use a compare vs 0 to produce 3900 * the sign-extend, shift and merge. 
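             * i.e. t1 = v1 < 0 ? -1 : 0, then
             * v0 = (v1 >> imm) | (t1 << (64 - imm)) with logical shifts.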

static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to 16-bit, shift, and repack. */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        t1 = tcg_temp_new_vec(type);
        if (imm <= 32) {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
        } else {
            /*
             * Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
        }
        tcg_temp_free_vec(t1);
        break;

    default:
        g_assert_not_reached();
    }
}

static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t;

    if (vece != MO_8 && have_avx512vbmi2) {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_shli_vec(vece, t, v1, imm);
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}
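
/*
 * VPSHLDI shifts the double-width concatenation of its two source
 * operands, so passing v1 as both sources extracts a left rotate in a
 * single instruction.  The fallback uses the usual identity for a
 * nonzero rotate count on elements of w == 8 << vece bits:
 * rol(x, imm) == (x << imm) | (x >> (w - imm)).
 */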

static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec t;

    if (have_avx512vbmi2) {
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, t, 8 << vece);
    tcg_gen_sub_vec(vece, t, t, sh);
    if (right) {
        tcg_gen_shlv_vec(vece, t, v1, t);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shrv_vec(vece, t, v1, t);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, t, lsh);
        if (vece >= MO_32) {
            tcg_gen_rotlv_vec(vece, v0, v1, t);
        } else {
            expand_vec_rotv(type, vece, v0, v1, t, false);
        }
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
        tcg_gen_shls_vec(vece, t, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, t);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(t);
}

static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits of each
     * word before using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        /* The zero constant must match the type of the operation. */
        zero = tcg_constant_vec(type, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}
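
/*
 * Worked example for one 16-bit lane of the expansion above: after the
 * interleaves, the lane of t1 holds 0x00xx and the lane of t2 holds
 * 0xyy00, so their 16-bit product is (x * y) << 8 and the low 8 bits of
 * the byte product end up in the high half of the lane.  The shift right
 * by 8 moves them down and zeroes the high half, so the unsigned
 * saturating pack copies the byte through unchanged.
 */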
4098{ 4099 va_list va; 4100 TCGArg a1, a2, a3, a4, a5; 4101 TCGv_vec v0, v1, v2; 4102 4103 va_start(va, a0); 4104 a1 = va_arg(va, TCGArg); 4105 a2 = va_arg(va, TCGArg); 4106 v0 = temp_tcgv_vec(arg_temp(a0)); 4107 v1 = temp_tcgv_vec(arg_temp(a1)); 4108 4109 switch (opc) { 4110 case INDEX_op_shli_vec: 4111 expand_vec_shi(type, vece, false, v0, v1, a2); 4112 break; 4113 case INDEX_op_shri_vec: 4114 expand_vec_shi(type, vece, true, v0, v1, a2); 4115 break; 4116 case INDEX_op_sari_vec: 4117 expand_vec_sari(type, vece, v0, v1, a2); 4118 break; 4119 4120 case INDEX_op_rotli_vec: 4121 expand_vec_rotli(type, vece, v0, v1, a2); 4122 break; 4123 4124 case INDEX_op_rotls_vec: 4125 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 4126 break; 4127 4128 case INDEX_op_rotlv_vec: 4129 v2 = temp_tcgv_vec(arg_temp(a2)); 4130 expand_vec_rotv(type, vece, v0, v1, v2, false); 4131 break; 4132 case INDEX_op_rotrv_vec: 4133 v2 = temp_tcgv_vec(arg_temp(a2)); 4134 expand_vec_rotv(type, vece, v0, v1, v2, true); 4135 break; 4136 4137 case INDEX_op_mul_vec: 4138 v2 = temp_tcgv_vec(arg_temp(a2)); 4139 expand_vec_mul(type, vece, v0, v1, v2); 4140 break; 4141 4142 case INDEX_op_cmp_vec: 4143 a3 = va_arg(va, TCGArg); 4144 expand_vec_cmp(type, vece, a0, a1, a2, a3); 4145 break; 4146 4147 case INDEX_op_cmpsel_vec: 4148 a3 = va_arg(va, TCGArg); 4149 a4 = va_arg(va, TCGArg); 4150 a5 = va_arg(va, TCGArg); 4151 expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5); 4152 break; 4153 4154 default: 4155 break; 4156 } 4157 4158 va_end(va); 4159} 4160 4161static const int tcg_target_callee_save_regs[] = { 4162#if TCG_TARGET_REG_BITS == 64 4163 TCG_REG_RBP, 4164 TCG_REG_RBX, 4165#if defined(_WIN64) 4166 TCG_REG_RDI, 4167 TCG_REG_RSI, 4168#endif 4169 TCG_REG_R12, 4170 TCG_REG_R13, 4171 TCG_REG_R14, /* Currently used for the global env. */ 4172 TCG_REG_R15, 4173#else 4174 TCG_REG_EBP, /* Currently used for the global env. */ 4175 TCG_REG_EBX, 4176 TCG_REG_ESI, 4177 TCG_REG_EDI, 4178#endif 4179}; 4180 4181/* Compute frame size via macros, to share between tcg_target_qemu_prologue 4182 and tcg_register_jit. */ 4183 4184#define PUSH_SIZE \ 4185 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 4186 * (TCG_TARGET_REG_BITS / 8)) 4187 4188#define FRAME_SIZE \ 4189 ((PUSH_SIZE \ 4190 + TCG_STATIC_CALL_ARGS_SIZE \ 4191 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 4192 + TCG_TARGET_STACK_ALIGN - 1) \ 4193 & ~(TCG_TARGET_STACK_ALIGN - 1)) 4194 4195/* Generate global QEMU prologue and epilogue code */ 4196static void tcg_target_qemu_prologue(TCGContext *s) 4197{ 4198 int i, stack_addend; 4199 4200 /* TB prologue */ 4201 4202 /* Reserve some stack space, also for TCG temps. */ 4203 stack_addend = FRAME_SIZE - PUSH_SIZE; 4204 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 4205 CPU_TEMP_BUF_NLONGS * sizeof(long)); 4206 4207 /* Save all callee saved registers. */ 4208 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 4209 tcg_out_push(s, tcg_target_callee_save_regs[i]); 4210 } 4211 4212 if (!tcg_use_softmmu && guest_base) { 4213 int seg = setup_guest_base_seg(); 4214 if (seg != 0) { 4215 x86_guest_base.seg = seg; 4216 } else if (guest_base == (int32_t)guest_base) { 4217 x86_guest_base.ofs = guest_base; 4218 } else { 4219 assert(TCG_TARGET_REG_BITS == 64); 4220 /* Choose R12 because, as a base, it requires a SIB byte. 

static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
                           TCGArg a1, TCGArg a2, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
                              TCGArg a1, TCGArg a2,
                              TCGArg a3, TCGArg a4, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse. */
    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2, a3, a4, a5;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    a1 = va_arg(va, TCGArg);
    a2 = va_arg(va, TCGArg);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(a1));

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        a3 = va_arg(va, TCGArg);
        expand_vec_cmp(type, vece, a0, a1, a2, a3);
        break;

    case INDEX_op_cmpsel_vec:
        a3 = va_arg(va, TCGArg);
        a4 = va_arg(va, TCGArg);
        a5 = va_arg(va, TCGArg);
        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
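
/*
 * The extra element counted by PUSH_SIZE is the return address that is
 * already on the stack when the prologue is entered.  FRAME_SIZE then
 * adds the outgoing stack-argument area and the TCG temporary buffer and
 * rounds the total up to TCG_TARGET_STACK_ALIGN with the usual
 * (x + align - 1) & ~(align - 1) idiom.
 */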

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif