/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

#define TCG_TMP_VEC  TCG_REG_XMM5

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ?
TCG_REG_EDX : TCG_REG_EAX; 120#ifdef _WIN64 121 case TCG_CALL_RET_BY_VEC: 122 tcg_debug_assert(slot == 0); 123 return TCG_REG_XMM0; 124#endif 125 default: 126 g_assert_not_reached(); 127 } 128} 129 130/* Constants we accept. */ 131#define TCG_CT_CONST_S32 0x100 132#define TCG_CT_CONST_U32 0x200 133#define TCG_CT_CONST_I32 0x400 134#define TCG_CT_CONST_WSZ 0x800 135#define TCG_CT_CONST_TST 0x1000 136 137/* Registers used with L constraint, which are the first argument 138 registers on x86_64, and two random call clobbered registers on 139 i386. */ 140#if TCG_TARGET_REG_BITS == 64 141# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 142# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 143#else 144# define TCG_REG_L0 TCG_REG_EAX 145# define TCG_REG_L1 TCG_REG_EDX 146#endif 147 148#if TCG_TARGET_REG_BITS == 64 149# define ALL_GENERAL_REGS 0x0000ffffu 150# define ALL_VECTOR_REGS 0xffff0000u 151# define ALL_BYTEL_REGS ALL_GENERAL_REGS 152#else 153# define ALL_GENERAL_REGS 0x000000ffu 154# define ALL_VECTOR_REGS 0x00ff0000u 155# define ALL_BYTEL_REGS 0x0000000fu 156#endif 157#define SOFTMMU_RESERVE_REGS \ 158 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 159 160/* For 64-bit, we always know that CMOV is available. */ 161#if TCG_TARGET_REG_BITS == 64 162# define have_cmov true 163#else 164# define have_cmov (cpuinfo & CPUINFO_CMOV) 165#endif 166#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 167#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 168 169static const tcg_insn_unit *tb_ret_addr; 170 171static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 172 intptr_t value, intptr_t addend) 173{ 174 value += addend; 175 switch(type) { 176 case R_386_PC32: 177 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 178 if (value != (int32_t)value) { 179 return false; 180 } 181 /* FALLTHRU */ 182 case R_386_32: 183 tcg_patch32(code_ptr, value); 184 break; 185 case R_386_PC8: 186 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 187 if (value != (int8_t)value) { 188 return false; 189 } 190 tcg_patch8(code_ptr, value); 191 break; 192 default: 193 g_assert_not_reached(); 194 } 195 return true; 196} 197 198/* test if a constant matches the constraint */ 199static bool tcg_target_const_match(int64_t val, int ct, 200 TCGType type, TCGCond cond, int vece) 201{ 202 if (ct & TCG_CT_CONST) { 203 return 1; 204 } 205 if (type == TCG_TYPE_I32) { 206 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 207 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 208 return 1; 209 } 210 } else { 211 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 212 return 1; 213 } 214 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 215 return 1; 216 } 217 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 218 return 1; 219 } 220 /* 221 * This will be used in combination with TCG_CT_CONST_S32, 222 * so "normal" TESTQ is already matched. Also accept: 223 * TESTQ -> TESTL (uint32_t) 224 * TESTQ -> BT (is_power_of_2) 225 */ 226 if ((ct & TCG_CT_CONST_TST) 227 && is_tst_cond(cond) 228 && (val == (uint32_t)val || is_power_of_2(val))) { 229 return 1; 230 } 231 } 232 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 
32 : 64)) { 233 return 1; 234 } 235 return 0; 236} 237 238# define LOWREGMASK(x) ((x) & 7) 239 240#define P_EXT 0x100 /* 0x0f opcode prefix */ 241#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 242#define P_DATA16 0x400 /* 0x66 opcode prefix */ 243#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 244#if TCG_TARGET_REG_BITS == 64 245# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 246# define P_REXB_R 0x2000 /* REG field as byte register */ 247# define P_REXB_RM 0x4000 /* R/M field as byte register */ 248# define P_GS 0x8000 /* gs segment override */ 249#else 250# define P_REXW 0 251# define P_REXB_R 0 252# define P_REXB_RM 0 253# define P_GS 0 254#endif 255#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 256#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 257#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 258#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 259#define P_EVEX 0x100000 /* Requires EVEX encoding */ 260 261#define OPC_ARITH_EbIb (0x80) 262#define OPC_ARITH_EvIz (0x81) 263#define OPC_ARITH_EvIb (0x83) 264#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 265#define OPC_ANDN (0xf2 | P_EXT38) 266#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 267#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 268#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 269#define OPC_BSF (0xbc | P_EXT) 270#define OPC_BSR (0xbd | P_EXT) 271#define OPC_BSWAP (0xc8 | P_EXT) 272#define OPC_CALL_Jz (0xe8) 273#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 274#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 275#define OPC_DEC_r32 (0x48) 276#define OPC_IMUL_GvEv (0xaf | P_EXT) 277#define OPC_IMUL_GvEvIb (0x6b) 278#define OPC_IMUL_GvEvIz (0x69) 279#define OPC_INC_r32 (0x40) 280#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 281#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 282#define OPC_JMP_long (0xe9) 283#define OPC_JMP_short (0xeb) 284#define OPC_LEA (0x8d) 285#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 286#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 287#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 288#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 289#define OPC_MOVB_EvIz (0xc6) 290#define OPC_MOVL_EvIz (0xc7) 291#define OPC_MOVB_Ib (0xb0) 292#define OPC_MOVL_Iv (0xb8) 293#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 294#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 295#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 296#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 297#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 298#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 299#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 300#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 301#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 302#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 303#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 304#define OPC_MOVSBL (0xbe | P_EXT) 305#define OPC_MOVSWL (0xbf | P_EXT) 306#define OPC_MOVSLQ (0x63 | P_REXW) 307#define OPC_MOVZBL (0xb6 | P_EXT) 308#define OPC_MOVZWL (0xb7 | P_EXT) 309#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 310#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 311#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 312#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 313#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 314#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 315#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 316#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 317#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 318#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 319#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 320#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 321#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 322#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 323#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 324#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 325#define OPC_PAND (0xdb | P_EXT | P_DATA16) 326#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 327#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 328#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 329#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 330#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 331#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 332#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 333#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 334#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 335#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 336#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 337#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 338#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 339#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 340#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 341#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 342#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 343#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 344#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 345#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 346#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 347#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 348#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 349#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 350#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 351#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 352#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 353#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 354#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
355#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 356#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 357#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 358#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 359#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 360#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 361#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 362#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 363#define OPC_POR (0xeb | P_EXT | P_DATA16) 364#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 365#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 366#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 367#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 368#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 369#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 370#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 371#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 372#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 373#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 374#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 375#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 376#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 377#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 378#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 379#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 380#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 381#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 382#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 383#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 384#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 385#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 386#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 387#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 388#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 389#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 390#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 391#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 392#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 393#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 394#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 395#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 396#define OPC_PXOR (0xef | P_EXT | P_DATA16) 397#define OPC_POP_r32 (0x58) 398#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 399#define OPC_PUSH_r32 (0x50) 400#define OPC_PUSH_Iv (0x68) 401#define OPC_PUSH_Ib (0x6a) 402#define OPC_RET (0xc3) 403#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 404#define OPC_SHIFT_1 (0xd1) 405#define OPC_SHIFT_Ib (0xc1) 406#define OPC_SHIFT_cl (0xd3) 407#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 408#define OPC_SHUFPS (0xc6 | P_EXT) 409#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 410#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 411#define OPC_SHRD_Ib (0xac | P_EXT) 412#define OPC_TESTB (0x84) 413#define OPC_TESTL (0x85) 414#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 415#define OPC_UD2 (0x0b | P_EXT) 416#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 417#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 418#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 419#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 420#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 421#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 422#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 423#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 424#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 425#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 426#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 427#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 428#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 429#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 430#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 431#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 432#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 433#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 434#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 435#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 436#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 437#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 438#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 439#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 440#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 442#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 443#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 444#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 445#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 446#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 447#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 448#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 449#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW) 450#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 451#define OPC_VZEROUPPER (0x77 | P_EXT) 452#define OPC_XCHG_ax_r32 (0x90) 453#define OPC_XCHG_EvGv (0x87) 454 455#define OPC_GRP3_Eb (0xf6) 456#define OPC_GRP3_Ev (0xf7) 457#define OPC_GRP5 (0xff) 458#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 459#define OPC_GRPBT (0xba | P_EXT) 460 461#define OPC_GRPBT_BT 4 462#define OPC_GRPBT_BTS 5 463#define OPC_GRPBT_BTR 6 464#define OPC_GRPBT_BTC 7 465 466/* Group 1 opcode extensions for 0x80-0x83. 467 These are also used as modifiers for OPC_ARITH. */ 468#define ARITH_ADD 0 469#define ARITH_OR 1 470#define ARITH_ADC 2 471#define ARITH_SBB 3 472#define ARITH_AND 4 473#define ARITH_SUB 5 474#define ARITH_XOR 6 475#define ARITH_CMP 7 476 477/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 478#define SHIFT_ROL 0 479#define SHIFT_ROR 1 480#define SHIFT_SHL 4 481#define SHIFT_SHR 5 482#define SHIFT_SAR 7 483 484/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. 
*/ 485#define EXT3_TESTi 0 486#define EXT3_NOT 2 487#define EXT3_NEG 3 488#define EXT3_MUL 4 489#define EXT3_IMUL 5 490#define EXT3_DIV 6 491#define EXT3_IDIV 7 492 493/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 494#define EXT5_INC_Ev 0 495#define EXT5_DEC_Ev 1 496#define EXT5_CALLN_Ev 2 497#define EXT5_JMPN_Ev 4 498 499/* Condition codes to be added to OPC_JCC_{long,short}. */ 500#define JCC_JMP (-1) 501#define JCC_JO 0x0 502#define JCC_JNO 0x1 503#define JCC_JB 0x2 504#define JCC_JAE 0x3 505#define JCC_JE 0x4 506#define JCC_JNE 0x5 507#define JCC_JBE 0x6 508#define JCC_JA 0x7 509#define JCC_JS 0x8 510#define JCC_JNS 0x9 511#define JCC_JP 0xa 512#define JCC_JNP 0xb 513#define JCC_JL 0xc 514#define JCC_JGE 0xd 515#define JCC_JLE 0xe 516#define JCC_JG 0xf 517 518static const uint8_t tcg_cond_to_jcc[] = { 519 [TCG_COND_EQ] = JCC_JE, 520 [TCG_COND_NE] = JCC_JNE, 521 [TCG_COND_LT] = JCC_JL, 522 [TCG_COND_GE] = JCC_JGE, 523 [TCG_COND_LE] = JCC_JLE, 524 [TCG_COND_GT] = JCC_JG, 525 [TCG_COND_LTU] = JCC_JB, 526 [TCG_COND_GEU] = JCC_JAE, 527 [TCG_COND_LEU] = JCC_JBE, 528 [TCG_COND_GTU] = JCC_JA, 529 [TCG_COND_TSTEQ] = JCC_JE, 530 [TCG_COND_TSTNE] = JCC_JNE, 531}; 532 533#if TCG_TARGET_REG_BITS == 64 534static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 535{ 536 int rex; 537 538 if (opc & P_GS) { 539 tcg_out8(s, 0x65); 540 } 541 if (opc & P_DATA16) { 542 /* We should never be asking for both 16 and 64-bit operation. */ 543 tcg_debug_assert((opc & P_REXW) == 0); 544 tcg_out8(s, 0x66); 545 } 546 if (opc & P_SIMDF3) { 547 tcg_out8(s, 0xf3); 548 } else if (opc & P_SIMDF2) { 549 tcg_out8(s, 0xf2); 550 } 551 552 rex = 0; 553 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 554 rex |= (r & 8) >> 1; /* REX.R */ 555 rex |= (x & 8) >> 2; /* REX.X */ 556 rex |= (rm & 8) >> 3; /* REX.B */ 557 558 /* P_REXB_{R,RM} indicates that the given register is the low byte. 559 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 560 as otherwise the encoding indicates %[abcd]h. Note that the values 561 that are ORed in merely indicate that the REX byte must be present; 562 those bits get discarded in output. */ 563 rex |= opc & (r >= 4 ? P_REXB_R : 0); 564 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 565 566 if (rex) { 567 tcg_out8(s, (uint8_t)(rex | 0x40)); 568 } 569 570 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 571 tcg_out8(s, 0x0f); 572 if (opc & P_EXT38) { 573 tcg_out8(s, 0x38); 574 } else if (opc & P_EXT3A) { 575 tcg_out8(s, 0x3a); 576 } 577 } 578 579 tcg_out8(s, opc); 580} 581#else 582static void tcg_out_opc(TCGContext *s, int opc) 583{ 584 if (opc & P_DATA16) { 585 tcg_out8(s, 0x66); 586 } 587 if (opc & P_SIMDF3) { 588 tcg_out8(s, 0xf3); 589 } else if (opc & P_SIMDF2) { 590 tcg_out8(s, 0xf2); 591 } 592 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 593 tcg_out8(s, 0x0f); 594 if (opc & P_EXT38) { 595 tcg_out8(s, 0x38); 596 } else if (opc & P_EXT3A) { 597 tcg_out8(s, 0x3a); 598 } 599 } 600 tcg_out8(s, opc); 601} 602/* Discard the register arguments to tcg_out_opc early, so as not to penalize 603 the 32-bit compilation paths. This method works with all versions of gcc, 604 whereas relying on optimization may not be able to exclude them. 
*/ 605#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 606#endif 607 608static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 609{ 610 tcg_out_opc(s, opc, r, rm, 0); 611 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 612} 613 614static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 615 int rm, int index) 616{ 617 int tmp; 618 619 if (opc & P_GS) { 620 tcg_out8(s, 0x65); 621 } 622 /* Use the two byte form if possible, which cannot encode 623 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 624 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 625 && ((rm | index) & 8) == 0) { 626 /* Two byte VEX prefix. */ 627 tcg_out8(s, 0xc5); 628 629 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 630 } else { 631 /* Three byte VEX prefix. */ 632 tcg_out8(s, 0xc4); 633 634 /* VEX.m-mmmm */ 635 if (opc & P_EXT3A) { 636 tmp = 3; 637 } else if (opc & P_EXT38) { 638 tmp = 2; 639 } else if (opc & P_EXT) { 640 tmp = 1; 641 } else { 642 g_assert_not_reached(); 643 } 644 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 645 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 646 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 647 tcg_out8(s, tmp); 648 649 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 650 } 651 652 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 653 /* VEX.pp */ 654 if (opc & P_DATA16) { 655 tmp |= 1; /* 0x66 */ 656 } else if (opc & P_SIMDF3) { 657 tmp |= 2; /* 0xf3 */ 658 } else if (opc & P_SIMDF2) { 659 tmp |= 3; /* 0xf2 */ 660 } 661 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 662 tcg_out8(s, tmp); 663 tcg_out8(s, opc); 664} 665 666static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 667 int rm, int index) 668{ 669 /* The entire 4-byte evex prefix; with R' and V' set. */ 670 uint32_t p = 0x08041062; 671 int mm, pp; 672 673 tcg_debug_assert(have_avx512vl); 674 675 /* EVEX.mm */ 676 if (opc & P_EXT3A) { 677 mm = 3; 678 } else if (opc & P_EXT38) { 679 mm = 2; 680 } else if (opc & P_EXT) { 681 mm = 1; 682 } else { 683 g_assert_not_reached(); 684 } 685 686 /* EVEX.pp */ 687 if (opc & P_DATA16) { 688 pp = 1; /* 0x66 */ 689 } else if (opc & P_SIMDF3) { 690 pp = 2; /* 0xf3 */ 691 } else if (opc & P_SIMDF2) { 692 pp = 3; /* 0xf2 */ 693 } else { 694 pp = 0; 695 } 696 697 p = deposit32(p, 8, 2, mm); 698 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 699 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 700 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 701 p = deposit32(p, 16, 2, pp); 702 p = deposit32(p, 19, 4, ~v); 703 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 704 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 705 706 tcg_out32(s, p); 707 tcg_out8(s, opc); 708} 709 710static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 711{ 712 if (opc & P_EVEX) { 713 tcg_out_evex_opc(s, opc, r, v, rm, 0); 714 } else { 715 tcg_out_vex_opc(s, opc, r, v, rm, 0); 716 } 717 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 718} 719 720/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 721 We handle either RM and INDEX missing with a negative value. In 64-bit 722 mode for absolute addresses, ~RM is the size of the immediate operand 723 that will follow the instruction. */ 724 725static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 726 int shift, intptr_t offset) 727{ 728 int mod, len; 729 730 if (index < 0 && rm < 0) { 731 if (TCG_TARGET_REG_BITS == 64) { 732 /* Try for a rip-relative addressing mode. This has replaced 733 the 32-bit-mode absolute addressing encoding. 
*/ 734 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 735 intptr_t disp = offset - pc; 736 if (disp == (int32_t)disp) { 737 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 738 tcg_out32(s, disp); 739 return; 740 } 741 742 /* Try for an absolute address encoding. This requires the 743 use of the MODRM+SIB encoding and is therefore larger than 744 rip-relative addressing. */ 745 if (offset == (int32_t)offset) { 746 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 747 tcg_out8(s, (4 << 3) | 5); 748 tcg_out32(s, offset); 749 return; 750 } 751 752 /* ??? The memory isn't directly addressable. */ 753 g_assert_not_reached(); 754 } else { 755 /* Absolute address. */ 756 tcg_out8(s, (r << 3) | 5); 757 tcg_out32(s, offset); 758 return; 759 } 760 } 761 762 /* Find the length of the immediate addend. Note that the encoding 763 that would be used for (%ebp) indicates absolute addressing. */ 764 if (rm < 0) { 765 mod = 0, len = 4, rm = 5; 766 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 767 mod = 0, len = 0; 768 } else if (offset == (int8_t)offset) { 769 mod = 0x40, len = 1; 770 } else { 771 mod = 0x80, len = 4; 772 } 773 774 /* Use a single byte MODRM format if possible. Note that the encoding 775 that would be used for %esp is the escape to the two byte form. */ 776 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 777 /* Single byte MODRM format. */ 778 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 779 } else { 780 /* Two byte MODRM+SIB format. */ 781 782 /* Note that the encoding that would place %esp into the index 783 field indicates no index register. In 64-bit mode, the REX.X 784 bit counts, so %r12 can be used as the index. */ 785 if (index < 0) { 786 index = 4; 787 } else { 788 tcg_debug_assert(index != TCG_REG_ESP); 789 } 790 791 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 792 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 793 } 794 795 if (len == 1) { 796 tcg_out8(s, offset); 797 } else if (len == 4) { 798 tcg_out32(s, offset); 799 } 800} 801 802static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 803 int index, int shift, intptr_t offset) 804{ 805 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 806 tcg_out_sib_offset(s, r, rm, index, shift, offset); 807} 808 809static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 810 int rm, int index, int shift, 811 intptr_t offset) 812{ 813 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 814 tcg_out_sib_offset(s, r, rm, index, shift, offset); 815} 816 817/* A simplification of the above with no index or shift. */ 818static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 819 int rm, intptr_t offset) 820{ 821 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 822} 823 824static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 825 int v, int rm, intptr_t offset) 826{ 827 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 828} 829 830/* Output an opcode with an expected reference to the constant pool. */ 831static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 832{ 833 tcg_out_opc(s, opc, r, 0, 0); 834 /* Absolute for 32-bit, pc-relative for 64-bit. */ 835 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 836 tcg_out32(s, 0); 837} 838 839/* Output an opcode with an expected reference to the constant pool. */ 840static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 841{ 842 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 843 /* Absolute for 32-bit, pc-relative for 64-bit. 
*/ 844 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 845 tcg_out32(s, 0); 846} 847 848/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 849static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 850{ 851 /* Propagate an opcode prefix, such as P_REXW. */ 852 int ext = subop & ~0x7; 853 subop &= 0x7; 854 855 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 856} 857 858static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 859{ 860 int rexw = 0; 861 862 if (arg == ret) { 863 return true; 864 } 865 switch (type) { 866 case TCG_TYPE_I64: 867 rexw = P_REXW; 868 /* fallthru */ 869 case TCG_TYPE_I32: 870 if (ret < 16) { 871 if (arg < 16) { 872 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 873 } else { 874 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 875 } 876 } else { 877 if (arg < 16) { 878 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 879 } else { 880 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 881 } 882 } 883 break; 884 885 case TCG_TYPE_V64: 886 tcg_debug_assert(ret >= 16 && arg >= 16); 887 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 888 break; 889 case TCG_TYPE_V128: 890 tcg_debug_assert(ret >= 16 && arg >= 16); 891 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 892 break; 893 case TCG_TYPE_V256: 894 tcg_debug_assert(ret >= 16 && arg >= 16); 895 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 896 break; 897 898 default: 899 g_assert_not_reached(); 900 } 901 return true; 902} 903 904static const int avx2_dup_insn[4] = { 905 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 906 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 907}; 908 909static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 910 TCGReg r, TCGReg a) 911{ 912 if (have_avx2) { 913 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 914 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a); 915 } else { 916 switch (vece) { 917 case MO_8: 918 /* ??? With zero in a register, use PSHUFB. */ 919 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 920 a = r; 921 /* FALLTHRU */ 922 case MO_16: 923 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 924 a = r; 925 /* FALLTHRU */ 926 case MO_32: 927 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 928 /* imm8 operand: all output lanes selected from input lane 0. */ 929 tcg_out8(s, 0); 930 break; 931 case MO_64: 932 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 933 break; 934 default: 935 g_assert_not_reached(); 936 } 937 } 938 return true; 939} 940 941static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 942 TCGReg r, TCGReg base, intptr_t offset) 943{ 944 if (have_avx2) { 945 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 946 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 947 r, 0, base, offset); 948 } else { 949 switch (vece) { 950 case MO_64: 951 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 952 break; 953 case MO_32: 954 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 955 break; 956 case MO_16: 957 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 958 tcg_out8(s, 0); /* imm8 */ 959 tcg_out_dup_vec(s, type, vece, r, r); 960 break; 961 case MO_8: 962 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 963 tcg_out8(s, 0); /* imm8 */ 964 tcg_out_dup_vec(s, type, vece, r, r); 965 break; 966 default: 967 g_assert_not_reached(); 968 } 969 } 970 return true; 971} 972 973static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 974 TCGReg ret, int64_t arg) 975{ 976 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 977 978 if (arg == 0) { 979 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 980 return; 981 } 982 if (arg == -1) { 983 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 984 return; 985 } 986 987 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 988 if (have_avx2) { 989 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 990 } else { 991 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 992 } 993 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 994 } else { 995 if (type == TCG_TYPE_V64) { 996 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 997 } else if (have_avx2) { 998 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 999 } else { 1000 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1001 } 1002 if (TCG_TARGET_REG_BITS == 64) { 1003 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1004 } else { 1005 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1006 } 1007 } 1008} 1009 1010static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1011 TCGReg ret, tcg_target_long arg) 1012{ 1013 if (arg == 0) { 1014 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1015 return; 1016 } 1017 if (arg == -1) { 1018 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1019 return; 1020 } 1021 1022 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1023 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1024 if (TCG_TARGET_REG_BITS == 64) { 1025 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1026 } else { 1027 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1028 } 1029} 1030 1031static void tcg_out_movi_int(TCGContext *s, TCGType type, 1032 TCGReg ret, tcg_target_long arg) 1033{ 1034 tcg_target_long diff; 1035 1036 if (arg == 0) { 1037 tgen_arithr(s, ARITH_XOR, ret, ret); 1038 return; 1039 } 1040 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1041 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1042 tcg_out32(s, arg); 1043 return; 1044 } 1045 if (arg == (int32_t)arg) { 1046 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1047 tcg_out32(s, arg); 1048 return; 1049 } 1050 1051 /* Try a 7 byte pc-relative lea before the 10 byte movq. 
*/
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need to care
       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)"
       is faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16. */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ?
0 : P_REXW; 1281 /* movsbl */ 1282 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1283 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1284} 1285 1286static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1287{ 1288 /* movzwl */ 1289 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1290} 1291 1292static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1293{ 1294 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1295 /* movsw[lq] */ 1296 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1297} 1298 1299static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1300{ 1301 /* 32-bit mov zero extends. */ 1302 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1303} 1304 1305static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1306{ 1307 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1308 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1309} 1310 1311static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1312{ 1313 tcg_out_ext32s(s, dest, src); 1314} 1315 1316static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1317{ 1318 if (dest != src) { 1319 tcg_out_ext32u(s, dest, src); 1320 } 1321} 1322 1323static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1324{ 1325 tcg_out_ext32u(s, dest, src); 1326} 1327 1328static inline void tcg_out_bswap64(TCGContext *s, int reg) 1329{ 1330 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1331} 1332 1333static void tgen_arithi(TCGContext *s, int c, int r0, 1334 tcg_target_long val, int cf) 1335{ 1336 int rexw = 0; 1337 1338 if (TCG_TARGET_REG_BITS == 64) { 1339 rexw = c & -8; 1340 c &= 7; 1341 } 1342 1343 switch (c) { 1344 case ARITH_ADD: 1345 case ARITH_SUB: 1346 if (!cf) { 1347 /* 1348 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1349 * partial flags update stalls on Pentium4 and are not recommended 1350 * by current Intel optimization manuals. 1351 */ 1352 if (val == 1 || val == -1) { 1353 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1354 if (TCG_TARGET_REG_BITS == 64) { 1355 /* 1356 * The single-byte increment encodings are re-tasked 1357 * as the REX prefixes. Use the MODRM encoding. 1358 */ 1359 tcg_out_modrm(s, OPC_GRP5 + rexw, 1360 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1361 } else { 1362 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1363 } 1364 return; 1365 } 1366 if (val == 128) { 1367 /* 1368 * Facilitate using an 8-bit immediate. Carry is inverted 1369 * by this transformation, so do it only if cf == 0. 1370 */ 1371 c ^= ARITH_ADD ^ ARITH_SUB; 1372 val = -128; 1373 } 1374 } 1375 break; 1376 1377 case ARITH_AND: 1378 if (TCG_TARGET_REG_BITS == 64) { 1379 if (val == 0xffffffffu) { 1380 tcg_out_ext32u(s, r0, r0); 1381 return; 1382 } 1383 if (val == (uint32_t)val) { 1384 /* AND with no high bits set can use a 32-bit operation. 
*/ 1385 rexw = 0; 1386 } 1387 } 1388 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1389 tcg_out_ext8u(s, r0, r0); 1390 return; 1391 } 1392 if (val == 0xffffu) { 1393 tcg_out_ext16u(s, r0, r0); 1394 return; 1395 } 1396 break; 1397 1398 case ARITH_OR: 1399 case ARITH_XOR: 1400 if (val >= 0x80 && val <= 0xff 1401 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1402 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1403 tcg_out8(s, val); 1404 return; 1405 } 1406 break; 1407 } 1408 1409 if (val == (int8_t)val) { 1410 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1411 tcg_out8(s, val); 1412 return; 1413 } 1414 if (rexw == 0 || val == (int32_t)val) { 1415 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1416 tcg_out32(s, val); 1417 return; 1418 } 1419 1420 g_assert_not_reached(); 1421} 1422 1423static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1424{ 1425 if (val != 0) { 1426 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1427 } 1428} 1429 1430/* Set SMALL to force a short forward branch. */ 1431static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1432{ 1433 int32_t val, val1; 1434 1435 if (l->has_value) { 1436 val = tcg_pcrel_diff(s, l->u.value_ptr); 1437 val1 = val - 2; 1438 if ((int8_t)val1 == val1) { 1439 if (opc == -1) { 1440 tcg_out8(s, OPC_JMP_short); 1441 } else { 1442 tcg_out8(s, OPC_JCC_short + opc); 1443 } 1444 tcg_out8(s, val1); 1445 } else { 1446 tcg_debug_assert(!small); 1447 if (opc == -1) { 1448 tcg_out8(s, OPC_JMP_long); 1449 tcg_out32(s, val - 5); 1450 } else { 1451 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1452 tcg_out32(s, val - 6); 1453 } 1454 } 1455 } else if (small) { 1456 if (opc == -1) { 1457 tcg_out8(s, OPC_JMP_short); 1458 } else { 1459 tcg_out8(s, OPC_JCC_short + opc); 1460 } 1461 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1462 s->code_ptr += 1; 1463 } else { 1464 if (opc == -1) { 1465 tcg_out8(s, OPC_JMP_long); 1466 } else { 1467 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1468 } 1469 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1470 s->code_ptr += 4; 1471 } 1472} 1473 1474static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1475 TCGArg arg2, int const_arg2, int rexw) 1476{ 1477 int jz, js; 1478 1479 if (!is_tst_cond(cond)) { 1480 if (!const_arg2) { 1481 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1482 } else if (arg2 == 0) { 1483 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1484 } else { 1485 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1486 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1487 } 1488 return tcg_cond_to_jcc[cond]; 1489 } 1490 1491 jz = tcg_cond_to_jcc[cond]; 1492 js = (cond == TCG_COND_TSTNE ? 
JCC_JS : JCC_JNS); 1493 1494 if (!const_arg2) { 1495 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1496 return jz; 1497 } 1498 1499 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1500 if (arg2 == 0x80) { 1501 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1502 return js; 1503 } 1504 if (arg2 == 0xff) { 1505 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1506 return jz; 1507 } 1508 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1509 tcg_out8(s, arg2); 1510 return jz; 1511 } 1512 1513 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1514 if (arg2 == 0x8000) { 1515 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1516 return js; 1517 } 1518 if (arg2 == 0xff00) { 1519 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1520 return jz; 1521 } 1522 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1523 tcg_out8(s, arg2 >> 8); 1524 return jz; 1525 } 1526 1527 if (arg2 == 0xffff) { 1528 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1529 return jz; 1530 } 1531 if (arg2 == 0xffffffffu) { 1532 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1533 return jz; 1534 } 1535 1536 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1537 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1538 int sh = ctz64(arg2); 1539 1540 rexw = (sh & 32 ? P_REXW : 0); 1541 if ((sh & 31) == 31) { 1542 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1543 return js; 1544 } else { 1545 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1546 tcg_out8(s, sh); 1547 return jc; 1548 } 1549 } 1550 1551 if (rexw) { 1552 if (arg2 == (uint32_t)arg2) { 1553 rexw = 0; 1554 } else { 1555 tcg_debug_assert(arg2 == (int32_t)arg2); 1556 } 1557 } 1558 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1559 tcg_out32(s, arg2); 1560 return jz; 1561} 1562 1563static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1564 TCGArg arg1, TCGArg arg2, int const_arg2, 1565 TCGLabel *label, bool small) 1566{ 1567 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1568 tcg_out_jxx(s, jcc, label, small); 1569} 1570 1571#if TCG_TARGET_REG_BITS == 32 1572static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1573 const int *const_args, bool small) 1574{ 1575 TCGLabel *label_next = gen_new_label(); 1576 TCGLabel *label_this = arg_label(args[5]); 1577 TCGCond cond = args[4]; 1578 1579 switch (cond) { 1580 case TCG_COND_EQ: 1581 case TCG_COND_TSTEQ: 1582 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1583 args[0], args[2], const_args[2], label_next, 1); 1584 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1585 label_this, small); 1586 break; 1587 case TCG_COND_NE: 1588 case TCG_COND_TSTNE: 1589 tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2], 1590 label_this, small); 1591 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1592 label_this, small); 1593 break; 1594 case TCG_COND_LT: 1595 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1596 label_this, small); 1597 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1598 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1599 label_this, small); 1600 break; 1601 case TCG_COND_LE: 1602 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1603 label_this, small); 1604 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1605 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1606 label_this, small); 1607 break; 1608 case TCG_COND_GT: 1609 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1610 label_this, small); 1611 tcg_out_jxx(s, JCC_JNE, label_next, 1); 
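        /*
         * Flags still reflect the high-half comparison: if the halves
         * differed (and the signed '>' above did not branch), the result
         * is false; otherwise fall through and decide the '>' on the
         * low halves with an unsigned comparison.
         */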
1612 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1613 label_this, small); 1614 break; 1615 case TCG_COND_GE: 1616 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1617 label_this, small); 1618 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1619 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1620 label_this, small); 1621 break; 1622 case TCG_COND_LTU: 1623 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1624 label_this, small); 1625 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1626 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1627 label_this, small); 1628 break; 1629 case TCG_COND_LEU: 1630 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1631 label_this, small); 1632 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1633 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1634 label_this, small); 1635 break; 1636 case TCG_COND_GTU: 1637 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1638 label_this, small); 1639 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1640 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1641 label_this, small); 1642 break; 1643 case TCG_COND_GEU: 1644 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1645 label_this, small); 1646 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1647 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1648 label_this, small); 1649 break; 1650 default: 1651 g_assert_not_reached(); 1652 } 1653 tcg_out_label(s, label_next); 1654} 1655#endif 1656 1657static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond, 1658 TCGArg dest, TCGArg arg1, TCGArg arg2, 1659 int const_arg2, bool neg) 1660{ 1661 int cmp_rexw = rexw; 1662 bool inv = false; 1663 bool cleared; 1664 int jcc; 1665 1666 switch (cond) { 1667 case TCG_COND_NE: 1668 inv = true; 1669 /* fall through */ 1670 case TCG_COND_EQ: 1671 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1672 if (const_arg2 && arg2 == 0) { 1673 arg2 = 1; 1674 goto do_ltu; 1675 } 1676 break; 1677 1678 case TCG_COND_TSTNE: 1679 inv = true; 1680 /* fall through */ 1681 case TCG_COND_TSTEQ: 1682 /* If arg2 is -1, convert to LTU/GEU vs 1. */ 1683 if (const_arg2 && arg2 == 0xffffffffu) { 1684 arg2 = 1; 1685 cmp_rexw = 0; 1686 goto do_ltu; 1687 } 1688 break; 1689 1690 case TCG_COND_LEU: 1691 inv = true; 1692 /* fall through */ 1693 case TCG_COND_GTU: 1694 /* If arg2 is a register, swap for LTU/GEU. */ 1695 if (!const_arg2) { 1696 TCGReg t = arg1; 1697 arg1 = arg2; 1698 arg2 = t; 1699 goto do_ltu; 1700 } 1701 break; 1702 1703 case TCG_COND_GEU: 1704 inv = true; 1705 /* fall through */ 1706 case TCG_COND_LTU: 1707 do_ltu: 1708 /* 1709 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1710 * We can then use NEG or INC to produce the desired result. 1711 * This is always smaller than the SETCC expansion. 1712 */ 1713 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1714 1715 /* X - X - C = -C = (C ? -1 : 0) */ 1716 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1717 if (inv && neg) { 1718 /* ~(C ? -1 : 0) = (C ? 0 : -1) */ 1719 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1720 } else if (inv) { 1721 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1722 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1723 } else if (!neg) { 1724 /* -(C ? -1 : 0) = (C ? 
1 : 0) */ 1725 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1726 } 1727 return; 1728 1729 case TCG_COND_GE: 1730 inv = true; 1731 /* fall through */ 1732 case TCG_COND_LT: 1733 /* If arg2 is 0, extract the sign bit. */ 1734 if (const_arg2 && arg2 == 0) { 1735 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1); 1736 if (inv) { 1737 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1738 } 1739 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1740 dest, rexw ? 63 : 31); 1741 return; 1742 } 1743 break; 1744 1745 default: 1746 break; 1747 } 1748 1749 /* 1750 * If dest does not overlap the inputs, clearing it first is preferred. 1751 * The XOR breaks any false dependency for the low-byte write to dest, 1752 * and is also one byte smaller than MOVZBL. 1753 */ 1754 cleared = false; 1755 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1756 tgen_arithr(s, ARITH_XOR, dest, dest); 1757 cleared = true; 1758 } 1759 1760 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1761 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1762 1763 if (!cleared) { 1764 tcg_out_ext8u(s, dest, dest); 1765 } 1766 if (neg) { 1767 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1768 } 1769} 1770 1771#if TCG_TARGET_REG_BITS == 32 1772static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1773 const int *const_args) 1774{ 1775 TCGArg new_args[6]; 1776 TCGLabel *label_true, *label_over; 1777 1778 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1779 1780 if (args[0] == args[1] || args[0] == args[2] 1781 || (!const_args[3] && args[0] == args[3]) 1782 || (!const_args[4] && args[0] == args[4])) { 1783 /* When the destination overlaps with one of the argument 1784 registers, don't do anything tricky. */ 1785 label_true = gen_new_label(); 1786 label_over = gen_new_label(); 1787 1788 new_args[5] = label_arg(label_true); 1789 tcg_out_brcond2(s, new_args, const_args+1, 1); 1790 1791 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1792 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1793 tcg_out_label(s, label_true); 1794 1795 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1796 tcg_out_label(s, label_over); 1797 } else { 1798 /* When the destination does not overlap one of the arguments, 1799 clear the destination first, jump if cond false, and emit an 1800 increment in the true case. This results in smaller code. 
*/ 1801 1802 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1803 1804 label_over = gen_new_label(); 1805 new_args[4] = tcg_invert_cond(new_args[4]); 1806 new_args[5] = label_arg(label_over); 1807 tcg_out_brcond2(s, new_args, const_args+1, 1); 1808 1809 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1810 tcg_out_label(s, label_over); 1811 } 1812} 1813#endif 1814 1815static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1816 TCGReg dest, TCGReg v1) 1817{ 1818 if (have_cmov) { 1819 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1820 } else { 1821 TCGLabel *over = gen_new_label(); 1822 tcg_out_jxx(s, jcc ^ 1, over, 1); 1823 tcg_out_mov(s, TCG_TYPE_I32, dest, v1); 1824 tcg_out_label(s, over); 1825 } 1826} 1827 1828static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1829 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1830 TCGReg v1) 1831{ 1832 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1833 tcg_out_cmov(s, jcc, rexw, dest, v1); 1834} 1835 1836static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1837 TCGArg arg2, bool const_a2) 1838{ 1839 if (have_bmi1) { 1840 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1); 1841 if (const_a2) { 1842 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1843 } else { 1844 tcg_debug_assert(dest != arg2); 1845 tcg_out_cmov(s, JCC_JB, rexw, dest, arg2); 1846 } 1847 } else { 1848 tcg_debug_assert(dest != arg2); 1849 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1); 1850 tcg_out_cmov(s, JCC_JE, rexw, dest, arg2); 1851 } 1852} 1853 1854static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1855 TCGArg arg2, bool const_a2) 1856{ 1857 if (have_lzcnt) { 1858 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1); 1859 if (const_a2) { 1860 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1861 } else { 1862 tcg_debug_assert(dest != arg2); 1863 tcg_out_cmov(s, JCC_JB, rexw, dest, arg2); 1864 } 1865 } else { 1866 tcg_debug_assert(!const_a2); 1867 tcg_debug_assert(dest != arg1); 1868 tcg_debug_assert(dest != arg2); 1869 1870 /* Recall that the output of BSR is the index not the count. */ 1871 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1); 1872 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0); 1873 1874 /* Since we have destroyed the flags from BSR, we have to re-test. */ 1875 int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw); 1876 tcg_out_cmov(s, jcc, rexw, dest, arg2); 1877 } 1878} 1879 1880static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1881{ 1882 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1883 1884 if (disp == (int32_t)disp) { 1885 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1886 tcg_out32(s, disp); 1887 } else { 1888 /* rip-relative addressing into the constant pool. 1889 This is 6 + 8 = 14 bytes, as compared to using an 1890 immediate load 10 + 6 = 16 bytes, plus we may 1891 be able to re-use the pool constant for more calls. */ 1892 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1893 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1894 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1895 tcg_out32(s, 0); 1896 } 1897} 1898 1899static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1900 const TCGHelperInfo *info) 1901{ 1902 tcg_out_branch(s, 1, dest); 1903 1904#ifndef _WIN32 1905 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1906 /* 1907 * The sysv i386 abi for struct return places a reference as the 1908 * first argument of the stack, and pops that argument with the 1909 * return statement. 
Since we want to retain the aligned stack 1910 * pointer for the callee, we do not want to actually push that 1911 * argument before the call but rely on the normal store to the 1912 * stack slot. But we do need to compensate for the pop in order 1913 * to reset our correct stack pointer value. 1914 * Pushing a garbage value back onto the stack is quickest. 1915 */ 1916 tcg_out_push(s, TCG_REG_EAX); 1917 } 1918#endif 1919} 1920 1921static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1922{ 1923 tcg_out_branch(s, 0, dest); 1924} 1925 1926static void tcg_out_nopn(TCGContext *s, int n) 1927{ 1928 int i; 1929 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1930 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1931 * duplicate prefix, and all of the interesting recent cores can 1932 * decode and discard the duplicates in a single cycle. 1933 */ 1934 tcg_debug_assert(n >= 1); 1935 for (i = 1; i < n; ++i) { 1936 tcg_out8(s, 0x66); 1937 } 1938 tcg_out8(s, 0x90); 1939} 1940 1941typedef struct { 1942 TCGReg base; 1943 int index; 1944 int ofs; 1945 int seg; 1946 TCGAtomAlign aa; 1947} HostAddress; 1948 1949bool tcg_target_has_memory_bswap(MemOp memop) 1950{ 1951 TCGAtomAlign aa; 1952 1953 if (!have_movbe) { 1954 return false; 1955 } 1956 if ((memop & MO_SIZE) < MO_128) { 1957 return true; 1958 } 1959 1960 /* 1961 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 1962 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 1963 */ 1964 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 1965 return aa.atom < MO_128; 1966} 1967 1968/* 1969 * Because i686 has no register parameters and because x86_64 has xchg 1970 * to handle addr/data register overlap, we have placed all input arguments 1971 * before we need might need a scratch reg. 1972 * 1973 * Even then, a scratch is only needed for l->raddr. Rather than expose 1974 * a general-purpose scratch when we don't actually know it's available, 1975 * use the ra_gen hook to load into RAX if needed. 1976 */ 1977#if TCG_TARGET_REG_BITS == 64 1978static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 1979{ 1980 if (arg < 0) { 1981 arg = TCG_REG_RAX; 1982 } 1983 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 1984 return arg; 1985} 1986static const TCGLdstHelperParam ldst_helper_param = { 1987 .ra_gen = ldst_ra_gen 1988}; 1989#else 1990static const TCGLdstHelperParam ldst_helper_param = { }; 1991#endif 1992 1993static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 1994 TCGReg l, TCGReg h, TCGReg v) 1995{ 1996 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1997 1998 /* vpmov{d,q} %v, %l */ 1999 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2000 /* vpextr{d,q} $1, %v, %h */ 2001 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2002 tcg_out8(s, 1); 2003} 2004 2005static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2006 TCGReg v, TCGReg l, TCGReg h) 2007{ 2008 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 2009 2010 /* vmov{d,q} %l, %v */ 2011 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2012 /* vpinsr{d,q} $1, %h, %v, %v */ 2013 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2014 tcg_out8(s, 1); 2015} 2016 2017/* 2018 * Generate code for the slow path for a load at the end of block 2019 */ 2020static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2021{ 2022 MemOp opc = get_memop(l->oi); 2023 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2024 2025 /* resolve label address */ 2026 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2027 if (label_ptr[1]) { 2028 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2029 } 2030 2031 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2032 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2033 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2034 2035 tcg_out_jmp(s, l->raddr); 2036 return true; 2037} 2038 2039/* 2040 * Generate code for the slow path for a store at the end of block 2041 */ 2042static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2043{ 2044 MemOp opc = get_memop(l->oi); 2045 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2046 2047 /* resolve label address */ 2048 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2049 if (label_ptr[1]) { 2050 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2051 } 2052 2053 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2054 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2055 2056 tcg_out_jmp(s, l->raddr); 2057 return true; 2058} 2059 2060#ifdef CONFIG_USER_ONLY 2061static HostAddress x86_guest_base = { 2062 .index = -1 2063}; 2064 2065#if defined(__x86_64__) && defined(__linux__) 2066# include <asm/prctl.h> 2067# include <sys/prctl.h> 2068int arch_prctl(int code, unsigned long addr); 2069static inline int setup_guest_base_seg(void) 2070{ 2071 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2072 return P_GS; 2073 } 2074 return 0; 2075} 2076#define setup_guest_base_seg setup_guest_base_seg 2077#elif defined(__x86_64__) && \ 2078 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2079# include <machine/sysarch.h> 2080static inline int setup_guest_base_seg(void) 2081{ 2082 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2083 return P_GS; 2084 } 2085 return 0; 2086} 2087#define setup_guest_base_seg setup_guest_base_seg 2088#endif 2089#else 2090# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2091#endif /* CONFIG_USER_ONLY */ 2092#ifndef setup_guest_base_seg 2093# define setup_guest_base_seg() 0 2094#endif 2095 2096#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2097 2098/* 2099 * For softmmu, perform the TLB load and compare. 2100 * For useronly, perform any required alignment tests. 2101 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2102 * is required and fill in @h with the host address for the fast path. 2103 */ 2104static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2105 TCGReg addrlo, TCGReg addrhi, 2106 MemOpIdx oi, bool is_ld) 2107{ 2108 TCGLabelQemuLdst *ldst = NULL; 2109 MemOp opc = get_memop(oi); 2110 MemOp s_bits = opc & MO_SIZE; 2111 unsigned a_mask; 2112 2113 if (tcg_use_softmmu) { 2114 h->index = TCG_REG_L0; 2115 h->ofs = 0; 2116 h->seg = 0; 2117 } else { 2118 *h = x86_guest_base; 2119 } 2120 h->base = addrlo; 2121 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2122 a_mask = (1 << h->aa.align) - 1; 2123 2124 if (tcg_use_softmmu) { 2125 int cmp_ofs = is_ld ? 
offsetof(CPUTLBEntry, addr_read) 2126 : offsetof(CPUTLBEntry, addr_write); 2127 TCGType ttype = TCG_TYPE_I32; 2128 TCGType tlbtype = TCG_TYPE_I32; 2129 int trexw = 0, hrexw = 0, tlbrexw = 0; 2130 unsigned mem_index = get_mmuidx(oi); 2131 unsigned s_mask = (1 << s_bits) - 1; 2132 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2133 int tlb_mask; 2134 2135 ldst = new_ldst_label(s); 2136 ldst->is_ld = is_ld; 2137 ldst->oi = oi; 2138 ldst->addrlo_reg = addrlo; 2139 ldst->addrhi_reg = addrhi; 2140 2141 if (TCG_TARGET_REG_BITS == 64) { 2142 ttype = s->addr_type; 2143 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2144 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2145 hrexw = P_REXW; 2146 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2147 tlbtype = TCG_TYPE_I64; 2148 tlbrexw = P_REXW; 2149 } 2150 } 2151 } 2152 2153 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 2154 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2155 s->page_bits - CPU_TLB_ENTRY_BITS); 2156 2157 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2158 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2159 2160 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2161 fast_ofs + offsetof(CPUTLBDescFast, table)); 2162 2163 /* 2164 * If the required alignment is at least as large as the access, 2165 * simply copy the address and mask. For lesser alignments, 2166 * check that we don't cross pages for the complete access. 2167 */ 2168 if (a_mask >= s_mask) { 2169 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 2170 } else { 2171 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2172 addrlo, s_mask - a_mask); 2173 } 2174 tlb_mask = s->page_mask | a_mask; 2175 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2176 2177 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2178 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2179 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2180 2181 /* jne slow_path */ 2182 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2183 ldst->label_ptr[0] = s->code_ptr; 2184 s->code_ptr += 4; 2185 2186 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { 2187 /* cmp 4(TCG_REG_L0), addrhi */ 2188 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, 2189 TCG_REG_L0, cmp_ofs + 4); 2190 2191 /* jne slow_path */ 2192 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2193 ldst->label_ptr[1] = s->code_ptr; 2194 s->code_ptr += 4; 2195 } 2196 2197 /* TLB Hit. */ 2198 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2199 offsetof(CPUTLBEntry, addend)); 2200 } else if (a_mask) { 2201 int jcc; 2202 2203 ldst = new_ldst_label(s); 2204 ldst->is_ld = is_ld; 2205 ldst->oi = oi; 2206 ldst->addrlo_reg = addrlo; 2207 ldst->addrhi_reg = addrhi; 2208 2209 /* jne slow_path */ 2210 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false); 2211 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2212 ldst->label_ptr[0] = s->code_ptr; 2213 s->code_ptr += 4; 2214 } 2215 2216 return ldst; 2217} 2218 2219static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2220 HostAddress h, TCGType type, MemOp memop) 2221{ 2222 bool use_movbe = false; 2223 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2224 int movop = OPC_MOVL_GvEv; 2225 2226 /* Do big-endian loads with movbe. 
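       MOVBE performs the byte swap as part of the memory access itself,
       so no separate BSWAP of the loaded value is required.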
*/ 2227 if (memop & MO_BSWAP) { 2228 tcg_debug_assert(have_movbe); 2229 use_movbe = true; 2230 movop = OPC_MOVBE_GyMy; 2231 } 2232 2233 switch (memop & MO_SSIZE) { 2234 case MO_UB: 2235 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2236 h.base, h.index, 0, h.ofs); 2237 break; 2238 case MO_SB: 2239 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2240 h.base, h.index, 0, h.ofs); 2241 break; 2242 case MO_UW: 2243 if (use_movbe) { 2244 /* There is no extending movbe; only low 16-bits are modified. */ 2245 if (datalo != h.base && datalo != h.index) { 2246 /* XOR breaks dependency chains. */ 2247 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2248 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2249 datalo, h.base, h.index, 0, h.ofs); 2250 } else { 2251 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2252 datalo, h.base, h.index, 0, h.ofs); 2253 tcg_out_ext16u(s, datalo, datalo); 2254 } 2255 } else { 2256 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2257 h.base, h.index, 0, h.ofs); 2258 } 2259 break; 2260 case MO_SW: 2261 if (use_movbe) { 2262 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2263 datalo, h.base, h.index, 0, h.ofs); 2264 tcg_out_ext16s(s, type, datalo, datalo); 2265 } else { 2266 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2267 datalo, h.base, h.index, 0, h.ofs); 2268 } 2269 break; 2270 case MO_UL: 2271 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2272 h.base, h.index, 0, h.ofs); 2273 break; 2274#if TCG_TARGET_REG_BITS == 64 2275 case MO_SL: 2276 if (use_movbe) { 2277 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2278 h.base, h.index, 0, h.ofs); 2279 tcg_out_ext32s(s, datalo, datalo); 2280 } else { 2281 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2282 h.base, h.index, 0, h.ofs); 2283 } 2284 break; 2285#endif 2286 case MO_UQ: 2287 if (TCG_TARGET_REG_BITS == 64) { 2288 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2289 h.base, h.index, 0, h.ofs); 2290 break; 2291 } 2292 if (use_movbe) { 2293 TCGReg t = datalo; 2294 datalo = datahi; 2295 datahi = t; 2296 } 2297 if (h.base == datalo || h.index == datalo) { 2298 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2299 h.base, h.index, 0, h.ofs); 2300 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2301 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2302 } else { 2303 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2304 h.base, h.index, 0, h.ofs); 2305 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2306 h.base, h.index, 0, h.ofs + 4); 2307 } 2308 break; 2309 2310 case MO_128: 2311 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2312 2313 /* 2314 * Without 16-byte atomicity, use integer regs. 2315 * That is where we want the data, and it allows bswaps. 2316 */ 2317 if (h.aa.atom < MO_128) { 2318 if (use_movbe) { 2319 TCGReg t = datalo; 2320 datalo = datahi; 2321 datahi = t; 2322 } 2323 if (h.base == datalo || h.index == datalo) { 2324 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2325 h.base, h.index, 0, h.ofs); 2326 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2327 datalo, datahi, 0); 2328 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2329 datahi, datahi, 8); 2330 } else { 2331 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2332 h.base, h.index, 0, h.ofs); 2333 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2334 h.base, h.index, 0, h.ofs + 8); 2335 } 2336 break; 2337 } 2338 2339 /* 2340 * With 16-byte atomicity, a vector load is required. 
2341 * If we already have 16-byte alignment, then VMOVDQA always works. 2342 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2343 * Else use we require a runtime test for alignment for VMOVDQA; 2344 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2345 */ 2346 if (h.aa.align >= MO_128) { 2347 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2348 TCG_TMP_VEC, 0, 2349 h.base, h.index, 0, h.ofs); 2350 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2351 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2352 TCG_TMP_VEC, 0, 2353 h.base, h.index, 0, h.ofs); 2354 } else { 2355 TCGLabel *l1 = gen_new_label(); 2356 TCGLabel *l2 = gen_new_label(); 2357 int jcc; 2358 2359 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2360 tcg_out_jxx(s, jcc, l1, true); 2361 2362 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2363 TCG_TMP_VEC, 0, 2364 h.base, h.index, 0, h.ofs); 2365 tcg_out_jxx(s, JCC_JMP, l2, true); 2366 2367 tcg_out_label(s, l1); 2368 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2369 TCG_TMP_VEC, 0, 2370 h.base, h.index, 0, h.ofs); 2371 tcg_out_label(s, l2); 2372 } 2373 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2374 break; 2375 2376 default: 2377 g_assert_not_reached(); 2378 } 2379} 2380 2381static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2382 TCGReg addrlo, TCGReg addrhi, 2383 MemOpIdx oi, TCGType data_type) 2384{ 2385 TCGLabelQemuLdst *ldst; 2386 HostAddress h; 2387 2388 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); 2389 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2390 2391 if (ldst) { 2392 ldst->type = data_type; 2393 ldst->datalo_reg = datalo; 2394 ldst->datahi_reg = datahi; 2395 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2396 } 2397} 2398 2399static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2400 HostAddress h, MemOp memop) 2401{ 2402 bool use_movbe = false; 2403 int movop = OPC_MOVL_EvGv; 2404 2405 /* 2406 * Do big-endian stores with movbe or system-mode. 2407 * User-only without movbe will have its swapping done generically. 2408 */ 2409 if (memop & MO_BSWAP) { 2410 tcg_debug_assert(have_movbe); 2411 use_movbe = true; 2412 movop = OPC_MOVBE_MyGy; 2413 } 2414 2415 switch (memop & MO_SIZE) { 2416 case MO_8: 2417 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2418 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2419 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2420 datalo, h.base, h.index, 0, h.ofs); 2421 break; 2422 case MO_16: 2423 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2424 h.base, h.index, 0, h.ofs); 2425 break; 2426 case MO_32: 2427 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2428 h.base, h.index, 0, h.ofs); 2429 break; 2430 case MO_64: 2431 if (TCG_TARGET_REG_BITS == 64) { 2432 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2433 h.base, h.index, 0, h.ofs); 2434 } else { 2435 if (use_movbe) { 2436 TCGReg t = datalo; 2437 datalo = datahi; 2438 datahi = t; 2439 } 2440 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2441 h.base, h.index, 0, h.ofs); 2442 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2443 h.base, h.index, 0, h.ofs + 4); 2444 } 2445 break; 2446 2447 case MO_128: 2448 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2449 2450 /* 2451 * Without 16-byte atomicity, use integer regs. 2452 * That is where we have the data, and it allows bswaps. 
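     * (Emitted below as a pair of 64-bit MOV/MOVBE stores to the low
     * and high halves of the 16-byte value.)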
2453 */ 2454 if (h.aa.atom < MO_128) { 2455 if (use_movbe) { 2456 TCGReg t = datalo; 2457 datalo = datahi; 2458 datahi = t; 2459 } 2460 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2461 h.base, h.index, 0, h.ofs); 2462 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2463 h.base, h.index, 0, h.ofs + 8); 2464 break; 2465 } 2466 2467 /* 2468 * With 16-byte atomicity, a vector store is required. 2469 * If we already have 16-byte alignment, then VMOVDQA always works. 2470 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2471 * Else use we require a runtime test for alignment for VMOVDQA; 2472 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2473 */ 2474 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2475 if (h.aa.align >= MO_128) { 2476 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2477 TCG_TMP_VEC, 0, 2478 h.base, h.index, 0, h.ofs); 2479 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2480 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2481 TCG_TMP_VEC, 0, 2482 h.base, h.index, 0, h.ofs); 2483 } else { 2484 TCGLabel *l1 = gen_new_label(); 2485 TCGLabel *l2 = gen_new_label(); 2486 int jcc; 2487 2488 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2489 tcg_out_jxx(s, jcc, l1, true); 2490 2491 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2492 TCG_TMP_VEC, 0, 2493 h.base, h.index, 0, h.ofs); 2494 tcg_out_jxx(s, JCC_JMP, l2, true); 2495 2496 tcg_out_label(s, l1); 2497 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2498 TCG_TMP_VEC, 0, 2499 h.base, h.index, 0, h.ofs); 2500 tcg_out_label(s, l2); 2501 } 2502 break; 2503 2504 default: 2505 g_assert_not_reached(); 2506 } 2507} 2508 2509static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2510 TCGReg addrlo, TCGReg addrhi, 2511 MemOpIdx oi, TCGType data_type) 2512{ 2513 TCGLabelQemuLdst *ldst; 2514 HostAddress h; 2515 2516 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); 2517 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2518 2519 if (ldst) { 2520 ldst->type = data_type; 2521 ldst->datalo_reg = datalo; 2522 ldst->datahi_reg = datahi; 2523 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2524 } 2525} 2526 2527static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2528{ 2529 /* Reuse the zeroing that exists for goto_ptr. 
*/ 2530 if (a0 == 0) { 2531 tcg_out_jmp(s, tcg_code_gen_epilogue); 2532 } else { 2533 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2534 tcg_out_jmp(s, tb_ret_addr); 2535 } 2536} 2537 2538static void tcg_out_goto_tb(TCGContext *s, int which) 2539{ 2540 /* 2541 * Jump displacement must be aligned for atomic patching; 2542 * see if we need to add extra nops before jump 2543 */ 2544 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2545 if (gap != 1) { 2546 tcg_out_nopn(s, gap - 1); 2547 } 2548 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2549 set_jmp_insn_offset(s, which); 2550 tcg_out32(s, 0); 2551 set_jmp_reset_offset(s, which); 2552} 2553 2554void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2555 uintptr_t jmp_rx, uintptr_t jmp_rw) 2556{ 2557 /* patch the branch destination */ 2558 uintptr_t addr = tb->jmp_target_addr[n]; 2559 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2560 /* no need to flush icache explicitly */ 2561} 2562 2563static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2564 const TCGArg args[TCG_MAX_OP_ARGS], 2565 const int const_args[TCG_MAX_OP_ARGS]) 2566{ 2567 TCGArg a0, a1, a2; 2568 int c, const_a2, vexop, rexw = 0; 2569 2570#if TCG_TARGET_REG_BITS == 64 2571# define OP_32_64(x) \ 2572 case glue(glue(INDEX_op_, x), _i64): \ 2573 rexw = P_REXW; /* FALLTHRU */ \ 2574 case glue(glue(INDEX_op_, x), _i32) 2575#else 2576# define OP_32_64(x) \ 2577 case glue(glue(INDEX_op_, x), _i32) 2578#endif 2579 2580 /* Hoist the loads of the most common arguments. */ 2581 a0 = args[0]; 2582 a1 = args[1]; 2583 a2 = args[2]; 2584 const_a2 = const_args[2]; 2585 2586 switch (opc) { 2587 case INDEX_op_goto_ptr: 2588 /* jmp to the given host address (could be epilogue) */ 2589 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2590 break; 2591 case INDEX_op_br: 2592 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2593 break; 2594 OP_32_64(ld8u): 2595 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2596 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2597 break; 2598 OP_32_64(ld8s): 2599 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2600 break; 2601 OP_32_64(ld16u): 2602 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2603 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2604 break; 2605 OP_32_64(ld16s): 2606 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2607 break; 2608#if TCG_TARGET_REG_BITS == 64 2609 case INDEX_op_ld32u_i64: 2610#endif 2611 case INDEX_op_ld_i32: 2612 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2613 break; 2614 2615 OP_32_64(st8): 2616 if (const_args[0]) { 2617 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2618 tcg_out8(s, a0); 2619 } else { 2620 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2621 } 2622 break; 2623 OP_32_64(st16): 2624 if (const_args[0]) { 2625 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2626 tcg_out16(s, a0); 2627 } else { 2628 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2629 } 2630 break; 2631#if TCG_TARGET_REG_BITS == 64 2632 case INDEX_op_st32_i64: 2633#endif 2634 case INDEX_op_st_i32: 2635 if (const_args[0]) { 2636 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2637 tcg_out32(s, a0); 2638 } else { 2639 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2640 } 2641 break; 2642 2643 OP_32_64(add): 2644 /* For 3-operand addition, use LEA. 
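       E.g. add dest,src,imm becomes lea imm(%src),%dest and
       add dest,s1,s2 becomes lea (%s1,%s2),%dest, leaving both sources intact.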
*/ 2645 if (a0 != a1) { 2646 TCGArg c3 = 0; 2647 if (const_a2) { 2648 c3 = a2, a2 = -1; 2649 } else if (a0 == a2) { 2650 /* Watch out for dest = src + dest, since we've removed 2651 the matching constraint on the add. */ 2652 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2653 break; 2654 } 2655 2656 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2657 break; 2658 } 2659 c = ARITH_ADD; 2660 goto gen_arith; 2661 OP_32_64(sub): 2662 c = ARITH_SUB; 2663 goto gen_arith; 2664 OP_32_64(and): 2665 c = ARITH_AND; 2666 goto gen_arith; 2667 OP_32_64(or): 2668 c = ARITH_OR; 2669 goto gen_arith; 2670 OP_32_64(xor): 2671 c = ARITH_XOR; 2672 goto gen_arith; 2673 gen_arith: 2674 if (const_a2) { 2675 tgen_arithi(s, c + rexw, a0, a2, 0); 2676 } else { 2677 tgen_arithr(s, c + rexw, a0, a2); 2678 } 2679 break; 2680 2681 OP_32_64(andc): 2682 if (const_a2) { 2683 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2684 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2685 } else { 2686 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2687 } 2688 break; 2689 2690 OP_32_64(mul): 2691 if (const_a2) { 2692 int32_t val; 2693 val = a2; 2694 if (val == (int8_t)val) { 2695 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2696 tcg_out8(s, val); 2697 } else { 2698 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2699 tcg_out32(s, val); 2700 } 2701 } else { 2702 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2703 } 2704 break; 2705 2706 OP_32_64(div2): 2707 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2708 break; 2709 OP_32_64(divu2): 2710 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2711 break; 2712 2713 OP_32_64(shl): 2714 /* For small constant 3-operand shift, use LEA. */ 2715 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2716 if (a2 - 1 == 0) { 2717 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2718 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2719 } else { 2720 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2721 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2722 } 2723 break; 2724 } 2725 c = SHIFT_SHL; 2726 vexop = OPC_SHLX; 2727 goto gen_shift_maybe_vex; 2728 OP_32_64(shr): 2729 c = SHIFT_SHR; 2730 vexop = OPC_SHRX; 2731 goto gen_shift_maybe_vex; 2732 OP_32_64(sar): 2733 c = SHIFT_SAR; 2734 vexop = OPC_SARX; 2735 goto gen_shift_maybe_vex; 2736 OP_32_64(rotl): 2737 c = SHIFT_ROL; 2738 goto gen_shift; 2739 OP_32_64(rotr): 2740 c = SHIFT_ROR; 2741 goto gen_shift; 2742 gen_shift_maybe_vex: 2743 if (have_bmi2) { 2744 if (!const_a2) { 2745 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2746 break; 2747 } 2748 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2749 } 2750 /* FALLTHRU */ 2751 gen_shift: 2752 if (const_a2) { 2753 tcg_out_shifti(s, c + rexw, a0, a2); 2754 } else { 2755 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2756 } 2757 break; 2758 2759 OP_32_64(ctz): 2760 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2761 break; 2762 OP_32_64(clz): 2763 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2764 break; 2765 OP_32_64(ctpop): 2766 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2767 break; 2768 2769 OP_32_64(brcond): 2770 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 2771 arg_label(args[3]), 0); 2772 break; 2773 OP_32_64(setcond): 2774 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false); 2775 break; 2776 OP_32_64(negsetcond): 2777 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true); 2778 break; 2779 OP_32_64(movcond): 2780 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 2781 break; 2782 2783 OP_32_64(bswap16): 2784 if (a2 & TCG_BSWAP_OS) { 2785 /* Output must be sign-extended. */ 2786 if (rexw) { 2787 tcg_out_bswap64(s, a0); 2788 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2789 } else { 2790 tcg_out_bswap32(s, a0); 2791 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2792 } 2793 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2794 /* Output must be zero-extended, but input isn't. */ 2795 tcg_out_bswap32(s, a0); 2796 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2797 } else { 2798 tcg_out_rolw_8(s, a0); 2799 } 2800 break; 2801 OP_32_64(bswap32): 2802 tcg_out_bswap32(s, a0); 2803 if (rexw && (a2 & TCG_BSWAP_OS)) { 2804 tcg_out_ext32s(s, a0, a0); 2805 } 2806 break; 2807 2808 OP_32_64(neg): 2809 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2810 break; 2811 OP_32_64(not): 2812 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2813 break; 2814 2815 case INDEX_op_qemu_ld_a64_i32: 2816 if (TCG_TARGET_REG_BITS == 32) { 2817 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2818 break; 2819 } 2820 /* fall through */ 2821 case INDEX_op_qemu_ld_a32_i32: 2822 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2823 break; 2824 case INDEX_op_qemu_ld_a32_i64: 2825 if (TCG_TARGET_REG_BITS == 64) { 2826 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2827 } else { 2828 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2829 } 2830 break; 2831 case INDEX_op_qemu_ld_a64_i64: 2832 if (TCG_TARGET_REG_BITS == 64) { 2833 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2834 } else { 2835 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2836 } 2837 break; 2838 case INDEX_op_qemu_ld_a32_i128: 2839 case INDEX_op_qemu_ld_a64_i128: 2840 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2841 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2842 break; 2843 2844 case INDEX_op_qemu_st_a64_i32: 2845 case INDEX_op_qemu_st8_a64_i32: 2846 if (TCG_TARGET_REG_BITS == 32) { 2847 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2848 break; 2849 } 2850 /* fall through */ 2851 case INDEX_op_qemu_st_a32_i32: 2852 case INDEX_op_qemu_st8_a32_i32: 2853 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2854 break; 2855 case INDEX_op_qemu_st_a32_i64: 2856 if (TCG_TARGET_REG_BITS == 64) { 2857 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2858 } else { 2859 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2860 } 2861 break; 2862 case INDEX_op_qemu_st_a64_i64: 2863 if (TCG_TARGET_REG_BITS == 64) { 2864 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2865 } else { 2866 
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2867 } 2868 break; 2869 case INDEX_op_qemu_st_a32_i128: 2870 case INDEX_op_qemu_st_a64_i128: 2871 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2872 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2873 break; 2874 2875 OP_32_64(mulu2): 2876 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2877 break; 2878 OP_32_64(muls2): 2879 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2880 break; 2881 OP_32_64(add2): 2882 if (const_args[4]) { 2883 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2884 } else { 2885 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2886 } 2887 if (const_args[5]) { 2888 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2889 } else { 2890 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2891 } 2892 break; 2893 OP_32_64(sub2): 2894 if (const_args[4]) { 2895 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2896 } else { 2897 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2898 } 2899 if (const_args[5]) { 2900 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2901 } else { 2902 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2903 } 2904 break; 2905 2906#if TCG_TARGET_REG_BITS == 32 2907 case INDEX_op_brcond2_i32: 2908 tcg_out_brcond2(s, args, const_args, 0); 2909 break; 2910 case INDEX_op_setcond2_i32: 2911 tcg_out_setcond2(s, args, const_args); 2912 break; 2913#else /* TCG_TARGET_REG_BITS == 64 */ 2914 case INDEX_op_ld32s_i64: 2915 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2916 break; 2917 case INDEX_op_ld_i64: 2918 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2919 break; 2920 case INDEX_op_st_i64: 2921 if (const_args[0]) { 2922 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2923 tcg_out32(s, a0); 2924 } else { 2925 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2926 } 2927 break; 2928 2929 case INDEX_op_bswap64_i64: 2930 tcg_out_bswap64(s, a0); 2931 break; 2932 case INDEX_op_extrh_i64_i32: 2933 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2934 break; 2935#endif 2936 2937 OP_32_64(deposit): 2938 if (args[3] == 0 && args[4] == 8) { 2939 /* load bits 0..7 */ 2940 if (const_a2) { 2941 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 2942 0, a0, 0); 2943 tcg_out8(s, a2); 2944 } else { 2945 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2946 } 2947 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 2948 /* load bits 8..15 */ 2949 if (const_a2) { 2950 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 2951 tcg_out8(s, a2); 2952 } else { 2953 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2954 } 2955 } else if (args[3] == 0 && args[4] == 16) { 2956 /* load bits 0..15 */ 2957 if (const_a2) { 2958 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 2959 0, a0, 0); 2960 tcg_out16(s, a2); 2961 } else { 2962 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2963 } 2964 } else { 2965 g_assert_not_reached(); 2966 } 2967 break; 2968 2969 case INDEX_op_extract_i64: 2970 if (a2 + args[3] == 32) { 2971 /* This is a 32-bit zero-extending right shift. */ 2972 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2973 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2974 break; 2975 } 2976 /* FALLTHRU */ 2977 case INDEX_op_extract_i32: 2978 /* On the off-chance that we can use the high-byte registers. 2979 Otherwise we emit the same ext16 + shift pattern that we 2980 would have gotten from the normal tcg-op.c expansion. 
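           For example, extract dest,src,8,8 becomes a single movzbl from
           the high-byte alias (%ah/%ch/%dh/%bh) when src is one of
           EAX/ECX/EDX/EBX and dest needs no REX prefix.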
*/ 2981 tcg_debug_assert(a2 == 8 && args[3] == 8); 2982 if (a1 < 4 && a0 < 8) { 2983 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2984 } else { 2985 tcg_out_ext16u(s, a0, a1); 2986 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2987 } 2988 break; 2989 2990 case INDEX_op_sextract_i32: 2991 /* We don't implement sextract_i64, as we cannot sign-extend to 2992 64-bits without using the REX prefix that explicitly excludes 2993 access to the high-byte registers. */ 2994 tcg_debug_assert(a2 == 8 && args[3] == 8); 2995 if (a1 < 4 && a0 < 8) { 2996 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2997 } else { 2998 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 2999 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3000 } 3001 break; 3002 3003 OP_32_64(extract2): 3004 /* Note that SHRD outputs to the r/m operand. */ 3005 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3006 tcg_out8(s, args[3]); 3007 break; 3008 3009 case INDEX_op_mb: 3010 tcg_out_mb(s, a0); 3011 break; 3012 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 3013 case INDEX_op_mov_i64: 3014 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3015 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3016 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3017 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */ 3018 case INDEX_op_ext8s_i64: 3019 case INDEX_op_ext8u_i32: 3020 case INDEX_op_ext8u_i64: 3021 case INDEX_op_ext16s_i32: 3022 case INDEX_op_ext16s_i64: 3023 case INDEX_op_ext16u_i32: 3024 case INDEX_op_ext16u_i64: 3025 case INDEX_op_ext32s_i64: 3026 case INDEX_op_ext32u_i64: 3027 case INDEX_op_ext_i32_i64: 3028 case INDEX_op_extu_i32_i64: 3029 case INDEX_op_extrl_i64_i32: 3030 default: 3031 g_assert_not_reached(); 3032 } 3033 3034#undef OP_32_64 3035} 3036 3037static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3038 unsigned vecl, unsigned vece, 3039 const TCGArg args[TCG_MAX_OP_ARGS], 3040 const int const_args[TCG_MAX_OP_ARGS]) 3041{ 3042 static int const add_insn[4] = { 3043 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3044 }; 3045 static int const ssadd_insn[4] = { 3046 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3047 }; 3048 static int const usadd_insn[4] = { 3049 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3050 }; 3051 static int const sub_insn[4] = { 3052 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3053 }; 3054 static int const sssub_insn[4] = { 3055 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3056 }; 3057 static int const ussub_insn[4] = { 3058 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3059 }; 3060 static int const mul_insn[4] = { 3061 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3062 }; 3063 static int const shift_imm_insn[4] = { 3064 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3065 }; 3066 static int const cmpeq_insn[4] = { 3067 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3068 }; 3069 static int const cmpgt_insn[4] = { 3070 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3071 }; 3072 static int const punpckl_insn[4] = { 3073 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3074 }; 3075 static int const punpckh_insn[4] = { 3076 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3077 }; 3078 static int const packss_insn[4] = { 3079 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3080 }; 3081 static int const packus_insn[4] = { 3082 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3083 }; 3084 static int const smin_insn[4] = { 3085 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3086 }; 3087 static int const smax_insn[4] = { 3088 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 
3089 }; 3090 static int const umin_insn[4] = { 3091 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3092 }; 3093 static int const umax_insn[4] = { 3094 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3095 }; 3096 static int const rotlv_insn[4] = { 3097 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3098 }; 3099 static int const rotrv_insn[4] = { 3100 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3101 }; 3102 static int const shlv_insn[4] = { 3103 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3104 }; 3105 static int const shrv_insn[4] = { 3106 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3107 }; 3108 static int const sarv_insn[4] = { 3109 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3110 }; 3111 static int const shls_insn[4] = { 3112 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3113 }; 3114 static int const shrs_insn[4] = { 3115 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3116 }; 3117 static int const sars_insn[4] = { 3118 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3119 }; 3120 static int const vpshldi_insn[4] = { 3121 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3122 }; 3123 static int const vpshldv_insn[4] = { 3124 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3125 }; 3126 static int const vpshrdv_insn[4] = { 3127 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3128 }; 3129 static int const abs_insn[4] = { 3130 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3131 }; 3132 3133 TCGType type = vecl + TCG_TYPE_V64; 3134 int insn, sub; 3135 TCGArg a0, a1, a2, a3; 3136 3137 a0 = args[0]; 3138 a1 = args[1]; 3139 a2 = args[2]; 3140 3141 switch (opc) { 3142 case INDEX_op_add_vec: 3143 insn = add_insn[vece]; 3144 goto gen_simd; 3145 case INDEX_op_ssadd_vec: 3146 insn = ssadd_insn[vece]; 3147 goto gen_simd; 3148 case INDEX_op_usadd_vec: 3149 insn = usadd_insn[vece]; 3150 goto gen_simd; 3151 case INDEX_op_sub_vec: 3152 insn = sub_insn[vece]; 3153 goto gen_simd; 3154 case INDEX_op_sssub_vec: 3155 insn = sssub_insn[vece]; 3156 goto gen_simd; 3157 case INDEX_op_ussub_vec: 3158 insn = ussub_insn[vece]; 3159 goto gen_simd; 3160 case INDEX_op_mul_vec: 3161 insn = mul_insn[vece]; 3162 goto gen_simd; 3163 case INDEX_op_and_vec: 3164 insn = OPC_PAND; 3165 goto gen_simd; 3166 case INDEX_op_or_vec: 3167 insn = OPC_POR; 3168 goto gen_simd; 3169 case INDEX_op_xor_vec: 3170 insn = OPC_PXOR; 3171 goto gen_simd; 3172 case INDEX_op_smin_vec: 3173 insn = smin_insn[vece]; 3174 goto gen_simd; 3175 case INDEX_op_umin_vec: 3176 insn = umin_insn[vece]; 3177 goto gen_simd; 3178 case INDEX_op_smax_vec: 3179 insn = smax_insn[vece]; 3180 goto gen_simd; 3181 case INDEX_op_umax_vec: 3182 insn = umax_insn[vece]; 3183 goto gen_simd; 3184 case INDEX_op_shlv_vec: 3185 insn = shlv_insn[vece]; 3186 goto gen_simd; 3187 case INDEX_op_shrv_vec: 3188 insn = shrv_insn[vece]; 3189 goto gen_simd; 3190 case INDEX_op_sarv_vec: 3191 insn = sarv_insn[vece]; 3192 goto gen_simd; 3193 case INDEX_op_rotlv_vec: 3194 insn = rotlv_insn[vece]; 3195 goto gen_simd; 3196 case INDEX_op_rotrv_vec: 3197 insn = rotrv_insn[vece]; 3198 goto gen_simd; 3199 case INDEX_op_shls_vec: 3200 insn = shls_insn[vece]; 3201 goto gen_simd; 3202 case INDEX_op_shrs_vec: 3203 insn = shrs_insn[vece]; 3204 goto gen_simd; 3205 case INDEX_op_sars_vec: 3206 insn = sars_insn[vece]; 3207 goto gen_simd; 3208 case INDEX_op_x86_punpckl_vec: 3209 insn = punpckl_insn[vece]; 3210 goto gen_simd; 3211 case INDEX_op_x86_punpckh_vec: 3212 insn = punpckh_insn[vece]; 3213 goto gen_simd; 3214 case INDEX_op_x86_packss_vec: 3215 insn = packss_insn[vece]; 3216 goto gen_simd; 3217 case 
INDEX_op_x86_packus_vec: 3218 insn = packus_insn[vece]; 3219 goto gen_simd; 3220 case INDEX_op_x86_vpshldv_vec: 3221 insn = vpshldv_insn[vece]; 3222 a1 = a2; 3223 a2 = args[3]; 3224 goto gen_simd; 3225 case INDEX_op_x86_vpshrdv_vec: 3226 insn = vpshrdv_insn[vece]; 3227 a1 = a2; 3228 a2 = args[3]; 3229 goto gen_simd; 3230#if TCG_TARGET_REG_BITS == 32 3231 case INDEX_op_dup2_vec: 3232 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3233 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3234 /* Then replicate the 64-bit elements across the rest of the vector. */ 3235 if (type != TCG_TYPE_V64) { 3236 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3237 } 3238 break; 3239#endif 3240 case INDEX_op_abs_vec: 3241 insn = abs_insn[vece]; 3242 a2 = a1; 3243 a1 = 0; 3244 goto gen_simd; 3245 gen_simd: 3246 tcg_debug_assert(insn != OPC_UD2); 3247 if (type == TCG_TYPE_V256) { 3248 insn |= P_VEXL; 3249 } 3250 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3251 break; 3252 3253 case INDEX_op_cmp_vec: 3254 sub = args[3]; 3255 if (sub == TCG_COND_EQ) { 3256 insn = cmpeq_insn[vece]; 3257 } else if (sub == TCG_COND_GT) { 3258 insn = cmpgt_insn[vece]; 3259 } else { 3260 g_assert_not_reached(); 3261 } 3262 goto gen_simd; 3263 3264 case INDEX_op_andc_vec: 3265 insn = OPC_PANDN; 3266 if (type == TCG_TYPE_V256) { 3267 insn |= P_VEXL; 3268 } 3269 tcg_out_vex_modrm(s, insn, a0, a2, a1); 3270 break; 3271 3272 case INDEX_op_shli_vec: 3273 insn = shift_imm_insn[vece]; 3274 sub = 6; 3275 goto gen_shift; 3276 case INDEX_op_shri_vec: 3277 insn = shift_imm_insn[vece]; 3278 sub = 2; 3279 goto gen_shift; 3280 case INDEX_op_sari_vec: 3281 if (vece == MO_64) { 3282 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3283 } else { 3284 insn = shift_imm_insn[vece]; 3285 } 3286 sub = 4; 3287 goto gen_shift; 3288 case INDEX_op_rotli_vec: 3289 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3290 if (vece == MO_64) { 3291 insn |= P_VEXW; 3292 } 3293 sub = 1; 3294 goto gen_shift; 3295 gen_shift: 3296 tcg_debug_assert(vece != MO_8); 3297 if (type == TCG_TYPE_V256) { 3298 insn |= P_VEXL; 3299 } 3300 tcg_out_vex_modrm(s, insn, sub, a0, a1); 3301 tcg_out8(s, a2); 3302 break; 3303 3304 case INDEX_op_ld_vec: 3305 tcg_out_ld(s, type, a0, a1, a2); 3306 break; 3307 case INDEX_op_st_vec: 3308 tcg_out_st(s, type, a0, a1, a2); 3309 break; 3310 case INDEX_op_dupm_vec: 3311 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3312 break; 3313 3314 case INDEX_op_x86_shufps_vec: 3315 insn = OPC_SHUFPS; 3316 sub = args[3]; 3317 goto gen_simd_imm8; 3318 case INDEX_op_x86_blend_vec: 3319 if (vece == MO_16) { 3320 insn = OPC_PBLENDW; 3321 } else if (vece == MO_32) { 3322 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 3323 } else { 3324 g_assert_not_reached(); 3325 } 3326 sub = args[3]; 3327 goto gen_simd_imm8; 3328 case INDEX_op_x86_vperm2i128_vec: 3329 insn = OPC_VPERM2I128; 3330 sub = args[3]; 3331 goto gen_simd_imm8; 3332 case INDEX_op_x86_vpshldi_vec: 3333 insn = vpshldi_insn[vece]; 3334 sub = args[3]; 3335 goto gen_simd_imm8; 3336 3337 case INDEX_op_not_vec: 3338 insn = OPC_VPTERNLOGQ; 3339 a2 = a1; 3340 sub = 0x33; /* !B */ 3341 goto gen_simd_imm8; 3342 case INDEX_op_nor_vec: 3343 insn = OPC_VPTERNLOGQ; 3344 sub = 0x11; /* norCB */ 3345 goto gen_simd_imm8; 3346 case INDEX_op_nand_vec: 3347 insn = OPC_VPTERNLOGQ; 3348 sub = 0x77; /* nandCB */ 3349 goto gen_simd_imm8; 3350 case INDEX_op_eqv_vec: 3351 insn = OPC_VPTERNLOGQ; 3352 sub = 0x99; /* xnorCB */ 3353 goto gen_simd_imm8; 3354 case INDEX_op_orc_vec: 3355 insn = OPC_VPTERNLOGQ; 3356 sub = 0xdd; /* orB!C */ 3357 goto gen_simd_imm8; 3358 3359 case INDEX_op_bitsel_vec: 3360 insn = OPC_VPTERNLOGQ; 3361 a3 = args[3]; 3362 if (a0 == a1) { 3363 a1 = a2; 3364 a2 = a3; 3365 sub = 0xca; /* A?B:C */ 3366 } else if (a0 == a2) { 3367 a2 = a3; 3368 sub = 0xe2; /* B?A:C */ 3369 } else { 3370 tcg_out_mov(s, type, a0, a3); 3371 sub = 0xb8; /* B?C:A */ 3372 } 3373 goto gen_simd_imm8; 3374 3375 gen_simd_imm8: 3376 tcg_debug_assert(insn != OPC_UD2); 3377 if (type == TCG_TYPE_V256) { 3378 insn |= P_VEXL; 3379 } 3380 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3381 tcg_out8(s, sub); 3382 break; 3383 3384 case INDEX_op_x86_vpblendvb_vec: 3385 insn = OPC_VPBLENDVB; 3386 if (type == TCG_TYPE_V256) { 3387 insn |= P_VEXL; 3388 } 3389 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3390 tcg_out8(s, args[3] << 4); 3391 break; 3392 3393 case INDEX_op_x86_psrldq_vec: 3394 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3395 tcg_out8(s, a2); 3396 break; 3397 3398 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3399 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 3400 default: 3401 g_assert_not_reached(); 3402 } 3403} 3404 3405static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3406{ 3407 switch (op) { 3408 case INDEX_op_goto_ptr: 3409 return C_O0_I1(r); 3410 3411 case INDEX_op_ld8u_i32: 3412 case INDEX_op_ld8u_i64: 3413 case INDEX_op_ld8s_i32: 3414 case INDEX_op_ld8s_i64: 3415 case INDEX_op_ld16u_i32: 3416 case INDEX_op_ld16u_i64: 3417 case INDEX_op_ld16s_i32: 3418 case INDEX_op_ld16s_i64: 3419 case INDEX_op_ld_i32: 3420 case INDEX_op_ld32u_i64: 3421 case INDEX_op_ld32s_i64: 3422 case INDEX_op_ld_i64: 3423 return C_O1_I1(r, r); 3424 3425 case INDEX_op_st8_i32: 3426 case INDEX_op_st8_i64: 3427 return C_O0_I2(qi, r); 3428 3429 case INDEX_op_st16_i32: 3430 case INDEX_op_st16_i64: 3431 case INDEX_op_st_i32: 3432 case INDEX_op_st32_i64: 3433 return C_O0_I2(ri, r); 3434 3435 case INDEX_op_st_i64: 3436 return C_O0_I2(re, r); 3437 3438 case INDEX_op_add_i32: 3439 case INDEX_op_add_i64: 3440 return C_O1_I2(r, r, re); 3441 3442 case INDEX_op_sub_i32: 3443 case INDEX_op_sub_i64: 3444 case INDEX_op_mul_i32: 3445 case INDEX_op_mul_i64: 3446 case INDEX_op_or_i32: 3447 case INDEX_op_or_i64: 3448 case INDEX_op_xor_i32: 3449 case INDEX_op_xor_i64: 3450 return C_O1_I2(r, 0, re); 3451 3452 case INDEX_op_and_i32: 3453 case INDEX_op_and_i64: 3454 return C_O1_I2(r, 0, reZ); 3455 3456 case INDEX_op_andc_i32: 3457 case INDEX_op_andc_i64: 3458 return C_O1_I2(r, r, rI); 3459 3460 case INDEX_op_shl_i32: 3461 case INDEX_op_shl_i64: 3462 case INDEX_op_shr_i32: 3463 case INDEX_op_shr_i64: 3464 case INDEX_op_sar_i32: 3465 case INDEX_op_sar_i64: 3466 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3467 3468 case INDEX_op_rotl_i32: 3469 case INDEX_op_rotl_i64: 3470 case INDEX_op_rotr_i32: 3471 case INDEX_op_rotr_i64: 3472 return C_O1_I2(r, 0, ci); 3473 3474 case INDEX_op_brcond_i32: 3475 case INDEX_op_brcond_i64: 3476 return C_O0_I2(r, reT); 3477 3478 case INDEX_op_bswap16_i32: 3479 case INDEX_op_bswap16_i64: 3480 case INDEX_op_bswap32_i32: 3481 case INDEX_op_bswap32_i64: 3482 case INDEX_op_bswap64_i64: 3483 case INDEX_op_neg_i32: 3484 case INDEX_op_neg_i64: 3485 case INDEX_op_not_i32: 3486 case INDEX_op_not_i64: 3487 case INDEX_op_extrh_i64_i32: 3488 return C_O1_I1(r, 0); 3489 3490 case INDEX_op_ext8s_i32: 3491 case INDEX_op_ext8s_i64: 3492 case INDEX_op_ext8u_i32: 3493 case INDEX_op_ext8u_i64: 3494 return C_O1_I1(r, q); 3495 3496 case INDEX_op_ext16s_i32: 3497 case INDEX_op_ext16s_i64: 3498 case INDEX_op_ext16u_i32: 3499 case INDEX_op_ext16u_i64: 3500 case INDEX_op_ext32s_i64: 3501 case INDEX_op_ext32u_i64: 3502 case INDEX_op_ext_i32_i64: 3503 case INDEX_op_extu_i32_i64: 3504 case INDEX_op_extrl_i64_i32: 3505 case INDEX_op_extract_i32: 3506 case INDEX_op_extract_i64: 3507 case INDEX_op_sextract_i32: 3508 case INDEX_op_ctpop_i32: 3509 case INDEX_op_ctpop_i64: 3510 return C_O1_I1(r, r); 3511 3512 case INDEX_op_extract2_i32: 3513 case INDEX_op_extract2_i64: 3514 return C_O1_I2(r, 0, r); 3515 3516 case INDEX_op_deposit_i32: 3517 case INDEX_op_deposit_i64: 3518 return C_O1_I2(q, 0, qi); 3519 3520 case INDEX_op_setcond_i32: 3521 case INDEX_op_setcond_i64: 3522 case INDEX_op_negsetcond_i32: 3523 case INDEX_op_negsetcond_i64: 3524 return C_O1_I2(q, r, reT); 3525 3526 case INDEX_op_movcond_i32: 3527 case INDEX_op_movcond_i64: 3528 return C_O1_I4(r, r, reT, r, 0); 3529 3530 case INDEX_op_div2_i32: 3531 case INDEX_op_div2_i64: 3532 case INDEX_op_divu2_i32: 3533 case INDEX_op_divu2_i64: 3534 return C_O2_I3(a, d, 0, 1, r); 3535 3536 case INDEX_op_mulu2_i32: 3537 case 
INDEX_op_mulu2_i64: 3538 case INDEX_op_muls2_i32: 3539 case INDEX_op_muls2_i64: 3540 return C_O2_I2(a, d, a, r); 3541 3542 case INDEX_op_add2_i32: 3543 case INDEX_op_add2_i64: 3544 case INDEX_op_sub2_i32: 3545 case INDEX_op_sub2_i64: 3546 return C_N1_O1_I4(r, r, 0, 1, re, re); 3547 3548 case INDEX_op_ctz_i32: 3549 case INDEX_op_ctz_i64: 3550 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3551 3552 case INDEX_op_clz_i32: 3553 case INDEX_op_clz_i64: 3554 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3555 3556 case INDEX_op_qemu_ld_a32_i32: 3557 return C_O1_I1(r, L); 3558 case INDEX_op_qemu_ld_a64_i32: 3559 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L); 3560 3561 case INDEX_op_qemu_st_a32_i32: 3562 return C_O0_I2(L, L); 3563 case INDEX_op_qemu_st_a64_i32: 3564 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3565 case INDEX_op_qemu_st8_a32_i32: 3566 return C_O0_I2(s, L); 3567 case INDEX_op_qemu_st8_a64_i32: 3568 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); 3569 3570 case INDEX_op_qemu_ld_a32_i64: 3571 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3572 case INDEX_op_qemu_ld_a64_i64: 3573 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); 3574 3575 case INDEX_op_qemu_st_a32_i64: 3576 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3577 case INDEX_op_qemu_st_a64_i64: 3578 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); 3579 3580 case INDEX_op_qemu_ld_a32_i128: 3581 case INDEX_op_qemu_ld_a64_i128: 3582 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3583 return C_O2_I1(r, r, L); 3584 case INDEX_op_qemu_st_a32_i128: 3585 case INDEX_op_qemu_st_a64_i128: 3586 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3587 return C_O0_I3(L, L, L); 3588 3589 case INDEX_op_brcond2_i32: 3590 return C_O0_I4(r, r, ri, ri); 3591 3592 case INDEX_op_setcond2_i32: 3593 return C_O1_I4(r, r, r, ri, ri); 3594 3595 case INDEX_op_ld_vec: 3596 case INDEX_op_dupm_vec: 3597 return C_O1_I1(x, r); 3598 3599 case INDEX_op_st_vec: 3600 return C_O0_I2(x, r); 3601 3602 case INDEX_op_add_vec: 3603 case INDEX_op_sub_vec: 3604 case INDEX_op_mul_vec: 3605 case INDEX_op_and_vec: 3606 case INDEX_op_or_vec: 3607 case INDEX_op_xor_vec: 3608 case INDEX_op_andc_vec: 3609 case INDEX_op_orc_vec: 3610 case INDEX_op_nand_vec: 3611 case INDEX_op_nor_vec: 3612 case INDEX_op_eqv_vec: 3613 case INDEX_op_ssadd_vec: 3614 case INDEX_op_usadd_vec: 3615 case INDEX_op_sssub_vec: 3616 case INDEX_op_ussub_vec: 3617 case INDEX_op_smin_vec: 3618 case INDEX_op_umin_vec: 3619 case INDEX_op_smax_vec: 3620 case INDEX_op_umax_vec: 3621 case INDEX_op_shlv_vec: 3622 case INDEX_op_shrv_vec: 3623 case INDEX_op_sarv_vec: 3624 case INDEX_op_rotlv_vec: 3625 case INDEX_op_rotrv_vec: 3626 case INDEX_op_shls_vec: 3627 case INDEX_op_shrs_vec: 3628 case INDEX_op_sars_vec: 3629 case INDEX_op_cmp_vec: 3630 case INDEX_op_x86_shufps_vec: 3631 case INDEX_op_x86_blend_vec: 3632 case INDEX_op_x86_packss_vec: 3633 case INDEX_op_x86_packus_vec: 3634 case INDEX_op_x86_vperm2i128_vec: 3635 case INDEX_op_x86_punpckl_vec: 3636 case INDEX_op_x86_punpckh_vec: 3637 case INDEX_op_x86_vpshldi_vec: 3638#if TCG_TARGET_REG_BITS == 32 3639 case INDEX_op_dup2_vec: 3640#endif 3641 return C_O1_I2(x, x, x); 3642 3643 case INDEX_op_abs_vec: 3644 case INDEX_op_dup_vec: 3645 case INDEX_op_not_vec: 3646 case INDEX_op_shli_vec: 3647 case INDEX_op_shri_vec: 3648 case INDEX_op_sari_vec: 3649 case INDEX_op_rotli_vec: 3650 case 
INDEX_op_x86_psrldq_vec: 3651 return C_O1_I1(x, x); 3652 3653 case INDEX_op_x86_vpshldv_vec: 3654 case INDEX_op_x86_vpshrdv_vec: 3655 return C_O1_I3(x, 0, x, x); 3656 3657 case INDEX_op_bitsel_vec: 3658 case INDEX_op_x86_vpblendvb_vec: 3659 return C_O1_I3(x, x, x, x); 3660 3661 default: 3662 g_assert_not_reached(); 3663 } 3664} 3665 3666int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3667{ 3668 switch (opc) { 3669 case INDEX_op_add_vec: 3670 case INDEX_op_sub_vec: 3671 case INDEX_op_and_vec: 3672 case INDEX_op_or_vec: 3673 case INDEX_op_xor_vec: 3674 case INDEX_op_andc_vec: 3675 case INDEX_op_orc_vec: 3676 case INDEX_op_nand_vec: 3677 case INDEX_op_nor_vec: 3678 case INDEX_op_eqv_vec: 3679 case INDEX_op_not_vec: 3680 case INDEX_op_bitsel_vec: 3681 return 1; 3682 case INDEX_op_cmp_vec: 3683 case INDEX_op_cmpsel_vec: 3684 return -1; 3685 3686 case INDEX_op_rotli_vec: 3687 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3688 3689 case INDEX_op_shli_vec: 3690 case INDEX_op_shri_vec: 3691 /* We must expand the operation for MO_8. */ 3692 return vece == MO_8 ? -1 : 1; 3693 3694 case INDEX_op_sari_vec: 3695 switch (vece) { 3696 case MO_8: 3697 return -1; 3698 case MO_16: 3699 case MO_32: 3700 return 1; 3701 case MO_64: 3702 if (have_avx512vl) { 3703 return 1; 3704 } 3705 /* 3706 * We can emulate this for MO_64, but it does not pay off 3707 * unless we're producing at least 4 values. 3708 */ 3709 return type >= TCG_TYPE_V256 ? -1 : 0; 3710 } 3711 return 0; 3712 3713 case INDEX_op_shls_vec: 3714 case INDEX_op_shrs_vec: 3715 return vece >= MO_16; 3716 case INDEX_op_sars_vec: 3717 switch (vece) { 3718 case MO_16: 3719 case MO_32: 3720 return 1; 3721 case MO_64: 3722 return have_avx512vl; 3723 } 3724 return 0; 3725 case INDEX_op_rotls_vec: 3726 return vece >= MO_16 ? -1 : 0; 3727 3728 case INDEX_op_shlv_vec: 3729 case INDEX_op_shrv_vec: 3730 switch (vece) { 3731 case MO_16: 3732 return have_avx512bw; 3733 case MO_32: 3734 case MO_64: 3735 return have_avx2; 3736 } 3737 return 0; 3738 case INDEX_op_sarv_vec: 3739 switch (vece) { 3740 case MO_16: 3741 return have_avx512bw; 3742 case MO_32: 3743 return have_avx2; 3744 case MO_64: 3745 return have_avx512vl; 3746 } 3747 return 0; 3748 case INDEX_op_rotlv_vec: 3749 case INDEX_op_rotrv_vec: 3750 switch (vece) { 3751 case MO_16: 3752 return have_avx512vbmi2 ? -1 : 0; 3753 case MO_32: 3754 case MO_64: 3755 return have_avx512vl ? 1 : have_avx2 ? 
-1 : 0; 3756 } 3757 return 0; 3758 3759 case INDEX_op_mul_vec: 3760 switch (vece) { 3761 case MO_8: 3762 return -1; 3763 case MO_64: 3764 return have_avx512dq; 3765 } 3766 return 1; 3767 3768 case INDEX_op_ssadd_vec: 3769 case INDEX_op_usadd_vec: 3770 case INDEX_op_sssub_vec: 3771 case INDEX_op_ussub_vec: 3772 return vece <= MO_16; 3773 case INDEX_op_smin_vec: 3774 case INDEX_op_smax_vec: 3775 case INDEX_op_umin_vec: 3776 case INDEX_op_umax_vec: 3777 case INDEX_op_abs_vec: 3778 return vece <= MO_32 || have_avx512vl; 3779 3780 default: 3781 return 0; 3782 } 3783} 3784 3785static void expand_vec_shi(TCGType type, unsigned vece, bool right, 3786 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3787{ 3788 uint8_t mask; 3789 3790 tcg_debug_assert(vece == MO_8); 3791 if (right) { 3792 mask = 0xff >> imm; 3793 tcg_gen_shri_vec(MO_16, v0, v1, imm); 3794 } else { 3795 mask = 0xff << imm; 3796 tcg_gen_shli_vec(MO_16, v0, v1, imm); 3797 } 3798 tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask)); 3799} 3800 3801static void expand_vec_sari(TCGType type, unsigned vece, 3802 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3803{ 3804 TCGv_vec t1, t2; 3805 3806 switch (vece) { 3807 case MO_8: 3808 /* Unpack to 16-bit, shift, and repack. */ 3809 t1 = tcg_temp_new_vec(type); 3810 t2 = tcg_temp_new_vec(type); 3811 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3812 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3813 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3814 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3815 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3816 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3817 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3818 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3819 tcg_temp_free_vec(t1); 3820 tcg_temp_free_vec(t2); 3821 break; 3822 3823 case MO_64: 3824 t1 = tcg_temp_new_vec(type); 3825 if (imm <= 32) { 3826 /* 3827 * We can emulate a small sign extend by performing an arithmetic 3828 * 32-bit shift and overwriting the high half of a 64-bit logical 3829 * shift. Note that the ISA says shift of 32 is valid, but TCG 3830 * does not, so we have to bound the smaller shift -- we get the 3831 * same result in the high half either way. 3832 */ 3833 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3834 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3835 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3836 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3837 tcgv_vec_arg(t1), 0xaa); 3838 } else { 3839 /* Otherwise we will need to use a compare vs 0 to produce 3840 * the sign-extend, shift and merge. 
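             * Roughly: t1 = (v1 < 0) ? -1 : 0;
             *          v0 = (v1 >> imm, logical) | (t1 << (64 - imm)).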
3841 */ 3842 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 3843 tcg_constant_vec(type, MO_64, 0), v1); 3844 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3845 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3846 tcg_gen_or_vec(MO_64, v0, v0, t1); 3847 } 3848 tcg_temp_free_vec(t1); 3849 break; 3850 3851 default: 3852 g_assert_not_reached(); 3853 } 3854} 3855 3856static void expand_vec_rotli(TCGType type, unsigned vece, 3857 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3858{ 3859 TCGv_vec t; 3860 3861 if (vece != MO_8 && have_avx512vbmi2) { 3862 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 3863 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 3864 return; 3865 } 3866 3867 t = tcg_temp_new_vec(type); 3868 tcg_gen_shli_vec(vece, t, v1, imm); 3869 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3870 tcg_gen_or_vec(vece, v0, v0, t); 3871 tcg_temp_free_vec(t); 3872} 3873 3874static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3875 TCGv_vec v1, TCGv_vec sh, bool right) 3876{ 3877 TCGv_vec t; 3878 3879 if (have_avx512vbmi2) { 3880 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 3881 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 3882 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 3883 return; 3884 } 3885 3886 t = tcg_temp_new_vec(type); 3887 tcg_gen_dupi_vec(vece, t, 8 << vece); 3888 tcg_gen_sub_vec(vece, t, t, sh); 3889 if (right) { 3890 tcg_gen_shlv_vec(vece, t, v1, t); 3891 tcg_gen_shrv_vec(vece, v0, v1, sh); 3892 } else { 3893 tcg_gen_shrv_vec(vece, t, v1, t); 3894 tcg_gen_shlv_vec(vece, v0, v1, sh); 3895 } 3896 tcg_gen_or_vec(vece, v0, v0, t); 3897 tcg_temp_free_vec(t); 3898} 3899 3900static void expand_vec_rotls(TCGType type, unsigned vece, 3901 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3902{ 3903 TCGv_vec t = tcg_temp_new_vec(type); 3904 3905 tcg_debug_assert(vece != MO_8); 3906 3907 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 3908 tcg_gen_dup_i32_vec(vece, t, lsh); 3909 if (vece >= MO_32) { 3910 tcg_gen_rotlv_vec(vece, v0, v1, t); 3911 } else { 3912 expand_vec_rotv(type, vece, v0, v1, t, false); 3913 } 3914 } else { 3915 TCGv_i32 rsh = tcg_temp_new_i32(); 3916 3917 tcg_gen_neg_i32(rsh, lsh); 3918 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3919 tcg_gen_shls_vec(vece, t, v1, lsh); 3920 tcg_gen_shrs_vec(vece, v0, v1, rsh); 3921 tcg_gen_or_vec(vece, v0, v0, t); 3922 3923 tcg_temp_free_i32(rsh); 3924 } 3925 3926 tcg_temp_free_vec(t); 3927} 3928 3929static void expand_vec_mul(TCGType type, unsigned vece, 3930 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3931{ 3932 TCGv_vec t1, t2, t3, t4, zero; 3933 3934 tcg_debug_assert(vece == MO_8); 3935 3936 /* 3937 * Unpack v1 bytes to words, 0 | x. 3938 * Unpack v2 bytes to words, y | 0. 3939 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3940 * Shift logical right by 8 bits to clear the high 8 bytes before 3941 * using an unsigned saturated pack. 3942 * 3943 * The difference between the V64, V128 and V256 cases is merely how 3944 * we distribute the expansion between temporaries. 
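     * (For V64 the operands fit in the low half of one 128-bit temporary,
     * so only the unpack-low path is needed; V128 and V256 additionally
     * process the unpack-high half.)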
3945 */ 3946 switch (type) { 3947 case TCG_TYPE_V64: 3948 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 3949 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 3950 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3951 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3952 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3953 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3954 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3955 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3956 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3957 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 3958 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 3959 tcg_temp_free_vec(t1); 3960 tcg_temp_free_vec(t2); 3961 break; 3962 3963 case TCG_TYPE_V128: 3964 case TCG_TYPE_V256: 3965 t1 = tcg_temp_new_vec(type); 3966 t2 = tcg_temp_new_vec(type); 3967 t3 = tcg_temp_new_vec(type); 3968 t4 = tcg_temp_new_vec(type); 3969 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3970 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3971 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3972 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3973 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3974 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3975 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3976 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3977 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3978 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3979 tcg_gen_mul_vec(MO_16, t3, t3, t4); 3980 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3981 tcg_gen_shri_vec(MO_16, t3, t3, 8); 3982 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3983 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 3984 tcg_temp_free_vec(t1); 3985 tcg_temp_free_vec(t2); 3986 tcg_temp_free_vec(t3); 3987 tcg_temp_free_vec(t4); 3988 break; 3989 3990 default: 3991 g_assert_not_reached(); 3992 } 3993} 3994 3995static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, 3996 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3997{ 3998 enum { 3999 NEED_INV = 1, 4000 NEED_SWAP = 2, 4001 NEED_BIAS = 4, 4002 NEED_UMIN = 8, 4003 NEED_UMAX = 16, 4004 }; 4005 TCGv_vec t1, t2, t3; 4006 uint8_t fixup; 4007 4008 switch (cond) { 4009 case TCG_COND_EQ: 4010 case TCG_COND_GT: 4011 fixup = 0; 4012 break; 4013 case TCG_COND_NE: 4014 case TCG_COND_LE: 4015 fixup = NEED_INV; 4016 break; 4017 case TCG_COND_LT: 4018 fixup = NEED_SWAP; 4019 break; 4020 case TCG_COND_GE: 4021 fixup = NEED_SWAP | NEED_INV; 4022 break; 4023 case TCG_COND_LEU: 4024 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 4025 fixup = NEED_UMIN; 4026 } else { 4027 fixup = NEED_BIAS | NEED_INV; 4028 } 4029 break; 4030 case TCG_COND_GTU: 4031 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 4032 fixup = NEED_UMIN | NEED_INV; 4033 } else { 4034 fixup = NEED_BIAS; 4035 } 4036 break; 4037 case TCG_COND_GEU: 4038 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 4039 fixup = NEED_UMAX; 4040 } else { 4041 fixup = NEED_BIAS | NEED_SWAP | NEED_INV; 4042 } 4043 break; 4044 case TCG_COND_LTU: 4045 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 4046 fixup = NEED_UMAX | NEED_INV; 4047 } else { 4048 fixup = NEED_BIAS | NEED_SWAP; 4049 } 4050 break; 4051 default: 4052 g_assert_not_reached(); 4053 } 4054 4055 if (fixup & NEED_INV) { 4056 cond = tcg_invert_cond(cond); 4057 } 4058 if (fixup & NEED_SWAP) { 4059 t1 = v1, v1 = v2, v2 = t1; 4060 cond = tcg_swap_cond(cond); 4061 } 4062 4063 t1 = t2 = NULL; 4064 if (fixup & (NEED_UMIN | NEED_UMAX)) { 4065 t1 = 
    t1 = t2 = NULL;
    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        t1 = tcg_temp_new_vec(type);
        if (fixup & NEED_UMIN) {
            tcg_gen_umin_vec(vece, t1, v1, v2);
        } else {
            tcg_gen_umax_vec(vece, t1, v1, v2);
        }
        v2 = t1;
        cond = TCG_COND_EQ;
    } else if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    return fixup & NEED_INV;
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
        tcg_gen_not_vec(vece, v0, v0);
    }
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
                              TCGv_vec c1, TCGv_vec c2,
                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
        /* Invert the sense of the compare by swapping arguments.  */
        TCGv_vec x;
        x = v3, v3 = v4, v4 = x;
    }
    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
              tcgv_vec_arg(v3), tcgv_vec_arg(t));
    tcg_temp_free_vec(t);
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2, v3, v4;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    case INDEX_op_cmpsel_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
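            /*
             * (Like %rsp, %r12 always needs a SIB byte when encoded as a
             * base register, so it is the cheapest general register to
             * give up; the qemu_ld/st paths then use it as the index of
             * every guest memory access.)
             */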
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
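    /*
     * The explicit list below reserves the whole xmm6-xmm15 range.  Since
     * the TCG_REG_XMM0..TCG_REG_XMM15 enumerators are consecutive, an
     * equivalent loop would be (a sketch, not the form used here):
     *
     *     for (int r = TCG_REG_XMM6; r <= TCG_REG_XMM15; r++) {
     *         tcg_regset_set_reg(s->reserved_regs, r);
     *     }
     */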
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF.  */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif