/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated. */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept. */
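/*
 * A short summary of how tcg_target_const_match() below interprets
 * these bits: S32 accepts constants representable as a sign-extended
 * 32-bit immediate, U32 those representable as a zero-extended 32-bit
 * immediate, I32 constants whose bitwise complement fits in 32 bits,
 * and WSZ only the operation width itself (32 or 64), as required by
 * the clz/ctz expansions.
 */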
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS      0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS   0x0000ffffu
# define ALL_VECTOR_REGS    0xffff0000u
# define ALL_BYTEL_REGS     ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS   0x000000ffu
# define ALL_VECTOR_REGS    0x00ff0000u
# define ALL_BYTEL_REGS     ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif

/* For 64-bit, we always know that CMOV is available. */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov  true
#else
# define have_cmov  (cpuinfo & CPUINFO_CMOV)
#endif
#define have_bmi2   (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt  (cpuinfo & CPUINFO_LZCNT)

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT       0x100       /* 0x0f opcode prefix */
#define P_EXT38     0x200       /* 0x0f 0x38 opcode prefix */
#define P_DATA16    0x400       /* 0x66 opcode prefix */
#define P_VEXW      0x1000      /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW     P_VEXW      /* Set REX.W = 1; match VEXW */
# define P_REXB_R   0x2000      /* REG field as byte register */
# define P_REXB_RM  0x4000      /* R/M field as byte register */
# define P_GS       0x8000      /* gs segment override */
#else
# define P_REXW     0
# define P_REXB_R   0
# define P_REXB_RM  0
# define P_GS       0
#endif
#define P_EXT3A     0x10000     /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3    0x20000     /* 0xf3 opcode prefix */
#define P_SIMDF2    0x40000     /* 0xf2 opcode prefix */
#define P_VEXL      0x80000     /* Set VEX.L = 1 */
#define P_EVEX      0x100000    /* Requires EVEX encoding */
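/*
 * Each OPC_* value below packs the instruction's final opcode byte in
 * the low 8 bits and ORs in the P_* flags above to describe mandatory
 * 0x66/0xf2/0xf3 prefixes, 0x0f escape bytes, and REX/VEX/EVEX
 * attributes; tcg_out_opc() and tcg_out_vex_opc() decode these flags
 * when the instruction is emitted.
 */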
#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN (0xf2 | P_EXT38)
#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF (0xbc | P_EXT)
#define OPC_BSR (0xbd | P_EXT)
#define OPC_BSWAP (0xc8 | P_EXT)
#define OPC_CALL_Jz (0xe8)
#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32 (0x48)
#define OPC_IMUL_GvEv (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32 (0x40)
#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
#define OPC_JCC_short (0x70) /* ... plus condition code */
#define OPC_JMP_long (0xe9)
#define OPC_JMP_short (0xeb)
#define OPC_LEA (0x8d)
#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv (0x88) /* stores, more or less */
#define OPC_MOVL_EvGv (0x89) /* stores, more or less */
#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
#define OPC_MOVB_EvIz (0xc6)
#define OPC_MOVL_EvIz (0xc7)
#define OPC_MOVL_Iv (0xb8)
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
#define OPC_PAND (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32 (0x50)
#define OPC_PUSH_Iv (0x68)
#define OPC_PUSH_Ib (0x6a)
#define OPC_RET (0xc3)
#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1 (0xd1)
#define OPC_SHIFT_Ib (0xc1)
#define OPC_SHIFT_cl (0xd3)
#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS (0xc6 | P_EXT)
#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib (0xac | P_EXT)
#define OPC_TESTL (0x85)
#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2 (0x0b | P_EXT)
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv (0x87)

#define OPC_GRP3_Eb (0xf6)
#define OPC_GRP3_Ev (0xf7)
#define OPC_GRP5 (0xff)
#define OPC_GRP14 (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH. */
#define ARITH_ADD 0
#define ARITH_OR 1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3. */
#define EXT3_TESTi 0
#define EXT3_NOT 2
#define EXT3_NEG 3
#define EXT3_MUL 4
#define EXT3_IMUL 5
#define EXT3_DIV 6
#define EXT3_IDIV 7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5. */
#define EXT5_INC_Ev 0
#define EXT5_DEC_Ev 1
#define EXT5_CALLN_Ev 2
#define EXT5_JMPN_Ev 4

/* Condition codes to be added to OPC_JCC_{long,short}. */
#define JCC_JMP (-1)
#define JCC_JO 0x0
#define JCC_JNO 0x1
#define JCC_JB 0x2
#define JCC_JAE 0x3
#define JCC_JE 0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA 0x7
#define JCC_JS 0x8
#define JCC_JNS 0x9
#define JCC_JP 0xa
#define JCC_JNP 0xb
#define JCC_JL 0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG 0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation. */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output. */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them. */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix. */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix. */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}
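/*
 * EVEX prefixes are emitted below as a single 32-bit store: the 0x62
 * escape byte sits in the low byte of the initial constant and the
 * remaining payload fields are filled in with deposit32() before the
 * prefix is written with one tcg_out32().
 */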
static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                                /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                                /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                                /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);    /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);     /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction. */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding. */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing. */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable. */
            g_assert_not_reached();
        } else {
            /* Absolute address. */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing. */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form. */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format. */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format. */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index. */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift. */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}
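/*
 * Both pool helpers below emit a disp32 operand of zero as a placeholder;
 * the caller records the real value with new_pool_label() and the
 * displacement is patched via patch_reloc() when the constant pool is
 * emitted.
 */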
/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi. */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW. */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB. */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0. */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }
    /* Try a 7 byte pc-relative lea before the 10 byte movq. */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16. */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends. */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (dest != src) {
        tcg_out_ext32u(s, dest, src);
    }
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals. */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding. */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation. */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}
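/*
 * Branch displacements are relative to the end of the instruction, so
 * the constants subtracted in tcg_out_jxx below account for the encoded
 * length: 2 bytes for the short forms, 5 for long JMP, 6 for long Jcc.
 */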
/* Set SMALL to force a short forward branch. */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross basic blocks temporaries */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        g_assert_not_reached();
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky. */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code. */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count. */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test. */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls. */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument of the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
#endif
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
static void __attribute__((unused))
tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
{
    /*
     * This is used for testing alignment, so we can usually use testb.
     * For i686, we have to use testl for %esi/%edi.
     */
    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
        tcg_out8(s, i);
    } else {
        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
        tcg_out32(s, i);
    }
}

typedef struct {
    TCGReg base;
    int index;
    int ofs;
    int seg;
    TCGAtomAlign aa;
} HostAddress;

bool tcg_target_has_memory_bswap(MemOp memop)
{
    return have_movbe;
}

/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
 */
#if TCG_TARGET_REG_BITS == 64
static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
{
    if (arg < 0) {
        arg = TCG_REG_RAX;
    }
    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
    return arg;
}
static const TCGLdstHelperParam ldst_helper_param = {
    .ra_gen = ldst_ra_gen
};
#else
static const TCGLdstHelperParam ldst_helper_param = { };
#endif

/*
 * Generate code for the slow path for a load at the end of block
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);

    tcg_out_jmp(s, l->raddr);
    return true;
}

/*
 * Generate code for the slow path for a store at the end of block
 */
static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_st_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);

    tcg_out_jmp(s, l->raddr);
    return true;
}

#ifndef CONFIG_SOFTMMU
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
static inline int setup_guest_base_seg(void)
{
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>
static inline int setup_guest_base_seg(void)
{
    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#else
static inline int setup_guest_base_seg(void)
{
    return 0;
}
#endif /* setup_guest_base_seg */
#endif /* !SOFTMMU */

/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    unsigned a_mask;

#ifdef CONFIG_SOFTMMU
    h->index = TCG_REG_L0;
    h->ofs = 0;
    h->seg = 0;
#else
    *h = x86_guest_base;
#endif
    h->base = addrlo;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
    a_mask = (1 << h->aa.align) - 1;
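    /*
     * Softmmu fast path, in outline: the address bits above the page are
     * used to index CPUTLBDescFast.table, the page-masked (and
     * alignment-adjusted) address is compared against the entry's
     * addr_read/addr_write, a mismatch branches to the slow path, and on
     * a hit the entry's addend is added to form the host address.
     */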
offsetof(CPUTLBEntry, addr_read) 1888 : offsetof(CPUTLBEntry, addr_write); 1889 TCGType ttype = TCG_TYPE_I32; 1890 TCGType tlbtype = TCG_TYPE_I32; 1891 int trexw = 0, hrexw = 0, tlbrexw = 0; 1892 unsigned mem_index = get_mmuidx(oi); 1893 unsigned s_bits = opc & MO_SIZE; 1894 unsigned s_mask = (1 << s_bits) - 1; 1895 int tlb_mask; 1896 1897 ldst = new_ldst_label(s); 1898 ldst->is_ld = is_ld; 1899 ldst->oi = oi; 1900 ldst->addrlo_reg = addrlo; 1901 ldst->addrhi_reg = addrhi; 1902 1903 if (TCG_TARGET_REG_BITS == 64) { 1904 ttype = s->addr_type; 1905 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 1906 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 1907 hrexw = P_REXW; 1908 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 1909 tlbtype = TCG_TYPE_I64; 1910 tlbrexw = P_REXW; 1911 } 1912 } 1913 } 1914 1915 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 1916 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 1917 s->page_bits - CPU_TLB_ENTRY_BITS); 1918 1919 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 1920 TLB_MASK_TABLE_OFS(mem_index) + 1921 offsetof(CPUTLBDescFast, mask)); 1922 1923 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 1924 TLB_MASK_TABLE_OFS(mem_index) + 1925 offsetof(CPUTLBDescFast, table)); 1926 1927 /* 1928 * If the required alignment is at least as large as the access, simply 1929 * copy the address and mask. For lesser alignments, check that we don't 1930 * cross pages for the complete access. 1931 */ 1932 if (a_mask >= s_mask) { 1933 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 1934 } else { 1935 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 1936 addrlo, s_mask - a_mask); 1937 } 1938 tlb_mask = s->page_mask | a_mask; 1939 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 1940 1941 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 1942 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 1943 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 1944 1945 /* jne slow_path */ 1946 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1947 ldst->label_ptr[0] = s->code_ptr; 1948 s->code_ptr += 4; 1949 1950 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { 1951 /* cmp 4(TCG_REG_L0), addrhi */ 1952 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4); 1953 1954 /* jne slow_path */ 1955 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1956 ldst->label_ptr[1] = s->code_ptr; 1957 s->code_ptr += 4; 1958 } 1959 1960 /* TLB Hit. */ 1961 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 1962 offsetof(CPUTLBEntry, addend)); 1963#else 1964 if (a_mask) { 1965 ldst = new_ldst_label(s); 1966 1967 ldst->is_ld = is_ld; 1968 ldst->oi = oi; 1969 ldst->addrlo_reg = addrlo; 1970 ldst->addrhi_reg = addrhi; 1971 1972 tcg_out_testi(s, addrlo, a_mask); 1973 /* jne slow_path */ 1974 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1975 ldst->label_ptr[0] = s->code_ptr; 1976 s->code_ptr += 4; 1977 } 1978#endif 1979 1980 return ldst; 1981} 1982 1983static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 1984 HostAddress h, TCGType type, MemOp memop) 1985{ 1986 bool use_movbe = false; 1987 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1988 int movop = OPC_MOVL_GvEv; 1989 1990 /* Do big-endian loads with movbe. 
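Generic code only hands MO_BSWAP operations to this backend when tcg_target_has_memory_bswap() returned true, i.e. when movbe is available; the assert below checks this.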
*/ 1991 if (memop & MO_BSWAP) { 1992 tcg_debug_assert(have_movbe); 1993 use_movbe = true; 1994 movop = OPC_MOVBE_GyMy; 1995 } 1996 1997 switch (memop & MO_SSIZE) { 1998 case MO_UB: 1999 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2000 h.base, h.index, 0, h.ofs); 2001 break; 2002 case MO_SB: 2003 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2004 h.base, h.index, 0, h.ofs); 2005 break; 2006 case MO_UW: 2007 if (use_movbe) { 2008 /* There is no extending movbe; only low 16-bits are modified. */ 2009 if (datalo != h.base && datalo != h.index) { 2010 /* XOR breaks dependency chains. */ 2011 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2012 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2013 datalo, h.base, h.index, 0, h.ofs); 2014 } else { 2015 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2016 datalo, h.base, h.index, 0, h.ofs); 2017 tcg_out_ext16u(s, datalo, datalo); 2018 } 2019 } else { 2020 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2021 h.base, h.index, 0, h.ofs); 2022 } 2023 break; 2024 case MO_SW: 2025 if (use_movbe) { 2026 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2027 datalo, h.base, h.index, 0, h.ofs); 2028 tcg_out_ext16s(s, type, datalo, datalo); 2029 } else { 2030 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2031 datalo, h.base, h.index, 0, h.ofs); 2032 } 2033 break; 2034 case MO_UL: 2035 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2036 h.base, h.index, 0, h.ofs); 2037 break; 2038#if TCG_TARGET_REG_BITS == 64 2039 case MO_SL: 2040 if (use_movbe) { 2041 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2042 h.base, h.index, 0, h.ofs); 2043 tcg_out_ext32s(s, datalo, datalo); 2044 } else { 2045 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2046 h.base, h.index, 0, h.ofs); 2047 } 2048 break; 2049#endif 2050 case MO_UQ: 2051 if (TCG_TARGET_REG_BITS == 64) { 2052 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2053 h.base, h.index, 0, h.ofs); 2054 break; 2055 } 2056 if (use_movbe) { 2057 TCGReg t = datalo; 2058 datalo = datahi; 2059 datahi = t; 2060 } 2061 if (h.base == datalo || h.index == datalo) { 2062 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2063 h.base, h.index, 0, h.ofs); 2064 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2065 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2066 } else { 2067 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2068 h.base, h.index, 0, h.ofs); 2069 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2070 h.base, h.index, 0, h.ofs + 4); 2071 } 2072 break; 2073 default: 2074 g_assert_not_reached(); 2075 } 2076} 2077 2078static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2079 TCGReg addrlo, TCGReg addrhi, 2080 MemOpIdx oi, TCGType data_type) 2081{ 2082 TCGLabelQemuLdst *ldst; 2083 HostAddress h; 2084 2085 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); 2086 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2087 2088 if (ldst) { 2089 ldst->type = data_type; 2090 ldst->datalo_reg = datalo; 2091 ldst->datahi_reg = datahi; 2092 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2093 } 2094} 2095 2096static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2097 HostAddress h, MemOp memop) 2098{ 2099 bool use_movbe = false; 2100 int movop = OPC_MOVL_EvGv; 2101 2102 /* 2103 * Do big-endian stores with movbe or softmmu. 2104 * User-only without movbe will have its swapping done generically. 
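* For a 64-bit store on a 32-bit host, byte-swapping the value also swaps which half belongs at the lower address; see the datalo/datahi exchange below.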
2105 */ 2106 if (memop & MO_BSWAP) { 2107 tcg_debug_assert(have_movbe); 2108 use_movbe = true; 2109 movop = OPC_MOVBE_MyGy; 2110 } 2111 2112 switch (memop & MO_SIZE) { 2113 case MO_8: 2114 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2115 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2116 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2117 datalo, h.base, h.index, 0, h.ofs); 2118 break; 2119 case MO_16: 2120 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2121 h.base, h.index, 0, h.ofs); 2122 break; 2123 case MO_32: 2124 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2125 h.base, h.index, 0, h.ofs); 2126 break; 2127 case MO_64: 2128 if (TCG_TARGET_REG_BITS == 64) { 2129 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2130 h.base, h.index, 0, h.ofs); 2131 } else { 2132 if (use_movbe) { 2133 TCGReg t = datalo; 2134 datalo = datahi; 2135 datahi = t; 2136 } 2137 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2138 h.base, h.index, 0, h.ofs); 2139 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2140 h.base, h.index, 0, h.ofs + 4); 2141 } 2142 break; 2143 default: 2144 g_assert_not_reached(); 2145 } 2146} 2147 2148static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2149 TCGReg addrlo, TCGReg addrhi, 2150 MemOpIdx oi, TCGType data_type) 2151{ 2152 TCGLabelQemuLdst *ldst; 2153 HostAddress h; 2154 2155 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); 2156 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2157 2158 if (ldst) { 2159 ldst->type = data_type; 2160 ldst->datalo_reg = datalo; 2161 ldst->datahi_reg = datahi; 2162 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2163 } 2164} 2165 2166static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2167{ 2168 /* Reuse the zeroing that exists for goto_ptr. */ 2169 if (a0 == 0) { 2170 tcg_out_jmp(s, tcg_code_gen_epilogue); 2171 } else { 2172 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2173 tcg_out_jmp(s, tb_ret_addr); 2174 } 2175} 2176 2177static void tcg_out_goto_tb(TCGContext *s, int which) 2178{ 2179 /* 2180 * Jump displacement must be aligned for atomic patching; 2181 * see if we need to add extra nops before jump 2182 */ 2183 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2184 if (gap != 1) { 2185 tcg_out_nopn(s, gap - 1); 2186 } 2187 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2188 set_jmp_insn_offset(s, which); 2189 tcg_out32(s, 0); 2190 set_jmp_reset_offset(s, which); 2191} 2192 2193void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2194 uintptr_t jmp_rx, uintptr_t jmp_rw) 2195{ 2196 /* patch the branch destination */ 2197 uintptr_t addr = tb->jmp_target_addr[n]; 2198 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2199 /* no need to flush icache explicitly */ 2200} 2201 2202static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2203 const TCGArg args[TCG_MAX_OP_ARGS], 2204 const int const_args[TCG_MAX_OP_ARGS]) 2205{ 2206 TCGArg a0, a1, a2; 2207 int c, const_a2, vexop, rexw = 0; 2208 2209#if TCG_TARGET_REG_BITS == 64 2210# define OP_32_64(x) \ 2211 case glue(glue(INDEX_op_, x), _i64): \ 2212 rexw = P_REXW; /* FALLTHRU */ \ 2213 case glue(glue(INDEX_op_, x), _i32) 2214#else 2215# define OP_32_64(x) \ 2216 case glue(glue(INDEX_op_, x), _i32) 2217#endif 2218 2219 /* Hoist the loads of the most common arguments. 
*/ 2220 a0 = args[0]; 2221 a1 = args[1]; 2222 a2 = args[2]; 2223 const_a2 = const_args[2]; 2224 2225 switch (opc) { 2226 case INDEX_op_goto_ptr: 2227 /* jmp to the given host address (could be epilogue) */ 2228 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2229 break; 2230 case INDEX_op_br: 2231 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2232 break; 2233 OP_32_64(ld8u): 2234 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2235 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2236 break; 2237 OP_32_64(ld8s): 2238 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2239 break; 2240 OP_32_64(ld16u): 2241 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2242 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2243 break; 2244 OP_32_64(ld16s): 2245 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2246 break; 2247#if TCG_TARGET_REG_BITS == 64 2248 case INDEX_op_ld32u_i64: 2249#endif 2250 case INDEX_op_ld_i32: 2251 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2252 break; 2253 2254 OP_32_64(st8): 2255 if (const_args[0]) { 2256 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2257 tcg_out8(s, a0); 2258 } else { 2259 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2260 } 2261 break; 2262 OP_32_64(st16): 2263 if (const_args[0]) { 2264 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2265 tcg_out16(s, a0); 2266 } else { 2267 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2268 } 2269 break; 2270#if TCG_TARGET_REG_BITS == 64 2271 case INDEX_op_st32_i64: 2272#endif 2273 case INDEX_op_st_i32: 2274 if (const_args[0]) { 2275 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2276 tcg_out32(s, a0); 2277 } else { 2278 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2279 } 2280 break; 2281 2282 OP_32_64(add): 2283 /* For 3-operand addition, use LEA. */ 2284 if (a0 != a1) { 2285 TCGArg c3 = 0; 2286 if (const_a2) { 2287 c3 = a2, a2 = -1; 2288 } else if (a0 == a2) { 2289 /* Watch out for dest = src + dest, since we've removed 2290 the matching constraint on the add. */ 2291 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2292 break; 2293 } 2294 2295 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2296 break; 2297 } 2298 c = ARITH_ADD; 2299 goto gen_arith; 2300 OP_32_64(sub): 2301 c = ARITH_SUB; 2302 goto gen_arith; 2303 OP_32_64(and): 2304 c = ARITH_AND; 2305 goto gen_arith; 2306 OP_32_64(or): 2307 c = ARITH_OR; 2308 goto gen_arith; 2309 OP_32_64(xor): 2310 c = ARITH_XOR; 2311 goto gen_arith; 2312 gen_arith: 2313 if (const_a2) { 2314 tgen_arithi(s, c + rexw, a0, a2, 0); 2315 } else { 2316 tgen_arithr(s, c + rexw, a0, a2); 2317 } 2318 break; 2319 2320 OP_32_64(andc): 2321 if (const_a2) { 2322 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2323 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2324 } else { 2325 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2326 } 2327 break; 2328 2329 OP_32_64(mul): 2330 if (const_a2) { 2331 int32_t val; 2332 val = a2; 2333 if (val == (int8_t)val) { 2334 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2335 tcg_out8(s, val); 2336 } else { 2337 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2338 tcg_out32(s, val); 2339 } 2340 } else { 2341 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2342 } 2343 break; 2344 2345 OP_32_64(div2): 2346 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2347 break; 2348 OP_32_64(divu2): 2349 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2350 break; 2351 2352 OP_32_64(shl): 2353 /* For small constant 3-operand shift, use LEA. 
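A count of 1 becomes lea (a1,a1),a0, while counts of 2 and 3 use the SIB scale field to multiply the index by 4 or 8; in all cases a1 is left unmodified.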
*/ 2354 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2355 if (a2 - 1 == 0) { 2356 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2357 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2358 } else { 2359 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2360 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2361 } 2362 break; 2363 } 2364 c = SHIFT_SHL; 2365 vexop = OPC_SHLX; 2366 goto gen_shift_maybe_vex; 2367 OP_32_64(shr): 2368 c = SHIFT_SHR; 2369 vexop = OPC_SHRX; 2370 goto gen_shift_maybe_vex; 2371 OP_32_64(sar): 2372 c = SHIFT_SAR; 2373 vexop = OPC_SARX; 2374 goto gen_shift_maybe_vex; 2375 OP_32_64(rotl): 2376 c = SHIFT_ROL; 2377 goto gen_shift; 2378 OP_32_64(rotr): 2379 c = SHIFT_ROR; 2380 goto gen_shift; 2381 gen_shift_maybe_vex: 2382 if (have_bmi2) { 2383 if (!const_a2) { 2384 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2385 break; 2386 } 2387 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2388 } 2389 /* FALLTHRU */ 2390 gen_shift: 2391 if (const_a2) { 2392 tcg_out_shifti(s, c + rexw, a0, a2); 2393 } else { 2394 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2395 } 2396 break; 2397 2398 OP_32_64(ctz): 2399 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2400 break; 2401 OP_32_64(clz): 2402 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2403 break; 2404 OP_32_64(ctpop): 2405 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2406 break; 2407 2408 case INDEX_op_brcond_i32: 2409 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2410 break; 2411 case INDEX_op_setcond_i32: 2412 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2); 2413 break; 2414 case INDEX_op_movcond_i32: 2415 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]); 2416 break; 2417 2418 OP_32_64(bswap16): 2419 if (a2 & TCG_BSWAP_OS) { 2420 /* Output must be sign-extended. */ 2421 if (rexw) { 2422 tcg_out_bswap64(s, a0); 2423 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2424 } else { 2425 tcg_out_bswap32(s, a0); 2426 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2427 } 2428 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2429 /* Output must be zero-extended, but input isn't. 
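The 32-bit bswap moves the swapped halfword into bits 16..31, so the logical right shift both repositions it and zero-extends the result.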
*/ 2430 tcg_out_bswap32(s, a0); 2431 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2432 } else { 2433 tcg_out_rolw_8(s, a0); 2434 } 2435 break; 2436 OP_32_64(bswap32): 2437 tcg_out_bswap32(s, a0); 2438 if (rexw && (a2 & TCG_BSWAP_OS)) { 2439 tcg_out_ext32s(s, a0, a0); 2440 } 2441 break; 2442 2443 OP_32_64(neg): 2444 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2445 break; 2446 OP_32_64(not): 2447 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2448 break; 2449 2450 case INDEX_op_qemu_ld_a64_i32: 2451 if (TCG_TARGET_REG_BITS == 32) { 2452 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2453 break; 2454 } 2455 /* fall through */ 2456 case INDEX_op_qemu_ld_a32_i32: 2457 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2458 break; 2459 case INDEX_op_qemu_ld_a32_i64: 2460 if (TCG_TARGET_REG_BITS == 64) { 2461 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2462 } else { 2463 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2464 } 2465 break; 2466 case INDEX_op_qemu_ld_a64_i64: 2467 if (TCG_TARGET_REG_BITS == 64) { 2468 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2469 } else { 2470 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2471 } 2472 break; 2473 2474 case INDEX_op_qemu_st_a64_i32: 2475 case INDEX_op_qemu_st8_a64_i32: 2476 if (TCG_TARGET_REG_BITS == 32) { 2477 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2478 break; 2479 } 2480 /* fall through */ 2481 case INDEX_op_qemu_st_a32_i32: 2482 case INDEX_op_qemu_st8_a32_i32: 2483 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2484 break; 2485 case INDEX_op_qemu_st_a32_i64: 2486 if (TCG_TARGET_REG_BITS == 64) { 2487 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2488 } else { 2489 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2490 } 2491 break; 2492 case INDEX_op_qemu_st_a64_i64: 2493 if (TCG_TARGET_REG_BITS == 64) { 2494 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2495 } else { 2496 tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2497 } 2498 break; 2499 2500 OP_32_64(mulu2): 2501 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2502 break; 2503 OP_32_64(muls2): 2504 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2505 break; 2506 OP_32_64(add2): 2507 if (const_args[4]) { 2508 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2509 } else { 2510 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2511 } 2512 if (const_args[5]) { 2513 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2514 } else { 2515 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2516 } 2517 break; 2518 OP_32_64(sub2): 2519 if (const_args[4]) { 2520 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2521 } else { 2522 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2523 } 2524 if (const_args[5]) { 2525 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2526 } else { 2527 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2528 } 2529 break; 2530 2531#if TCG_TARGET_REG_BITS == 32 2532 case INDEX_op_brcond2_i32: 2533 tcg_out_brcond2(s, args, const_args, 0); 2534 break; 2535 case INDEX_op_setcond2_i32: 2536 tcg_out_setcond2(s, args, const_args); 2537 break; 2538#else /* TCG_TARGET_REG_BITS == 64 */ 2539 case INDEX_op_ld32s_i64: 2540 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2541 break; 2542 case INDEX_op_ld_i64: 2543 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2544 break; 2545 case INDEX_op_st_i64: 2546 if (const_args[0]) { 2547 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2548 tcg_out32(s, a0); 2549 } else { 2550 tcg_out_st(s, 
TCG_TYPE_I64, a0, a1, a2); 2551 } 2552 break; 2553 2554 case INDEX_op_brcond_i64: 2555 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2556 break; 2557 case INDEX_op_setcond_i64: 2558 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2); 2559 break; 2560 case INDEX_op_movcond_i64: 2561 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]); 2562 break; 2563 2564 case INDEX_op_bswap64_i64: 2565 tcg_out_bswap64(s, a0); 2566 break; 2567 case INDEX_op_extrh_i64_i32: 2568 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2569 break; 2570#endif 2571 2572 OP_32_64(deposit): 2573 if (args[3] == 0 && args[4] == 8) { 2574 /* load bits 0..7 */ 2575 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2576 } else if (args[3] == 8 && args[4] == 8) { 2577 /* load bits 8..15 */ 2578 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2579 } else if (args[3] == 0 && args[4] == 16) { 2580 /* load bits 0..15 */ 2581 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2582 } else { 2583 g_assert_not_reached(); 2584 } 2585 break; 2586 2587 case INDEX_op_extract_i64: 2588 if (a2 + args[3] == 32) { 2589 /* This is a 32-bit zero-extending right shift. */ 2590 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2591 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2592 break; 2593 } 2594 /* FALLTHRU */ 2595 case INDEX_op_extract_i32: 2596 /* On the off-chance that we can use the high-byte registers. 2597 Otherwise we emit the same ext16 + shift pattern that we 2598 would have gotten from the normal tcg-op.c expansion. */ 2599 tcg_debug_assert(a2 == 8 && args[3] == 8); 2600 if (a1 < 4 && a0 < 8) { 2601 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2602 } else { 2603 tcg_out_ext16u(s, a0, a1); 2604 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2605 } 2606 break; 2607 2608 case INDEX_op_sextract_i32: 2609 /* We don't implement sextract_i64, as we cannot sign-extend to 2610 64-bits without using the REX prefix that explicitly excludes 2611 access to the high-byte registers. */ 2612 tcg_debug_assert(a2 == 8 && args[3] == 8); 2613 if (a1 < 4 && a0 < 8) { 2614 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2615 } else { 2616 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 2617 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2618 } 2619 break; 2620 2621 OP_32_64(extract2): 2622 /* Note that SHRD outputs to the r/m operand. */ 2623 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 2624 tcg_out8(s, args[3]); 2625 break; 2626 2627 case INDEX_op_mb: 2628 tcg_out_mb(s, a0); 2629 break; 2630 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 2631 case INDEX_op_mov_i64: 2632 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 2633 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 2634 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 2635 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. 
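It expands these through the tcg_out_ext* hooks, so they must never reach tcg_out_op.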
*/ 2636 case INDEX_op_ext8s_i64: 2637 case INDEX_op_ext8u_i32: 2638 case INDEX_op_ext8u_i64: 2639 case INDEX_op_ext16s_i32: 2640 case INDEX_op_ext16s_i64: 2641 case INDEX_op_ext16u_i32: 2642 case INDEX_op_ext16u_i64: 2643 case INDEX_op_ext32s_i64: 2644 case INDEX_op_ext32u_i64: 2645 case INDEX_op_ext_i32_i64: 2646 case INDEX_op_extu_i32_i64: 2647 case INDEX_op_extrl_i64_i32: 2648 default: 2649 g_assert_not_reached(); 2650 } 2651 2652#undef OP_32_64 2653} 2654 2655static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 2656 unsigned vecl, unsigned vece, 2657 const TCGArg args[TCG_MAX_OP_ARGS], 2658 const int const_args[TCG_MAX_OP_ARGS]) 2659{ 2660 static int const add_insn[4] = { 2661 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 2662 }; 2663 static int const ssadd_insn[4] = { 2664 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 2665 }; 2666 static int const usadd_insn[4] = { 2667 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 2668 }; 2669 static int const sub_insn[4] = { 2670 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 2671 }; 2672 static int const sssub_insn[4] = { 2673 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 2674 }; 2675 static int const ussub_insn[4] = { 2676 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 2677 }; 2678 static int const mul_insn[4] = { 2679 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 2680 }; 2681 static int const shift_imm_insn[4] = { 2682 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 2683 }; 2684 static int const cmpeq_insn[4] = { 2685 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 2686 }; 2687 static int const cmpgt_insn[4] = { 2688 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 2689 }; 2690 static int const punpckl_insn[4] = { 2691 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 2692 }; 2693 static int const punpckh_insn[4] = { 2694 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 2695 }; 2696 static int const packss_insn[4] = { 2697 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 2698 }; 2699 static int const packus_insn[4] = { 2700 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 2701 }; 2702 static int const smin_insn[4] = { 2703 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 2704 }; 2705 static int const smax_insn[4] = { 2706 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 2707 }; 2708 static int const umin_insn[4] = { 2709 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 2710 }; 2711 static int const umax_insn[4] = { 2712 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 2713 }; 2714 static int const rotlv_insn[4] = { 2715 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 2716 }; 2717 static int const rotrv_insn[4] = { 2718 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 2719 }; 2720 static int const shlv_insn[4] = { 2721 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 2722 }; 2723 static int const shrv_insn[4] = { 2724 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 2725 }; 2726 static int const sarv_insn[4] = { 2727 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 2728 }; 2729 static int const shls_insn[4] = { 2730 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 2731 }; 2732 static int const shrs_insn[4] = { 2733 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 2734 }; 2735 static int const sars_insn[4] = { 2736 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 2737 }; 2738 static int const vpshldi_insn[4] = { 2739 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 2740 }; 2741 static int const vpshldv_insn[4] = { 2742 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 2743 }; 2744 static int const vpshrdv_insn[4] = { 2745 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 2746 }; 2747 static 
int const abs_insn[4] = { 2748 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 2749 }; 2750 2751 TCGType type = vecl + TCG_TYPE_V64; 2752 int insn, sub; 2753 TCGArg a0, a1, a2, a3; 2754 2755 a0 = args[0]; 2756 a1 = args[1]; 2757 a2 = args[2]; 2758 2759 switch (opc) { 2760 case INDEX_op_add_vec: 2761 insn = add_insn[vece]; 2762 goto gen_simd; 2763 case INDEX_op_ssadd_vec: 2764 insn = ssadd_insn[vece]; 2765 goto gen_simd; 2766 case INDEX_op_usadd_vec: 2767 insn = usadd_insn[vece]; 2768 goto gen_simd; 2769 case INDEX_op_sub_vec: 2770 insn = sub_insn[vece]; 2771 goto gen_simd; 2772 case INDEX_op_sssub_vec: 2773 insn = sssub_insn[vece]; 2774 goto gen_simd; 2775 case INDEX_op_ussub_vec: 2776 insn = ussub_insn[vece]; 2777 goto gen_simd; 2778 case INDEX_op_mul_vec: 2779 insn = mul_insn[vece]; 2780 goto gen_simd; 2781 case INDEX_op_and_vec: 2782 insn = OPC_PAND; 2783 goto gen_simd; 2784 case INDEX_op_or_vec: 2785 insn = OPC_POR; 2786 goto gen_simd; 2787 case INDEX_op_xor_vec: 2788 insn = OPC_PXOR; 2789 goto gen_simd; 2790 case INDEX_op_smin_vec: 2791 insn = smin_insn[vece]; 2792 goto gen_simd; 2793 case INDEX_op_umin_vec: 2794 insn = umin_insn[vece]; 2795 goto gen_simd; 2796 case INDEX_op_smax_vec: 2797 insn = smax_insn[vece]; 2798 goto gen_simd; 2799 case INDEX_op_umax_vec: 2800 insn = umax_insn[vece]; 2801 goto gen_simd; 2802 case INDEX_op_shlv_vec: 2803 insn = shlv_insn[vece]; 2804 goto gen_simd; 2805 case INDEX_op_shrv_vec: 2806 insn = shrv_insn[vece]; 2807 goto gen_simd; 2808 case INDEX_op_sarv_vec: 2809 insn = sarv_insn[vece]; 2810 goto gen_simd; 2811 case INDEX_op_rotlv_vec: 2812 insn = rotlv_insn[vece]; 2813 goto gen_simd; 2814 case INDEX_op_rotrv_vec: 2815 insn = rotrv_insn[vece]; 2816 goto gen_simd; 2817 case INDEX_op_shls_vec: 2818 insn = shls_insn[vece]; 2819 goto gen_simd; 2820 case INDEX_op_shrs_vec: 2821 insn = shrs_insn[vece]; 2822 goto gen_simd; 2823 case INDEX_op_sars_vec: 2824 insn = sars_insn[vece]; 2825 goto gen_simd; 2826 case INDEX_op_x86_punpckl_vec: 2827 insn = punpckl_insn[vece]; 2828 goto gen_simd; 2829 case INDEX_op_x86_punpckh_vec: 2830 insn = punpckh_insn[vece]; 2831 goto gen_simd; 2832 case INDEX_op_x86_packss_vec: 2833 insn = packss_insn[vece]; 2834 goto gen_simd; 2835 case INDEX_op_x86_packus_vec: 2836 insn = packus_insn[vece]; 2837 goto gen_simd; 2838 case INDEX_op_x86_vpshldv_vec: 2839 insn = vpshldv_insn[vece]; 2840 a1 = a2; 2841 a2 = args[3]; 2842 goto gen_simd; 2843 case INDEX_op_x86_vpshrdv_vec: 2844 insn = vpshrdv_insn[vece]; 2845 a1 = a2; 2846 a2 = args[3]; 2847 goto gen_simd; 2848#if TCG_TARGET_REG_BITS == 32 2849 case INDEX_op_dup2_vec: 2850 /* First merge the two 32-bit inputs to a single 64-bit element. */ 2851 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 2852 /* Then replicate the 64-bit elements across the rest of the vector. 
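For TCG_TYPE_V64 the single 64-bit element already fills the vector, so no broadcast is needed.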
*/ 2853 if (type != TCG_TYPE_V64) { 2854 tcg_out_dup_vec(s, type, MO_64, a0, a0); 2855 } 2856 break; 2857#endif 2858 case INDEX_op_abs_vec: 2859 insn = abs_insn[vece]; 2860 a2 = a1; 2861 a1 = 0; 2862 goto gen_simd; 2863 gen_simd: 2864 tcg_debug_assert(insn != OPC_UD2); 2865 if (type == TCG_TYPE_V256) { 2866 insn |= P_VEXL; 2867 } 2868 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2869 break; 2870 2871 case INDEX_op_cmp_vec: 2872 sub = args[3]; 2873 if (sub == TCG_COND_EQ) { 2874 insn = cmpeq_insn[vece]; 2875 } else if (sub == TCG_COND_GT) { 2876 insn = cmpgt_insn[vece]; 2877 } else { 2878 g_assert_not_reached(); 2879 } 2880 goto gen_simd; 2881 2882 case INDEX_op_andc_vec: 2883 insn = OPC_PANDN; 2884 if (type == TCG_TYPE_V256) { 2885 insn |= P_VEXL; 2886 } 2887 tcg_out_vex_modrm(s, insn, a0, a2, a1); 2888 break; 2889 2890 case INDEX_op_shli_vec: 2891 insn = shift_imm_insn[vece]; 2892 sub = 6; 2893 goto gen_shift; 2894 case INDEX_op_shri_vec: 2895 insn = shift_imm_insn[vece]; 2896 sub = 2; 2897 goto gen_shift; 2898 case INDEX_op_sari_vec: 2899 if (vece == MO_64) { 2900 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 2901 } else { 2902 insn = shift_imm_insn[vece]; 2903 } 2904 sub = 4; 2905 goto gen_shift; 2906 case INDEX_op_rotli_vec: 2907 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 2908 if (vece == MO_64) { 2909 insn |= P_VEXW; 2910 } 2911 sub = 1; 2912 goto gen_shift; 2913 gen_shift: 2914 tcg_debug_assert(vece != MO_8); 2915 if (type == TCG_TYPE_V256) { 2916 insn |= P_VEXL; 2917 } 2918 tcg_out_vex_modrm(s, insn, sub, a0, a1); 2919 tcg_out8(s, a2); 2920 break; 2921 2922 case INDEX_op_ld_vec: 2923 tcg_out_ld(s, type, a0, a1, a2); 2924 break; 2925 case INDEX_op_st_vec: 2926 tcg_out_st(s, type, a0, a1, a2); 2927 break; 2928 case INDEX_op_dupm_vec: 2929 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 2930 break; 2931 2932 case INDEX_op_x86_shufps_vec: 2933 insn = OPC_SHUFPS; 2934 sub = args[3]; 2935 goto gen_simd_imm8; 2936 case INDEX_op_x86_blend_vec: 2937 if (vece == MO_16) { 2938 insn = OPC_PBLENDW; 2939 } else if (vece == MO_32) { 2940 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 2941 } else { 2942 g_assert_not_reached(); 2943 } 2944 sub = args[3]; 2945 goto gen_simd_imm8; 2946 case INDEX_op_x86_vperm2i128_vec: 2947 insn = OPC_VPERM2I128; 2948 sub = args[3]; 2949 goto gen_simd_imm8; 2950 case INDEX_op_x86_vpshldi_vec: 2951 insn = vpshldi_insn[vece]; 2952 sub = args[3]; 2953 goto gen_simd_imm8; 2954 2955 case INDEX_op_not_vec: 2956 insn = OPC_VPTERNLOGQ; 2957 a2 = a1; 2958 sub = 0x33; /* !B */ 2959 goto gen_simd_imm8; 2960 case INDEX_op_nor_vec: 2961 insn = OPC_VPTERNLOGQ; 2962 sub = 0x11; /* norCB */ 2963 goto gen_simd_imm8; 2964 case INDEX_op_nand_vec: 2965 insn = OPC_VPTERNLOGQ; 2966 sub = 0x77; /* nandCB */ 2967 goto gen_simd_imm8; 2968 case INDEX_op_eqv_vec: 2969 insn = OPC_VPTERNLOGQ; 2970 sub = 0x99; /* xnorCB */ 2971 goto gen_simd_imm8; 2972 case INDEX_op_orc_vec: 2973 insn = OPC_VPTERNLOGQ; 2974 sub = 0xdd; /* orB!C */ 2975 goto gen_simd_imm8; 2976 2977 case INDEX_op_bitsel_vec: 2978 insn = OPC_VPTERNLOGQ; 2979 a3 = args[3]; 2980 if (a0 == a1) { 2981 a1 = a2; 2982 a2 = a3; 2983 sub = 0xca; /* A?B:C */ 2984 } else if (a0 == a2) { 2985 a2 = a3; 2986 sub = 0xe2; /* B?A:C */ 2987 } else { 2988 tcg_out_mov(s, type, a0, a3); 2989 sub = 0xb8; /* B?C:A */ 2990 } 2991 goto gen_simd_imm8; 2992 2993 gen_simd_imm8: 2994 tcg_debug_assert(insn != OPC_UD2); 2995 if (type == TCG_TYPE_V256) { 2996 insn |= P_VEXL; 2997 } 2998 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2999 tcg_out8(s, sub); 3000 break; 3001 3002 case INDEX_op_x86_vpblendvb_vec: 3003 insn = OPC_VPBLENDVB; 3004 if (type == TCG_TYPE_V256) { 3005 insn |= P_VEXL; 3006 } 3007 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3008 tcg_out8(s, args[3] << 4); 3009 break; 3010 3011 case INDEX_op_x86_psrldq_vec: 3012 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3013 tcg_out8(s, a2); 3014 break; 3015 3016 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3017 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 3018 default: 3019 g_assert_not_reached(); 3020 } 3021} 3022 3023static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3024{ 3025 switch (op) { 3026 case INDEX_op_goto_ptr: 3027 return C_O0_I1(r); 3028 3029 case INDEX_op_ld8u_i32: 3030 case INDEX_op_ld8u_i64: 3031 case INDEX_op_ld8s_i32: 3032 case INDEX_op_ld8s_i64: 3033 case INDEX_op_ld16u_i32: 3034 case INDEX_op_ld16u_i64: 3035 case INDEX_op_ld16s_i32: 3036 case INDEX_op_ld16s_i64: 3037 case INDEX_op_ld_i32: 3038 case INDEX_op_ld32u_i64: 3039 case INDEX_op_ld32s_i64: 3040 case INDEX_op_ld_i64: 3041 return C_O1_I1(r, r); 3042 3043 case INDEX_op_st8_i32: 3044 case INDEX_op_st8_i64: 3045 return C_O0_I2(qi, r); 3046 3047 case INDEX_op_st16_i32: 3048 case INDEX_op_st16_i64: 3049 case INDEX_op_st_i32: 3050 case INDEX_op_st32_i64: 3051 return C_O0_I2(ri, r); 3052 3053 case INDEX_op_st_i64: 3054 return C_O0_I2(re, r); 3055 3056 case INDEX_op_add_i32: 3057 case INDEX_op_add_i64: 3058 return C_O1_I2(r, r, re); 3059 3060 case INDEX_op_sub_i32: 3061 case INDEX_op_sub_i64: 3062 case INDEX_op_mul_i32: 3063 case INDEX_op_mul_i64: 3064 case INDEX_op_or_i32: 3065 case INDEX_op_or_i64: 3066 case INDEX_op_xor_i32: 3067 case INDEX_op_xor_i64: 3068 return C_O1_I2(r, 0, re); 3069 3070 case INDEX_op_and_i32: 3071 case INDEX_op_and_i64: 3072 return C_O1_I2(r, 0, reZ); 3073 3074 case INDEX_op_andc_i32: 3075 case INDEX_op_andc_i64: 3076 return C_O1_I2(r, r, rI); 3077 3078 case INDEX_op_shl_i32: 3079 case INDEX_op_shl_i64: 3080 case INDEX_op_shr_i32: 3081 case INDEX_op_shr_i64: 3082 case INDEX_op_sar_i32: 3083 case INDEX_op_sar_i64: 3084 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3085 3086 case INDEX_op_rotl_i32: 3087 case INDEX_op_rotl_i64: 3088 case INDEX_op_rotr_i32: 3089 case INDEX_op_rotr_i64: 3090 return C_O1_I2(r, 0, ci); 3091 3092 case INDEX_op_brcond_i32: 3093 case INDEX_op_brcond_i64: 3094 return C_O0_I2(r, re); 3095 3096 case INDEX_op_bswap16_i32: 3097 case INDEX_op_bswap16_i64: 3098 case INDEX_op_bswap32_i32: 3099 case INDEX_op_bswap32_i64: 3100 case INDEX_op_bswap64_i64: 3101 case INDEX_op_neg_i32: 3102 case INDEX_op_neg_i64: 3103 case INDEX_op_not_i32: 3104 case INDEX_op_not_i64: 3105 case INDEX_op_extrh_i64_i32: 3106 return C_O1_I1(r, 0); 3107 3108 case INDEX_op_ext8s_i32: 3109 case INDEX_op_ext8s_i64: 3110 case INDEX_op_ext8u_i32: 3111 case INDEX_op_ext8u_i64: 3112 return C_O1_I1(r, q); 3113 3114 case INDEX_op_ext16s_i32: 3115 case INDEX_op_ext16s_i64: 3116 case INDEX_op_ext16u_i32: 3117 case INDEX_op_ext16u_i64: 3118 case INDEX_op_ext32s_i64: 3119 case INDEX_op_ext32u_i64: 3120 case INDEX_op_ext_i32_i64: 3121 case INDEX_op_extu_i32_i64: 3122 case INDEX_op_extrl_i64_i32: 3123 case INDEX_op_extract_i32: 3124 case INDEX_op_extract_i64: 3125 case INDEX_op_sextract_i32: 3126 case INDEX_op_ctpop_i32: 3127 case INDEX_op_ctpop_i64: 3128 return C_O1_I1(r, r); 3129 3130 case INDEX_op_extract2_i32: 3131 case INDEX_op_extract2_i64: 3132 return C_O1_I2(r, 0, r); 3133 3134 case INDEX_op_deposit_i32: 3135 case INDEX_op_deposit_i64: 3136 return C_O1_I2(Q, 0, Q); 3137 3138 case INDEX_op_setcond_i32: 3139 case INDEX_op_setcond_i64: 3140 return C_O1_I2(q, r, re); 3141 3142 case INDEX_op_movcond_i32: 3143 case INDEX_op_movcond_i64: 3144 return C_O1_I4(r, r, re, r, 0); 3145 3146 case INDEX_op_div2_i32: 3147 case INDEX_op_div2_i64: 3148 case INDEX_op_divu2_i32: 3149 case INDEX_op_divu2_i64: 3150 return C_O2_I3(a, d, 0, 1, r); 3151 3152 case INDEX_op_mulu2_i32: 3153 case INDEX_op_mulu2_i64: 3154 case INDEX_op_muls2_i32: 3155 case 
INDEX_op_muls2_i64: 3156 return C_O2_I2(a, d, a, r); 3157 3158 case INDEX_op_add2_i32: 3159 case INDEX_op_add2_i64: 3160 case INDEX_op_sub2_i32: 3161 case INDEX_op_sub2_i64: 3162 return C_O2_I4(r, r, 0, 1, re, re); 3163 3164 case INDEX_op_ctz_i32: 3165 case INDEX_op_ctz_i64: 3166 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3167 3168 case INDEX_op_clz_i32: 3169 case INDEX_op_clz_i64: 3170 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3171 3172 case INDEX_op_qemu_ld_a32_i32: 3173 return C_O1_I1(r, L); 3174 case INDEX_op_qemu_ld_a64_i32: 3175 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L); 3176 3177 case INDEX_op_qemu_st_a32_i32: 3178 return C_O0_I2(L, L); 3179 case INDEX_op_qemu_st_a64_i32: 3180 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3181 case INDEX_op_qemu_st8_a32_i32: 3182 return C_O0_I2(s, L); 3183 case INDEX_op_qemu_st8_a64_i32: 3184 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); 3185 3186 case INDEX_op_qemu_ld_a32_i64: 3187 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3188 case INDEX_op_qemu_ld_a64_i64: 3189 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); 3190 3191 case INDEX_op_qemu_st_a32_i64: 3192 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3193 case INDEX_op_qemu_st_a64_i64: 3194 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); 3195 3196 case INDEX_op_brcond2_i32: 3197 return C_O0_I4(r, r, ri, ri); 3198 3199 case INDEX_op_setcond2_i32: 3200 return C_O1_I4(r, r, r, ri, ri); 3201 3202 case INDEX_op_ld_vec: 3203 case INDEX_op_dupm_vec: 3204 return C_O1_I1(x, r); 3205 3206 case INDEX_op_st_vec: 3207 return C_O0_I2(x, r); 3208 3209 case INDEX_op_add_vec: 3210 case INDEX_op_sub_vec: 3211 case INDEX_op_mul_vec: 3212 case INDEX_op_and_vec: 3213 case INDEX_op_or_vec: 3214 case INDEX_op_xor_vec: 3215 case INDEX_op_andc_vec: 3216 case INDEX_op_orc_vec: 3217 case INDEX_op_nand_vec: 3218 case INDEX_op_nor_vec: 3219 case INDEX_op_eqv_vec: 3220 case INDEX_op_ssadd_vec: 3221 case INDEX_op_usadd_vec: 3222 case INDEX_op_sssub_vec: 3223 case INDEX_op_ussub_vec: 3224 case INDEX_op_smin_vec: 3225 case INDEX_op_umin_vec: 3226 case INDEX_op_smax_vec: 3227 case INDEX_op_umax_vec: 3228 case INDEX_op_shlv_vec: 3229 case INDEX_op_shrv_vec: 3230 case INDEX_op_sarv_vec: 3231 case INDEX_op_rotlv_vec: 3232 case INDEX_op_rotrv_vec: 3233 case INDEX_op_shls_vec: 3234 case INDEX_op_shrs_vec: 3235 case INDEX_op_sars_vec: 3236 case INDEX_op_cmp_vec: 3237 case INDEX_op_x86_shufps_vec: 3238 case INDEX_op_x86_blend_vec: 3239 case INDEX_op_x86_packss_vec: 3240 case INDEX_op_x86_packus_vec: 3241 case INDEX_op_x86_vperm2i128_vec: 3242 case INDEX_op_x86_punpckl_vec: 3243 case INDEX_op_x86_punpckh_vec: 3244 case INDEX_op_x86_vpshldi_vec: 3245#if TCG_TARGET_REG_BITS == 32 3246 case INDEX_op_dup2_vec: 3247#endif 3248 return C_O1_I2(x, x, x); 3249 3250 case INDEX_op_abs_vec: 3251 case INDEX_op_dup_vec: 3252 case INDEX_op_not_vec: 3253 case INDEX_op_shli_vec: 3254 case INDEX_op_shri_vec: 3255 case INDEX_op_sari_vec: 3256 case INDEX_op_rotli_vec: 3257 case INDEX_op_x86_psrldq_vec: 3258 return C_O1_I1(x, x); 3259 3260 case INDEX_op_x86_vpshldv_vec: 3261 case INDEX_op_x86_vpshrdv_vec: 3262 return C_O1_I3(x, 0, x, x); 3263 3264 case INDEX_op_bitsel_vec: 3265 case INDEX_op_x86_vpblendvb_vec: 3266 return C_O1_I3(x, x, x, x); 3267 3268 default: 3269 g_assert_not_reached(); 3270 } 3271} 3272 3273int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, 
unsigned vece) 3274{ 3275 switch (opc) { 3276 case INDEX_op_add_vec: 3277 case INDEX_op_sub_vec: 3278 case INDEX_op_and_vec: 3279 case INDEX_op_or_vec: 3280 case INDEX_op_xor_vec: 3281 case INDEX_op_andc_vec: 3282 case INDEX_op_orc_vec: 3283 case INDEX_op_nand_vec: 3284 case INDEX_op_nor_vec: 3285 case INDEX_op_eqv_vec: 3286 case INDEX_op_not_vec: 3287 case INDEX_op_bitsel_vec: 3288 return 1; 3289 case INDEX_op_cmp_vec: 3290 case INDEX_op_cmpsel_vec: 3291 return -1; 3292 3293 case INDEX_op_rotli_vec: 3294 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3295 3296 case INDEX_op_shli_vec: 3297 case INDEX_op_shri_vec: 3298 /* We must expand the operation for MO_8. */ 3299 return vece == MO_8 ? -1 : 1; 3300 3301 case INDEX_op_sari_vec: 3302 switch (vece) { 3303 case MO_8: 3304 return -1; 3305 case MO_16: 3306 case MO_32: 3307 return 1; 3308 case MO_64: 3309 if (have_avx512vl) { 3310 return 1; 3311 } 3312 /* 3313 * We can emulate this for MO_64, but it does not pay off 3314 * unless we're producing at least 4 values. 3315 */ 3316 return type >= TCG_TYPE_V256 ? -1 : 0; 3317 } 3318 return 0; 3319 3320 case INDEX_op_shls_vec: 3321 case INDEX_op_shrs_vec: 3322 return vece >= MO_16; 3323 case INDEX_op_sars_vec: 3324 switch (vece) { 3325 case MO_16: 3326 case MO_32: 3327 return 1; 3328 case MO_64: 3329 return have_avx512vl; 3330 } 3331 return 0; 3332 case INDEX_op_rotls_vec: 3333 return vece >= MO_16 ? -1 : 0; 3334 3335 case INDEX_op_shlv_vec: 3336 case INDEX_op_shrv_vec: 3337 switch (vece) { 3338 case MO_16: 3339 return have_avx512bw; 3340 case MO_32: 3341 case MO_64: 3342 return have_avx2; 3343 } 3344 return 0; 3345 case INDEX_op_sarv_vec: 3346 switch (vece) { 3347 case MO_16: 3348 return have_avx512bw; 3349 case MO_32: 3350 return have_avx2; 3351 case MO_64: 3352 return have_avx512vl; 3353 } 3354 return 0; 3355 case INDEX_op_rotlv_vec: 3356 case INDEX_op_rotrv_vec: 3357 switch (vece) { 3358 case MO_16: 3359 return have_avx512vbmi2 ? -1 : 0; 3360 case MO_32: 3361 case MO_64: 3362 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 3363 } 3364 return 0; 3365 3366 case INDEX_op_mul_vec: 3367 switch (vece) { 3368 case MO_8: 3369 return -1; 3370 case MO_64: 3371 return have_avx512dq; 3372 } 3373 return 1; 3374 3375 case INDEX_op_ssadd_vec: 3376 case INDEX_op_usadd_vec: 3377 case INDEX_op_sssub_vec: 3378 case INDEX_op_ussub_vec: 3379 return vece <= MO_16; 3380 case INDEX_op_smin_vec: 3381 case INDEX_op_smax_vec: 3382 case INDEX_op_umin_vec: 3383 case INDEX_op_umax_vec: 3384 case INDEX_op_abs_vec: 3385 return vece <= MO_32 || have_avx512vl; 3386 3387 default: 3388 return 0; 3389 } 3390} 3391 3392static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc, 3393 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3394{ 3395 TCGv_vec t1, t2; 3396 3397 tcg_debug_assert(vece == MO_8); 3398 3399 t1 = tcg_temp_new_vec(type); 3400 t2 = tcg_temp_new_vec(type); 3401 3402 /* 3403 * Unpack to W, shift, and repack. Tricky bits: 3404 * (1) Use punpck*bw x,x to produce DDCCBBAA, 3405 * i.e. duplicate in other half of the 16-bit lane. 3406 * (2) For right-shift, add 8 so that the high half of the lane 3407 * becomes zero. For left-shift, and left-rotate, we must 3408 * shift up and down again. 3409 * (3) Step 2 leaves high half zero such that PACKUSWB 3410 * (pack with unsigned saturation) does not modify 3411 * the quantity. 
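* For example, shifting the byte 0x90 right by 3: the 16-bit lane starts as 0x9090, the word shift by 3 + 8 yields 0x0012, and the pack emits 0x12 = 0x90 >> 3.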
3412 */ 3413 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3414 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3415 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3416 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3417 3418 if (opc != INDEX_op_rotli_vec) { 3419 imm += 8; 3420 } 3421 if (opc == INDEX_op_shri_vec) { 3422 tcg_gen_shri_vec(MO_16, t1, t1, imm); 3423 tcg_gen_shri_vec(MO_16, t2, t2, imm); 3424 } else { 3425 tcg_gen_shli_vec(MO_16, t1, t1, imm); 3426 tcg_gen_shli_vec(MO_16, t2, t2, imm); 3427 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3428 tcg_gen_shri_vec(MO_16, t2, t2, 8); 3429 } 3430 3431 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3432 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3433 tcg_temp_free_vec(t1); 3434 tcg_temp_free_vec(t2); 3435} 3436 3437static void expand_vec_sari(TCGType type, unsigned vece, 3438 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3439{ 3440 TCGv_vec t1, t2; 3441 3442 switch (vece) { 3443 case MO_8: 3444 /* Unpack to W, shift, and repack, as in expand_vec_shi. */ 3445 t1 = tcg_temp_new_vec(type); 3446 t2 = tcg_temp_new_vec(type); 3447 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3448 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3449 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3450 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3451 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3452 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3453 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3454 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3455 tcg_temp_free_vec(t1); 3456 tcg_temp_free_vec(t2); 3457 break; 3458 3459 case MO_64: 3460 t1 = tcg_temp_new_vec(type); 3461 if (imm <= 32) { 3462 /* 3463 * We can emulate a small sign extend by performing an arithmetic 3464 * 32-bit shift and overwriting the high half of a 64-bit logical 3465 * shift. Note that the ISA says shift of 32 is valid, but TCG 3466 * does not, so we have to bound the smaller shift -- we get the 3467 * same result in the high half either way. 3468 */ 3469 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3470 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3471 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3472 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3473 tcgv_vec_arg(t1), 0xaa); 3474 } else { 3475 /* Otherwise we will need to use a compare vs 0 to produce 3476 * the sign-extend, shift and merge. 3477 */ 3478 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 3479 tcg_constant_vec(type, MO_64, 0), v1); 3480 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3481 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3482 tcg_gen_or_vec(MO_64, v0, v0, t1); 3483 } 3484 tcg_temp_free_vec(t1); 3485 break; 3486 3487 default: 3488 g_assert_not_reached(); 3489 } 3490} 3491 3492static void expand_vec_rotli(TCGType type, unsigned vece, 3493 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3494{ 3495 TCGv_vec t; 3496 3497 if (vece == MO_8) { 3498 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm); 3499 return; 3500 } 3501 3502 if (have_avx512vbmi2) { 3503 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 3504 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 3505 return; 3506 } 3507 3508 t = tcg_temp_new_vec(type); 3509 tcg_gen_shli_vec(vece, t, v1, imm); 3510 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3511 tcg_gen_or_vec(vece, v0, v0, t); 3512 tcg_temp_free_vec(t); 3513} 3514 3515static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3516 TCGv_vec v1, TCGv_vec sh, bool right) 3517{ 3518 TCGv_vec t; 3519 3520 if (have_avx512vbmi2) { 3521 vec_gen_4(right ? 
INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 3522 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 3523 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 3524 return; 3525 } 3526 3527 t = tcg_temp_new_vec(type); 3528 tcg_gen_dupi_vec(vece, t, 8 << vece); 3529 tcg_gen_sub_vec(vece, t, t, sh); 3530 if (right) { 3531 tcg_gen_shlv_vec(vece, t, v1, t); 3532 tcg_gen_shrv_vec(vece, v0, v1, sh); 3533 } else { 3534 tcg_gen_shrv_vec(vece, t, v1, t); 3535 tcg_gen_shlv_vec(vece, v0, v1, sh); 3536 } 3537 tcg_gen_or_vec(vece, v0, v0, t); 3538 tcg_temp_free_vec(t); 3539} 3540 3541static void expand_vec_rotls(TCGType type, unsigned vece, 3542 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3543{ 3544 TCGv_vec t = tcg_temp_new_vec(type); 3545 3546 tcg_debug_assert(vece != MO_8); 3547 3548 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 3549 tcg_gen_dup_i32_vec(vece, t, lsh); 3550 if (vece >= MO_32) { 3551 tcg_gen_rotlv_vec(vece, v0, v1, t); 3552 } else { 3553 expand_vec_rotv(type, vece, v0, v1, t, false); 3554 } 3555 } else { 3556 TCGv_i32 rsh = tcg_temp_new_i32(); 3557 3558 tcg_gen_neg_i32(rsh, lsh); 3559 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3560 tcg_gen_shls_vec(vece, t, v1, lsh); 3561 tcg_gen_shrs_vec(vece, v0, v1, rsh); 3562 tcg_gen_or_vec(vece, v0, v0, t); 3563 3564 tcg_temp_free_i32(rsh); 3565 } 3566 3567 tcg_temp_free_vec(t); 3568} 3569 3570static void expand_vec_mul(TCGType type, unsigned vece, 3571 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3572{ 3573 TCGv_vec t1, t2, t3, t4, zero; 3574 3575 tcg_debug_assert(vece == MO_8); 3576 3577 /* 3578 * Unpack v1 bytes to words, 0 | x. 3579 * Unpack v2 bytes to words, y | 0. 3580 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3581 * Shift logical right by 8 bits to clear the high 8 bytes before 3582 * using an unsigned saturated pack. 3583 * 3584 * The difference between the V64, V128 and V256 cases is merely how 3585 * we distribute the expansion between temporaries. 
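* For example, bytes x = 3 and y = 100 become lanes 0x0003 and 0x6400; the 16-bit product truncates to 0x2c00, and the shift right by 8 leaves 0x2c = 44 = (3 * 100) & 0xff.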
3586 */ 3587 switch (type) { 3588 case TCG_TYPE_V64: 3589 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 3590 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 3591 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3592 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3593 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3594 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3595 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3596 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3597 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3598 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 3599 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 3600 tcg_temp_free_vec(t1); 3601 tcg_temp_free_vec(t2); 3602 break; 3603 3604 case TCG_TYPE_V128: 3605 case TCG_TYPE_V256: 3606 t1 = tcg_temp_new_vec(type); 3607 t2 = tcg_temp_new_vec(type); 3608 t3 = tcg_temp_new_vec(type); 3609 t4 = tcg_temp_new_vec(type); 3610 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3611 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3612 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3613 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3614 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3615 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3616 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3617 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3618 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3619 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3620 tcg_gen_mul_vec(MO_16, t3, t3, t4); 3621 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3622 tcg_gen_shri_vec(MO_16, t3, t3, 8); 3623 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3624 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 3625 tcg_temp_free_vec(t1); 3626 tcg_temp_free_vec(t2); 3627 tcg_temp_free_vec(t3); 3628 tcg_temp_free_vec(t4); 3629 break; 3630 3631 default: 3632 g_assert_not_reached(); 3633 } 3634} 3635 3636static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, 3637 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3638{ 3639 enum { 3640 NEED_INV = 1, 3641 NEED_SWAP = 2, 3642 NEED_BIAS = 4, 3643 NEED_UMIN = 8, 3644 NEED_UMAX = 16, 3645 }; 3646 TCGv_vec t1, t2, t3; 3647 uint8_t fixup; 3648 3649 switch (cond) { 3650 case TCG_COND_EQ: 3651 case TCG_COND_GT: 3652 fixup = 0; 3653 break; 3654 case TCG_COND_NE: 3655 case TCG_COND_LE: 3656 fixup = NEED_INV; 3657 break; 3658 case TCG_COND_LT: 3659 fixup = NEED_SWAP; 3660 break; 3661 case TCG_COND_GE: 3662 fixup = NEED_SWAP | NEED_INV; 3663 break; 3664 case TCG_COND_LEU: 3665 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3666 fixup = NEED_UMIN; 3667 } else { 3668 fixup = NEED_BIAS | NEED_INV; 3669 } 3670 break; 3671 case TCG_COND_GTU: 3672 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3673 fixup = NEED_UMIN | NEED_INV; 3674 } else { 3675 fixup = NEED_BIAS; 3676 } 3677 break; 3678 case TCG_COND_GEU: 3679 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3680 fixup = NEED_UMAX; 3681 } else { 3682 fixup = NEED_BIAS | NEED_SWAP | NEED_INV; 3683 } 3684 break; 3685 case TCG_COND_LTU: 3686 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3687 fixup = NEED_UMAX | NEED_INV; 3688 } else { 3689 fixup = NEED_BIAS | NEED_SWAP; 3690 } 3691 break; 3692 default: 3693 g_assert_not_reached(); 3694 } 3695 3696 if (fixup & NEED_INV) { 3697 cond = tcg_invert_cond(cond); 3698 } 3699 if (fixup & NEED_SWAP) { 3700 t1 = v1, v1 = v2, v2 = t1; 3701 cond = tcg_swap_cond(cond); 3702 } 3703 3704 t1 = t2 = NULL; 3705 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3706 t1 = 
tcg_temp_new_vec(type); 3707 if (fixup & NEED_UMIN) { 3708 tcg_gen_umin_vec(vece, t1, v1, v2); 3709 } else { 3710 tcg_gen_umax_vec(vece, t1, v1, v2); 3711 } 3712 v2 = t1; 3713 cond = TCG_COND_EQ; 3714 } else if (fixup & NEED_BIAS) { 3715 t1 = tcg_temp_new_vec(type); 3716 t2 = tcg_temp_new_vec(type); 3717 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 3718 tcg_gen_sub_vec(vece, t1, v1, t3); 3719 tcg_gen_sub_vec(vece, t2, v2, t3); 3720 v1 = t1; 3721 v2 = t2; 3722 cond = tcg_signed_cond(cond); 3723 } 3724 3725 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); 3726 /* Expand directly; do not recurse. */ 3727 vec_gen_4(INDEX_op_cmp_vec, type, vece, 3728 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); 3729 3730 if (t1) { 3731 tcg_temp_free_vec(t1); 3732 if (t2) { 3733 tcg_temp_free_vec(t2); 3734 } 3735 } 3736 return fixup & NEED_INV; 3737} 3738 3739static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, 3740 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3741{ 3742 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) { 3743 tcg_gen_not_vec(vece, v0, v0); 3744 } 3745} 3746 3747static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0, 3748 TCGv_vec c1, TCGv_vec c2, 3749 TCGv_vec v3, TCGv_vec v4, TCGCond cond) 3750{ 3751 TCGv_vec t = tcg_temp_new_vec(type); 3752 3753 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) { 3754 /* Invert the sense of the compare by swapping arguments. */ 3755 TCGv_vec x; 3756 x = v3, v3 = v4, v4 = x; 3757 } 3758 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece, 3759 tcgv_vec_arg(v0), tcgv_vec_arg(v4), 3760 tcgv_vec_arg(v3), tcgv_vec_arg(t)); 3761 tcg_temp_free_vec(t); 3762} 3763 3764void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 3765 TCGArg a0, ...) 
3766{ 3767 va_list va; 3768 TCGArg a2; 3769 TCGv_vec v0, v1, v2, v3, v4; 3770 3771 va_start(va, a0); 3772 v0 = temp_tcgv_vec(arg_temp(a0)); 3773 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3774 a2 = va_arg(va, TCGArg); 3775 3776 switch (opc) { 3777 case INDEX_op_shli_vec: 3778 case INDEX_op_shri_vec: 3779 expand_vec_shi(type, vece, opc, v0, v1, a2); 3780 break; 3781 3782 case INDEX_op_sari_vec: 3783 expand_vec_sari(type, vece, v0, v1, a2); 3784 break; 3785 3786 case INDEX_op_rotli_vec: 3787 expand_vec_rotli(type, vece, v0, v1, a2); 3788 break; 3789 3790 case INDEX_op_rotls_vec: 3791 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 3792 break; 3793 3794 case INDEX_op_rotlv_vec: 3795 v2 = temp_tcgv_vec(arg_temp(a2)); 3796 expand_vec_rotv(type, vece, v0, v1, v2, false); 3797 break; 3798 case INDEX_op_rotrv_vec: 3799 v2 = temp_tcgv_vec(arg_temp(a2)); 3800 expand_vec_rotv(type, vece, v0, v1, v2, true); 3801 break; 3802 3803 case INDEX_op_mul_vec: 3804 v2 = temp_tcgv_vec(arg_temp(a2)); 3805 expand_vec_mul(type, vece, v0, v1, v2); 3806 break; 3807 3808 case INDEX_op_cmp_vec: 3809 v2 = temp_tcgv_vec(arg_temp(a2)); 3810 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); 3811 break; 3812 3813 case INDEX_op_cmpsel_vec: 3814 v2 = temp_tcgv_vec(arg_temp(a2)); 3815 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3816 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3817 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg)); 3818 break; 3819 3820 default: 3821 break; 3822 } 3823 3824 va_end(va); 3825} 3826 3827static const int tcg_target_callee_save_regs[] = { 3828#if TCG_TARGET_REG_BITS == 64 3829 TCG_REG_RBP, 3830 TCG_REG_RBX, 3831#if defined(_WIN64) 3832 TCG_REG_RDI, 3833 TCG_REG_RSI, 3834#endif 3835 TCG_REG_R12, 3836 TCG_REG_R13, 3837 TCG_REG_R14, /* Currently used for the global env. */ 3838 TCG_REG_R15, 3839#else 3840 TCG_REG_EBP, /* Currently used for the global env. */ 3841 TCG_REG_EBX, 3842 TCG_REG_ESI, 3843 TCG_REG_EDI, 3844#endif 3845}; 3846 3847/* Compute frame size via macros, to share between tcg_target_qemu_prologue 3848 and tcg_register_jit. */ 3849 3850#define PUSH_SIZE \ 3851 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 3852 * (TCG_TARGET_REG_BITS / 8)) 3853 3854#define FRAME_SIZE \ 3855 ((PUSH_SIZE \ 3856 + TCG_STATIC_CALL_ARGS_SIZE \ 3857 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 3858 + TCG_TARGET_STACK_ALIGN - 1) \ 3859 & ~(TCG_TARGET_STACK_ALIGN - 1)) 3860 3861/* Generate global QEMU prologue and epilogue code */ 3862static void tcg_target_qemu_prologue(TCGContext *s) 3863{ 3864 int i, stack_addend; 3865 3866 /* TB prologue */ 3867 3868 /* Reserve some stack space, also for TCG temps. */ 3869 stack_addend = FRAME_SIZE - PUSH_SIZE; 3870 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 3871 CPU_TEMP_BUF_NLONGS * sizeof(long)); 3872 3873 /* Save all callee saved registers. */ 3874 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 3875 tcg_out_push(s, tcg_target_callee_save_regs[i]); 3876 } 3877 3878#if TCG_TARGET_REG_BITS == 32 3879 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 3880 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 3881 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3882 /* jmp *tb. 
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb. */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU)
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }
# endif
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb. */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

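/*
 * Rough sketch of the 64-bit frame built by the prologue above, with
 * offsets relative to %rsp after the final stack adjustment (SysV register
 * count assumed):
 *
 *     FRAME_SIZE - 8              return address pushed by the caller
 *     FRAME_SIZE - 16 and below   the six saved callee-saved registers
 *     TCG_STATIC_CALL_ARGS_SIZE   CPU_TEMP_BUF_NLONGS longs of TCG temp space
 *     0                           outgoing arguments for helper calls
 */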
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
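
/*
 * Note on the unwind data above: fde_def_cfa stores FRAME_SIZE as a
 * two-byte uleb128 -- the low seven bits with the 0x80 continuation bit
 * set, then the remaining bits -- which is why the earlier build assert
 * requires FRAME_SIZE to fit in 14 bits.
 */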