1/* 2 * Tiny Code Generator for QEMU 3 * 4 * Copyright (c) 2008 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25#include "../tcg-ldst.c.inc" 26#include "../tcg-pool.c.inc" 27 28#ifdef CONFIG_DEBUG_TCG 29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { 30#if TCG_TARGET_REG_BITS == 64 31 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", 32#else 33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", 34#endif 35 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", 36 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", 37#if TCG_TARGET_REG_BITS == 64 38 "%xmm8", "%xmm9", "%xmm10", "%xmm11", 39 "%xmm12", "%xmm13", "%xmm14", "%xmm15", 40#endif 41}; 42#endif 43 44static const int tcg_target_reg_alloc_order[] = { 45#if TCG_TARGET_REG_BITS == 64 46 TCG_REG_RBP, 47 TCG_REG_RBX, 48 TCG_REG_R12, 49 TCG_REG_R13, 50 TCG_REG_R14, 51 TCG_REG_R15, 52 TCG_REG_R10, 53 TCG_REG_R11, 54 TCG_REG_R9, 55 TCG_REG_R8, 56 TCG_REG_RCX, 57 TCG_REG_RDX, 58 TCG_REG_RSI, 59 TCG_REG_RDI, 60 TCG_REG_RAX, 61#else 62 TCG_REG_EBX, 63 TCG_REG_ESI, 64 TCG_REG_EDI, 65 TCG_REG_EBP, 66 TCG_REG_ECX, 67 TCG_REG_EDX, 68 TCG_REG_EAX, 69#endif 70 TCG_REG_XMM0, 71 TCG_REG_XMM1, 72 TCG_REG_XMM2, 73 TCG_REG_XMM3, 74 TCG_REG_XMM4, 75 TCG_REG_XMM5, 76#ifndef _WIN64 77 /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save 78 any of them. Therefore only allow xmm0-xmm5 to be allocated. */ 79 TCG_REG_XMM6, 80 TCG_REG_XMM7, 81#if TCG_TARGET_REG_BITS == 64 82 TCG_REG_XMM8, 83 TCG_REG_XMM9, 84 TCG_REG_XMM10, 85 TCG_REG_XMM11, 86 TCG_REG_XMM12, 87 TCG_REG_XMM13, 88 TCG_REG_XMM14, 89 TCG_REG_XMM15, 90#endif 91#endif 92}; 93 94#define TCG_TMP_VEC TCG_REG_XMM5 95 96static const int tcg_target_call_iarg_regs[] = { 97#if TCG_TARGET_REG_BITS == 64 98#if defined(_WIN64) 99 TCG_REG_RCX, 100 TCG_REG_RDX, 101#else 102 TCG_REG_RDI, 103 TCG_REG_RSI, 104 TCG_REG_RDX, 105 TCG_REG_RCX, 106#endif 107 TCG_REG_R8, 108 TCG_REG_R9, 109#else 110 /* 32 bit mode uses stack based calling convention (GCC default). */ 111#endif 112}; 113 114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) 115{ 116 switch (kind) { 117 case TCG_CALL_RET_NORMAL: 118 tcg_debug_assert(slot >= 0 && slot <= 1); 119 return slot ? 
TCG_REG_EDX : TCG_REG_EAX; 120#ifdef _WIN64 121 case TCG_CALL_RET_BY_VEC: 122 tcg_debug_assert(slot == 0); 123 return TCG_REG_XMM0; 124#endif 125 default: 126 g_assert_not_reached(); 127 } 128} 129 130/* Constants we accept. */ 131#define TCG_CT_CONST_S32 0x100 132#define TCG_CT_CONST_U32 0x200 133#define TCG_CT_CONST_I32 0x400 134#define TCG_CT_CONST_WSZ 0x800 135 136/* Registers used with L constraint, which are the first argument 137 registers on x86_64, and two random call clobbered registers on 138 i386. */ 139#if TCG_TARGET_REG_BITS == 64 140# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 141# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 142#else 143# define TCG_REG_L0 TCG_REG_EAX 144# define TCG_REG_L1 TCG_REG_EDX 145#endif 146 147#if TCG_TARGET_REG_BITS == 64 148# define ALL_GENERAL_REGS 0x0000ffffu 149# define ALL_VECTOR_REGS 0xffff0000u 150# define ALL_BYTEL_REGS ALL_GENERAL_REGS 151#else 152# define ALL_GENERAL_REGS 0x000000ffu 153# define ALL_VECTOR_REGS 0x00ff0000u 154# define ALL_BYTEL_REGS 0x0000000fu 155#endif 156#define SOFTMMU_RESERVE_REGS \ 157 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 158 159/* For 64-bit, we always know that CMOV is available. */ 160#if TCG_TARGET_REG_BITS == 64 161# define have_cmov true 162#else 163# define have_cmov (cpuinfo & CPUINFO_CMOV) 164#endif 165#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 166#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 167 168static const tcg_insn_unit *tb_ret_addr; 169 170static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 171 intptr_t value, intptr_t addend) 172{ 173 value += addend; 174 switch(type) { 175 case R_386_PC32: 176 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 177 if (value != (int32_t)value) { 178 return false; 179 } 180 /* FALLTHRU */ 181 case R_386_32: 182 tcg_patch32(code_ptr, value); 183 break; 184 case R_386_PC8: 185 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 186 if (value != (int8_t)value) { 187 return false; 188 } 189 tcg_patch8(code_ptr, value); 190 break; 191 default: 192 g_assert_not_reached(); 193 } 194 return true; 195} 196 197/* test if a constant matches the constraint */ 198static bool tcg_target_const_match(int64_t val, int ct, 199 TCGType type, TCGCond cond, int vece) 200{ 201 if (ct & TCG_CT_CONST) { 202 return 1; 203 } 204 if (type == TCG_TYPE_I32) { 205 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) { 206 return 1; 207 } 208 } else { 209 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 210 return 1; 211 } 212 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 213 return 1; 214 } 215 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 216 return 1; 217 } 218 } 219 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 
32 : 64)) { 220 return 1; 221 } 222 return 0; 223} 224 225# define LOWREGMASK(x) ((x) & 7) 226 227#define P_EXT 0x100 /* 0x0f opcode prefix */ 228#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 229#define P_DATA16 0x400 /* 0x66 opcode prefix */ 230#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 231#if TCG_TARGET_REG_BITS == 64 232# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 233# define P_REXB_R 0x2000 /* REG field as byte register */ 234# define P_REXB_RM 0x4000 /* R/M field as byte register */ 235# define P_GS 0x8000 /* gs segment override */ 236#else 237# define P_REXW 0 238# define P_REXB_R 0 239# define P_REXB_RM 0 240# define P_GS 0 241#endif 242#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 243#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 244#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 245#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 246#define P_EVEX 0x100000 /* Requires EVEX encoding */ 247 248#define OPC_ARITH_EbIb (0x80) 249#define OPC_ARITH_EvIz (0x81) 250#define OPC_ARITH_EvIb (0x83) 251#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 252#define OPC_ANDN (0xf2 | P_EXT38) 253#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 254#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 255#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 256#define OPC_BSF (0xbc | P_EXT) 257#define OPC_BSR (0xbd | P_EXT) 258#define OPC_BSWAP (0xc8 | P_EXT) 259#define OPC_CALL_Jz (0xe8) 260#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 261#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 262#define OPC_DEC_r32 (0x48) 263#define OPC_IMUL_GvEv (0xaf | P_EXT) 264#define OPC_IMUL_GvEvIb (0x6b) 265#define OPC_IMUL_GvEvIz (0x69) 266#define OPC_INC_r32 (0x40) 267#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 268#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 269#define OPC_JMP_long (0xe9) 270#define OPC_JMP_short (0xeb) 271#define OPC_LEA (0x8d) 272#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 273#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 274#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 275#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 276#define OPC_MOVB_EvIz (0xc6) 277#define OPC_MOVL_EvIz (0xc7) 278#define OPC_MOVB_Ib (0xb0) 279#define OPC_MOVL_Iv (0xb8) 280#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 281#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 282#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 283#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 284#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 285#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 286#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 287#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 288#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 289#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 290#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 291#define OPC_MOVSBL (0xbe | P_EXT) 292#define OPC_MOVSWL (0xbf | P_EXT) 293#define OPC_MOVSLQ (0x63 | P_REXW) 294#define OPC_MOVZBL (0xb6 | P_EXT) 295#define OPC_MOVZWL (0xb7 | P_EXT) 296#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 297#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 298#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 299#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 300#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 301#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 302#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 303#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 304#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 305#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 306#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 307#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 308#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 309#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 310#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 311#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 312#define OPC_PAND (0xdb | P_EXT | P_DATA16) 313#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 314#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 315#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 316#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 317#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 318#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 319#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 320#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 321#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 322#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 323#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 324#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 325#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 326#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 327#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 328#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 329#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 330#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 331#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 332#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 333#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 334#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 335#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 336#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 337#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 338#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 339#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 340#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 341#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
342#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 343#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 344#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 345#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 346#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 347#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 348#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 349#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 350#define OPC_POR (0xeb | P_EXT | P_DATA16) 351#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 352#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 353#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 354#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 355#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 356#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 357#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 358#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 359#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 360#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 361#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 362#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 363#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 364#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 365#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 366#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 367#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 368#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 369#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 370#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 371#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 372#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 373#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 374#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 375#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 376#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 377#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 378#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 379#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 380#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 381#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 382#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 383#define OPC_PXOR (0xef | P_EXT | P_DATA16) 384#define OPC_POP_r32 (0x58) 385#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 386#define OPC_PUSH_r32 (0x50) 387#define OPC_PUSH_Iv (0x68) 388#define OPC_PUSH_Ib (0x6a) 389#define OPC_RET (0xc3) 390#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 391#define OPC_SHIFT_1 (0xd1) 392#define OPC_SHIFT_Ib (0xc1) 393#define OPC_SHIFT_cl (0xd3) 394#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 395#define OPC_SHUFPS (0xc6 | P_EXT) 396#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 397#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 398#define OPC_SHRD_Ib (0xac | P_EXT) 399#define OPC_TESTL (0x85) 400#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 401#define OPC_UD2 (0x0b | P_EXT) 402#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 403#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 404#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 405#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 406#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 407#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 408#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 409#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 410#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 411#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 412#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 413#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 414#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 415#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 416#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 417#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 418#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 419#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 420#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 421#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 422#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 423#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 424#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 425#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 426#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 427#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 428#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 429#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 430#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 431#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 432#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 433#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 434#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 435#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW) 436#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 437#define OPC_VZEROUPPER (0x77 | P_EXT) 438#define OPC_XCHG_ax_r32 (0x90) 439#define OPC_XCHG_EvGv (0x87) 440 441#define OPC_GRP3_Eb (0xf6) 442#define OPC_GRP3_Ev (0xf7) 443#define OPC_GRP5 (0xff) 444#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 445 446/* Group 1 opcode extensions for 0x80-0x83. 447 These are also used as modifiers for OPC_ARITH. */ 448#define ARITH_ADD 0 449#define ARITH_OR 1 450#define ARITH_ADC 2 451#define ARITH_SBB 3 452#define ARITH_AND 4 453#define ARITH_SUB 5 454#define ARITH_XOR 6 455#define ARITH_CMP 7 456 457/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 458#define SHIFT_ROL 0 459#define SHIFT_ROR 1 460#define SHIFT_SHL 4 461#define SHIFT_SHR 5 462#define SHIFT_SAR 7 463 464/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. 
*/ 465#define EXT3_TESTi 0 466#define EXT3_NOT 2 467#define EXT3_NEG 3 468#define EXT3_MUL 4 469#define EXT3_IMUL 5 470#define EXT3_DIV 6 471#define EXT3_IDIV 7 472 473/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 474#define EXT5_INC_Ev 0 475#define EXT5_DEC_Ev 1 476#define EXT5_CALLN_Ev 2 477#define EXT5_JMPN_Ev 4 478 479/* Condition codes to be added to OPC_JCC_{long,short}. */ 480#define JCC_JMP (-1) 481#define JCC_JO 0x0 482#define JCC_JNO 0x1 483#define JCC_JB 0x2 484#define JCC_JAE 0x3 485#define JCC_JE 0x4 486#define JCC_JNE 0x5 487#define JCC_JBE 0x6 488#define JCC_JA 0x7 489#define JCC_JS 0x8 490#define JCC_JNS 0x9 491#define JCC_JP 0xa 492#define JCC_JNP 0xb 493#define JCC_JL 0xc 494#define JCC_JGE 0xd 495#define JCC_JLE 0xe 496#define JCC_JG 0xf 497 498static const uint8_t tcg_cond_to_jcc[] = { 499 [TCG_COND_EQ] = JCC_JE, 500 [TCG_COND_NE] = JCC_JNE, 501 [TCG_COND_LT] = JCC_JL, 502 [TCG_COND_GE] = JCC_JGE, 503 [TCG_COND_LE] = JCC_JLE, 504 [TCG_COND_GT] = JCC_JG, 505 [TCG_COND_LTU] = JCC_JB, 506 [TCG_COND_GEU] = JCC_JAE, 507 [TCG_COND_LEU] = JCC_JBE, 508 [TCG_COND_GTU] = JCC_JA, 509}; 510 511#if TCG_TARGET_REG_BITS == 64 512static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 513{ 514 int rex; 515 516 if (opc & P_GS) { 517 tcg_out8(s, 0x65); 518 } 519 if (opc & P_DATA16) { 520 /* We should never be asking for both 16 and 64-bit operation. */ 521 tcg_debug_assert((opc & P_REXW) == 0); 522 tcg_out8(s, 0x66); 523 } 524 if (opc & P_SIMDF3) { 525 tcg_out8(s, 0xf3); 526 } else if (opc & P_SIMDF2) { 527 tcg_out8(s, 0xf2); 528 } 529 530 rex = 0; 531 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 532 rex |= (r & 8) >> 1; /* REX.R */ 533 rex |= (x & 8) >> 2; /* REX.X */ 534 rex |= (rm & 8) >> 3; /* REX.B */ 535 536 /* P_REXB_{R,RM} indicates that the given register is the low byte. 537 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 538 as otherwise the encoding indicates %[abcd]h. Note that the values 539 that are ORed in merely indicate that the REX byte must be present; 540 those bits get discarded in output. */ 541 rex |= opc & (r >= 4 ? P_REXB_R : 0); 542 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 543 544 if (rex) { 545 tcg_out8(s, (uint8_t)(rex | 0x40)); 546 } 547 548 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 549 tcg_out8(s, 0x0f); 550 if (opc & P_EXT38) { 551 tcg_out8(s, 0x38); 552 } else if (opc & P_EXT3A) { 553 tcg_out8(s, 0x3a); 554 } 555 } 556 557 tcg_out8(s, opc); 558} 559#else 560static void tcg_out_opc(TCGContext *s, int opc) 561{ 562 if (opc & P_DATA16) { 563 tcg_out8(s, 0x66); 564 } 565 if (opc & P_SIMDF3) { 566 tcg_out8(s, 0xf3); 567 } else if (opc & P_SIMDF2) { 568 tcg_out8(s, 0xf2); 569 } 570 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 571 tcg_out8(s, 0x0f); 572 if (opc & P_EXT38) { 573 tcg_out8(s, 0x38); 574 } else if (opc & P_EXT3A) { 575 tcg_out8(s, 0x3a); 576 } 577 } 578 tcg_out8(s, opc); 579} 580/* Discard the register arguments to tcg_out_opc early, so as not to penalize 581 the 32-bit compilation paths. This method works with all versions of gcc, 582 whereas relying on optimization may not be able to exclude them. 
*/ 583#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 584#endif 585 586static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 587{ 588 tcg_out_opc(s, opc, r, rm, 0); 589 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 590} 591 592static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 593 int rm, int index) 594{ 595 int tmp; 596 597 if (opc & P_GS) { 598 tcg_out8(s, 0x65); 599 } 600 /* Use the two byte form if possible, which cannot encode 601 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 602 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 603 && ((rm | index) & 8) == 0) { 604 /* Two byte VEX prefix. */ 605 tcg_out8(s, 0xc5); 606 607 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 608 } else { 609 /* Three byte VEX prefix. */ 610 tcg_out8(s, 0xc4); 611 612 /* VEX.m-mmmm */ 613 if (opc & P_EXT3A) { 614 tmp = 3; 615 } else if (opc & P_EXT38) { 616 tmp = 2; 617 } else if (opc & P_EXT) { 618 tmp = 1; 619 } else { 620 g_assert_not_reached(); 621 } 622 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 623 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 624 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 625 tcg_out8(s, tmp); 626 627 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 628 } 629 630 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 631 /* VEX.pp */ 632 if (opc & P_DATA16) { 633 tmp |= 1; /* 0x66 */ 634 } else if (opc & P_SIMDF3) { 635 tmp |= 2; /* 0xf3 */ 636 } else if (opc & P_SIMDF2) { 637 tmp |= 3; /* 0xf2 */ 638 } 639 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 640 tcg_out8(s, tmp); 641 tcg_out8(s, opc); 642} 643 644static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 645 int rm, int index) 646{ 647 /* The entire 4-byte evex prefix; with R' and V' set. */ 648 uint32_t p = 0x08041062; 649 int mm, pp; 650 651 tcg_debug_assert(have_avx512vl); 652 653 /* EVEX.mm */ 654 if (opc & P_EXT3A) { 655 mm = 3; 656 } else if (opc & P_EXT38) { 657 mm = 2; 658 } else if (opc & P_EXT) { 659 mm = 1; 660 } else { 661 g_assert_not_reached(); 662 } 663 664 /* EVEX.pp */ 665 if (opc & P_DATA16) { 666 pp = 1; /* 0x66 */ 667 } else if (opc & P_SIMDF3) { 668 pp = 2; /* 0xf3 */ 669 } else if (opc & P_SIMDF2) { 670 pp = 3; /* 0xf2 */ 671 } else { 672 pp = 0; 673 } 674 675 p = deposit32(p, 8, 2, mm); 676 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 677 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 678 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 679 p = deposit32(p, 16, 2, pp); 680 p = deposit32(p, 19, 4, ~v); 681 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 682 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 683 684 tcg_out32(s, p); 685 tcg_out8(s, opc); 686} 687 688static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 689{ 690 if (opc & P_EVEX) { 691 tcg_out_evex_opc(s, opc, r, v, rm, 0); 692 } else { 693 tcg_out_vex_opc(s, opc, r, v, rm, 0); 694 } 695 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 696} 697 698/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 699 We handle either RM and INDEX missing with a negative value. In 64-bit 700 mode for absolute addresses, ~RM is the size of the immediate operand 701 that will follow the instruction. */ 702 703static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 704 int shift, intptr_t offset) 705{ 706 int mod, len; 707 708 if (index < 0 && rm < 0) { 709 if (TCG_TARGET_REG_BITS == 64) { 710 /* Try for a rip-relative addressing mode. This has replaced 711 the 32-bit-mode absolute addressing encoding. 
*/ 712 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 713 intptr_t disp = offset - pc; 714 if (disp == (int32_t)disp) { 715 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 716 tcg_out32(s, disp); 717 return; 718 } 719 720 /* Try for an absolute address encoding. This requires the 721 use of the MODRM+SIB encoding and is therefore larger than 722 rip-relative addressing. */ 723 if (offset == (int32_t)offset) { 724 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 725 tcg_out8(s, (4 << 3) | 5); 726 tcg_out32(s, offset); 727 return; 728 } 729 730 /* ??? The memory isn't directly addressable. */ 731 g_assert_not_reached(); 732 } else { 733 /* Absolute address. */ 734 tcg_out8(s, (r << 3) | 5); 735 tcg_out32(s, offset); 736 return; 737 } 738 } 739 740 /* Find the length of the immediate addend. Note that the encoding 741 that would be used for (%ebp) indicates absolute addressing. */ 742 if (rm < 0) { 743 mod = 0, len = 4, rm = 5; 744 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 745 mod = 0, len = 0; 746 } else if (offset == (int8_t)offset) { 747 mod = 0x40, len = 1; 748 } else { 749 mod = 0x80, len = 4; 750 } 751 752 /* Use a single byte MODRM format if possible. Note that the encoding 753 that would be used for %esp is the escape to the two byte form. */ 754 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 755 /* Single byte MODRM format. */ 756 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 757 } else { 758 /* Two byte MODRM+SIB format. */ 759 760 /* Note that the encoding that would place %esp into the index 761 field indicates no index register. In 64-bit mode, the REX.X 762 bit counts, so %r12 can be used as the index. */ 763 if (index < 0) { 764 index = 4; 765 } else { 766 tcg_debug_assert(index != TCG_REG_ESP); 767 } 768 769 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 770 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 771 } 772 773 if (len == 1) { 774 tcg_out8(s, offset); 775 } else if (len == 4) { 776 tcg_out32(s, offset); 777 } 778} 779 780static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 781 int index, int shift, intptr_t offset) 782{ 783 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 784 tcg_out_sib_offset(s, r, rm, index, shift, offset); 785} 786 787static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 788 int rm, int index, int shift, 789 intptr_t offset) 790{ 791 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 792 tcg_out_sib_offset(s, r, rm, index, shift, offset); 793} 794 795/* A simplification of the above with no index or shift. */ 796static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 797 int rm, intptr_t offset) 798{ 799 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 800} 801 802static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 803 int v, int rm, intptr_t offset) 804{ 805 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 806} 807 808/* Output an opcode with an expected reference to the constant pool. */ 809static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 810{ 811 tcg_out_opc(s, opc, r, 0, 0); 812 /* Absolute for 32-bit, pc-relative for 64-bit. */ 813 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 814 tcg_out32(s, 0); 815} 816 817/* Output an opcode with an expected reference to the constant pool. */ 818static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 819{ 820 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 821 /* Absolute for 32-bit, pc-relative for 64-bit. 
*/ 822 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 823 tcg_out32(s, 0); 824} 825 826/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 827static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 828{ 829 /* Propagate an opcode prefix, such as P_REXW. */ 830 int ext = subop & ~0x7; 831 subop &= 0x7; 832 833 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 834} 835 836static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 837{ 838 int rexw = 0; 839 840 if (arg == ret) { 841 return true; 842 } 843 switch (type) { 844 case TCG_TYPE_I64: 845 rexw = P_REXW; 846 /* fallthru */ 847 case TCG_TYPE_I32: 848 if (ret < 16) { 849 if (arg < 16) { 850 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 851 } else { 852 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 853 } 854 } else { 855 if (arg < 16) { 856 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 857 } else { 858 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 859 } 860 } 861 break; 862 863 case TCG_TYPE_V64: 864 tcg_debug_assert(ret >= 16 && arg >= 16); 865 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 866 break; 867 case TCG_TYPE_V128: 868 tcg_debug_assert(ret >= 16 && arg >= 16); 869 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 870 break; 871 case TCG_TYPE_V256: 872 tcg_debug_assert(ret >= 16 && arg >= 16); 873 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 874 break; 875 876 default: 877 g_assert_not_reached(); 878 } 879 return true; 880} 881 882static const int avx2_dup_insn[4] = { 883 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 884 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 885}; 886 887static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 888 TCGReg r, TCGReg a) 889{ 890 if (have_avx2) { 891 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 892 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a); 893 } else { 894 switch (vece) { 895 case MO_8: 896 /* ??? With zero in a register, use PSHUFB. */ 897 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 898 a = r; 899 /* FALLTHRU */ 900 case MO_16: 901 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 902 a = r; 903 /* FALLTHRU */ 904 case MO_32: 905 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 906 /* imm8 operand: all output lanes selected from input lane 0. */ 907 tcg_out8(s, 0); 908 break; 909 case MO_64: 910 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 911 break; 912 default: 913 g_assert_not_reached(); 914 } 915 } 916 return true; 917} 918 919static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 920 TCGReg r, TCGReg base, intptr_t offset) 921{ 922 if (have_avx2) { 923 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 924 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 925 r, 0, base, offset); 926 } else { 927 switch (vece) { 928 case MO_64: 929 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 930 break; 931 case MO_32: 932 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 933 break; 934 case MO_16: 935 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 936 tcg_out8(s, 0); /* imm8 */ 937 tcg_out_dup_vec(s, type, vece, r, r); 938 break; 939 case MO_8: 940 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 941 tcg_out8(s, 0); /* imm8 */ 942 tcg_out_dup_vec(s, type, vece, r, r); 943 break; 944 default: 945 g_assert_not_reached(); 946 } 947 } 948 return true; 949} 950 951static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 952 TCGReg ret, int64_t arg) 953{ 954 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 955 956 if (arg == 0) { 957 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 958 return; 959 } 960 if (arg == -1) { 961 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 962 return; 963 } 964 965 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 966 if (have_avx2) { 967 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 968 } else { 969 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 970 } 971 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 972 } else { 973 if (type == TCG_TYPE_V64) { 974 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 975 } else if (have_avx2) { 976 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 977 } else { 978 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 979 } 980 if (TCG_TARGET_REG_BITS == 64) { 981 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 982 } else { 983 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 984 } 985 } 986} 987 988static void tcg_out_movi_vec(TCGContext *s, TCGType type, 989 TCGReg ret, tcg_target_long arg) 990{ 991 if (arg == 0) { 992 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 993 return; 994 } 995 if (arg == -1) { 996 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 997 return; 998 } 999 1000 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1001 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1002 if (TCG_TARGET_REG_BITS == 64) { 1003 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1004 } else { 1005 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1006 } 1007} 1008 1009static void tcg_out_movi_int(TCGContext *s, TCGType type, 1010 TCGReg ret, tcg_target_long arg) 1011{ 1012 tcg_target_long diff; 1013 1014 if (arg == 0) { 1015 tgen_arithr(s, ARITH_XOR, ret, ret); 1016 return; 1017 } 1018 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1019 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1020 tcg_out32(s, arg); 1021 return; 1022 } 1023 if (arg == (int32_t)arg) { 1024 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1025 tcg_out32(s, arg); 1026 return; 1027 } 1028 1029 /* Try a 7 byte pc-relative lea before the 10 byte movq. 
*/ 1030 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1031 if (diff == (int32_t)diff) { 1032 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1033 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1034 tcg_out32(s, diff); 1035 return; 1036 } 1037 1038 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1039 tcg_out64(s, arg); 1040} 1041 1042static void tcg_out_movi(TCGContext *s, TCGType type, 1043 TCGReg ret, tcg_target_long arg) 1044{ 1045 switch (type) { 1046 case TCG_TYPE_I32: 1047#if TCG_TARGET_REG_BITS == 64 1048 case TCG_TYPE_I64: 1049#endif 1050 if (ret < 16) { 1051 tcg_out_movi_int(s, type, ret, arg); 1052 } else { 1053 tcg_out_movi_vec(s, type, ret, arg); 1054 } 1055 break; 1056 default: 1057 g_assert_not_reached(); 1058 } 1059} 1060 1061static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) 1062{ 1063 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1064 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2); 1065 return true; 1066} 1067 1068static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, 1069 tcg_target_long imm) 1070{ 1071 /* This function is only used for passing structs by reference. */ 1072 tcg_debug_assert(imm == (int32_t)imm); 1073 tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm); 1074} 1075 1076static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val) 1077{ 1078 if (val == (int8_t)val) { 1079 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0); 1080 tcg_out8(s, val); 1081 } else if (val == (int32_t)val) { 1082 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0); 1083 tcg_out32(s, val); 1084 } else { 1085 g_assert_not_reached(); 1086 } 1087} 1088 1089static inline void tcg_out_mb(TCGContext *s, TCGArg a0) 1090{ 1091 /* Given the strength of x86 memory ordering, we only need care for 1092 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is 1093 faster than "mfence", so don't bother with the sse insn. */ 1094 if (a0 & TCG_MO_ST_LD) { 1095 tcg_out8(s, 0xf0); 1096 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0); 1097 tcg_out8(s, 0); 1098 } 1099} 1100 1101static inline void tcg_out_push(TCGContext *s, int reg) 1102{ 1103 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0); 1104} 1105 1106static inline void tcg_out_pop(TCGContext *s, int reg) 1107{ 1108 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0); 1109} 1110 1111static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, 1112 TCGReg arg1, intptr_t arg2) 1113{ 1114 switch (type) { 1115 case TCG_TYPE_I32: 1116 if (ret < 16) { 1117 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2); 1118 } else { 1119 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2); 1120 } 1121 break; 1122 case TCG_TYPE_I64: 1123 if (ret < 16) { 1124 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2); 1125 break; 1126 } 1127 /* FALLTHRU */ 1128 case TCG_TYPE_V64: 1129 /* There is no instruction that can validate 8-byte alignment. */ 1130 tcg_debug_assert(ret >= 16); 1131 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2); 1132 break; 1133 case TCG_TYPE_V128: 1134 /* 1135 * The gvec infrastructure is asserts that v128 vector loads 1136 * and stores use a 16-byte aligned offset. Validate that the 1137 * final pointer is aligned by using an insn that will SIGSEGV. 1138 */ 1139 tcg_debug_assert(ret >= 16); 1140 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2); 1141 break; 1142 case TCG_TYPE_V256: 1143 /* 1144 * The gvec infrastructure only requires 16-byte alignment, 1145 * so here we must use an unaligned load. 
1146 */ 1147 tcg_debug_assert(ret >= 16); 1148 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL, 1149 ret, 0, arg1, arg2); 1150 break; 1151 default: 1152 g_assert_not_reached(); 1153 } 1154} 1155 1156static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, 1157 TCGReg arg1, intptr_t arg2) 1158{ 1159 switch (type) { 1160 case TCG_TYPE_I32: 1161 if (arg < 16) { 1162 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2); 1163 } else { 1164 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2); 1165 } 1166 break; 1167 case TCG_TYPE_I64: 1168 if (arg < 16) { 1169 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2); 1170 break; 1171 } 1172 /* FALLTHRU */ 1173 case TCG_TYPE_V64: 1174 /* There is no instruction that can validate 8-byte alignment. */ 1175 tcg_debug_assert(arg >= 16); 1176 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2); 1177 break; 1178 case TCG_TYPE_V128: 1179 /* 1180 * The gvec infrastructure is asserts that v128 vector loads 1181 * and stores use a 16-byte aligned offset. Validate that the 1182 * final pointer is aligned by using an insn that will SIGSEGV. 1183 * 1184 * This specific instance is also used by TCG_CALL_RET_BY_VEC, 1185 * for _WIN64, which must have SSE2 but may not have AVX. 1186 */ 1187 tcg_debug_assert(arg >= 16); 1188 if (have_avx1) { 1189 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2); 1190 } else { 1191 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2); 1192 } 1193 break; 1194 case TCG_TYPE_V256: 1195 /* 1196 * The gvec infrastructure only requires 16-byte alignment, 1197 * so here we must use an unaligned store. 1198 */ 1199 tcg_debug_assert(arg >= 16); 1200 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1201 arg, 0, arg1, arg2); 1202 break; 1203 default: 1204 g_assert_not_reached(); 1205 } 1206} 1207 1208static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1209 TCGReg base, intptr_t ofs) 1210{ 1211 int rexw = 0; 1212 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1213 if (val != (int32_t)val) { 1214 return false; 1215 } 1216 rexw = P_REXW; 1217 } else if (type != TCG_TYPE_I32) { 1218 return false; 1219 } 1220 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1221 tcg_out32(s, val); 1222 return true; 1223} 1224 1225static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1226{ 1227 /* Propagate an opcode prefix, such as P_DATA16. */ 1228 int ext = subopc & ~0x7; 1229 subopc &= 0x7; 1230 1231 if (count == 1) { 1232 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1233 } else { 1234 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1235 tcg_out8(s, count); 1236 } 1237} 1238 1239static inline void tcg_out_bswap32(TCGContext *s, int reg) 1240{ 1241 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1242} 1243 1244static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1245{ 1246 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1247} 1248 1249static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src) 1250{ 1251 /* movzbl */ 1252 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1253 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1254} 1255 1256static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1257{ 1258 int rexw = type == TCG_TYPE_I32 ? 
0 : P_REXW; 1259 /* movsbl */ 1260 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1261 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1262} 1263 1264static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1265{ 1266 /* movzwl */ 1267 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1268} 1269 1270static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1271{ 1272 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1273 /* movsw[lq] */ 1274 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1275} 1276 1277static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1278{ 1279 /* 32-bit mov zero extends. */ 1280 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1281} 1282 1283static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1284{ 1285 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1286 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1287} 1288 1289static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1290{ 1291 tcg_out_ext32s(s, dest, src); 1292} 1293 1294static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1295{ 1296 if (dest != src) { 1297 tcg_out_ext32u(s, dest, src); 1298 } 1299} 1300 1301static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1302{ 1303 tcg_out_ext32u(s, dest, src); 1304} 1305 1306static inline void tcg_out_bswap64(TCGContext *s, int reg) 1307{ 1308 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1309} 1310 1311static void tgen_arithi(TCGContext *s, int c, int r0, 1312 tcg_target_long val, int cf) 1313{ 1314 int rexw = 0; 1315 1316 if (TCG_TARGET_REG_BITS == 64) { 1317 rexw = c & -8; 1318 c &= 7; 1319 } 1320 1321 switch (c) { 1322 case ARITH_ADD: 1323 case ARITH_SUB: 1324 if (!cf) { 1325 /* 1326 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1327 * partial flags update stalls on Pentium4 and are not recommended 1328 * by current Intel optimization manuals. 1329 */ 1330 if (val == 1 || val == -1) { 1331 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1332 if (TCG_TARGET_REG_BITS == 64) { 1333 /* 1334 * The single-byte increment encodings are re-tasked 1335 * as the REX prefixes. Use the MODRM encoding. 1336 */ 1337 tcg_out_modrm(s, OPC_GRP5 + rexw, 1338 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1339 } else { 1340 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1341 } 1342 return; 1343 } 1344 if (val == 128) { 1345 /* 1346 * Facilitate using an 8-bit immediate. Carry is inverted 1347 * by this transformation, so do it only if cf == 0. 1348 */ 1349 c ^= ARITH_ADD ^ ARITH_SUB; 1350 val = -128; 1351 } 1352 } 1353 break; 1354 1355 case ARITH_AND: 1356 if (TCG_TARGET_REG_BITS == 64) { 1357 if (val == 0xffffffffu) { 1358 tcg_out_ext32u(s, r0, r0); 1359 return; 1360 } 1361 if (val == (uint32_t)val) { 1362 /* AND with no high bits set can use a 32-bit operation. 
*/ 1363 rexw = 0; 1364 } 1365 } 1366 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1367 tcg_out_ext8u(s, r0, r0); 1368 return; 1369 } 1370 if (val == 0xffffu) { 1371 tcg_out_ext16u(s, r0, r0); 1372 return; 1373 } 1374 break; 1375 1376 case ARITH_OR: 1377 case ARITH_XOR: 1378 if (val >= 0x80 && val <= 0xff 1379 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1380 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1381 tcg_out8(s, val); 1382 return; 1383 } 1384 break; 1385 } 1386 1387 if (val == (int8_t)val) { 1388 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1389 tcg_out8(s, val); 1390 return; 1391 } 1392 if (rexw == 0 || val == (int32_t)val) { 1393 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1394 tcg_out32(s, val); 1395 return; 1396 } 1397 1398 g_assert_not_reached(); 1399} 1400 1401static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1402{ 1403 if (val != 0) { 1404 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1405 } 1406} 1407 1408/* Set SMALL to force a short forward branch. */ 1409static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1410{ 1411 int32_t val, val1; 1412 1413 if (l->has_value) { 1414 val = tcg_pcrel_diff(s, l->u.value_ptr); 1415 val1 = val - 2; 1416 if ((int8_t)val1 == val1) { 1417 if (opc == -1) { 1418 tcg_out8(s, OPC_JMP_short); 1419 } else { 1420 tcg_out8(s, OPC_JCC_short + opc); 1421 } 1422 tcg_out8(s, val1); 1423 } else { 1424 tcg_debug_assert(!small); 1425 if (opc == -1) { 1426 tcg_out8(s, OPC_JMP_long); 1427 tcg_out32(s, val - 5); 1428 } else { 1429 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1430 tcg_out32(s, val - 6); 1431 } 1432 } 1433 } else if (small) { 1434 if (opc == -1) { 1435 tcg_out8(s, OPC_JMP_short); 1436 } else { 1437 tcg_out8(s, OPC_JCC_short + opc); 1438 } 1439 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1440 s->code_ptr += 1; 1441 } else { 1442 if (opc == -1) { 1443 tcg_out8(s, OPC_JMP_long); 1444 } else { 1445 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1446 } 1447 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1448 s->code_ptr += 4; 1449 } 1450} 1451 1452static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2, 1453 int const_arg2, int rexw) 1454{ 1455 if (const_arg2) { 1456 if (arg2 == 0) { 1457 /* test r, r */ 1458 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1459 } else { 1460 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1461 } 1462 } else { 1463 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1464 } 1465} 1466 1467static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1468 TCGArg arg1, TCGArg arg2, int const_arg2, 1469 TCGLabel *label, bool small) 1470{ 1471 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw); 1472 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small); 1473} 1474 1475#if TCG_TARGET_REG_BITS == 32 1476static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1477 const int *const_args, bool small) 1478{ 1479 TCGLabel *label_next = gen_new_label(); 1480 TCGLabel *label_this = arg_label(args[5]); 1481 1482 switch(args[4]) { 1483 case TCG_COND_EQ: 1484 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2], 1485 label_next, 1); 1486 tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3], 1487 label_this, small); 1488 break; 1489 case TCG_COND_NE: 1490 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2], 1491 label_this, small); 1492 tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3], 1493 label_this, small); 1494 break; 1495 case TCG_COND_LT: 1496 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], 
args[3], const_args[3], 1497 label_this, small); 1498 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1499 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1500 label_this, small); 1501 break; 1502 case TCG_COND_LE: 1503 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1504 label_this, small); 1505 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1506 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1507 label_this, small); 1508 break; 1509 case TCG_COND_GT: 1510 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1511 label_this, small); 1512 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1513 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1514 label_this, small); 1515 break; 1516 case TCG_COND_GE: 1517 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1518 label_this, small); 1519 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1520 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1521 label_this, small); 1522 break; 1523 case TCG_COND_LTU: 1524 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1525 label_this, small); 1526 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1527 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1528 label_this, small); 1529 break; 1530 case TCG_COND_LEU: 1531 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1532 label_this, small); 1533 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1534 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1535 label_this, small); 1536 break; 1537 case TCG_COND_GTU: 1538 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1539 label_this, small); 1540 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1541 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1542 label_this, small); 1543 break; 1544 case TCG_COND_GEU: 1545 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1546 label_this, small); 1547 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1548 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1549 label_this, small); 1550 break; 1551 default: 1552 g_assert_not_reached(); 1553 } 1554 tcg_out_label(s, label_next); 1555} 1556#endif 1557 1558static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond, 1559 TCGArg dest, TCGArg arg1, TCGArg arg2, 1560 int const_arg2, bool neg) 1561{ 1562 bool inv = false; 1563 bool cleared; 1564 1565 switch (cond) { 1566 case TCG_COND_NE: 1567 inv = true; 1568 /* fall through */ 1569 case TCG_COND_EQ: 1570 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1571 if (const_arg2 && arg2 == 0) { 1572 arg2 = 1; 1573 goto do_ltu; 1574 } 1575 break; 1576 1577 case TCG_COND_LEU: 1578 inv = true; 1579 /* fall through */ 1580 case TCG_COND_GTU: 1581 /* If arg2 is a register, swap for LTU/GEU. */ 1582 if (!const_arg2) { 1583 TCGReg t = arg1; 1584 arg1 = arg2; 1585 arg2 = t; 1586 goto do_ltu; 1587 } 1588 break; 1589 1590 case TCG_COND_GEU: 1591 inv = true; 1592 /* fall through */ 1593 case TCG_COND_LTU: 1594 do_ltu: 1595 /* 1596 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1597 * We can then use NEG or INC to produce the desired result. 1598 * This is always smaller than the SETCC expansion. 1599 */ 1600 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw); 1601 1602 /* X - X - C = -C = (C ? -1 : 0) */ 1603 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1604 if (inv && neg) { 1605 /* ~(C ? -1 : 0) = (C ? 
0 : -1) */ 1606 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1607 } else if (inv) { 1608 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1609 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1610 } else if (!neg) { 1611 /* -(C ? -1 : 0) = (C ? 1 : 0) */ 1612 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1613 } 1614 return; 1615 1616 case TCG_COND_GE: 1617 inv = true; 1618 /* fall through */ 1619 case TCG_COND_LT: 1620 /* If arg2 is 0, extract the sign bit. */ 1621 if (const_arg2 && arg2 == 0) { 1622 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1); 1623 if (inv) { 1624 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1625 } 1626 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1627 dest, rexw ? 63 : 31); 1628 return; 1629 } 1630 break; 1631 1632 default: 1633 break; 1634 } 1635 1636 /* 1637 * If dest does not overlap the inputs, clearing it first is preferred. 1638 * The XOR breaks any false dependency for the low-byte write to dest, 1639 * and is also one byte smaller than MOVZBL. 1640 */ 1641 cleared = false; 1642 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1643 tgen_arithr(s, ARITH_XOR, dest, dest); 1644 cleared = true; 1645 } 1646 1647 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw); 1648 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest); 1649 1650 if (!cleared) { 1651 tcg_out_ext8u(s, dest, dest); 1652 } 1653 if (neg) { 1654 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1655 } 1656} 1657 1658#if TCG_TARGET_REG_BITS == 32 1659static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1660 const int *const_args) 1661{ 1662 TCGArg new_args[6]; 1663 TCGLabel *label_true, *label_over; 1664 1665 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1666 1667 if (args[0] == args[1] || args[0] == args[2] 1668 || (!const_args[3] && args[0] == args[3]) 1669 || (!const_args[4] && args[0] == args[4])) { 1670 /* When the destination overlaps with one of the argument 1671 registers, don't do anything tricky. */ 1672 label_true = gen_new_label(); 1673 label_over = gen_new_label(); 1674 1675 new_args[5] = label_arg(label_true); 1676 tcg_out_brcond2(s, new_args, const_args+1, 1); 1677 1678 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1679 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1680 tcg_out_label(s, label_true); 1681 1682 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1683 tcg_out_label(s, label_over); 1684 } else { 1685 /* When the destination does not overlap one of the arguments, 1686 clear the destination first, jump if cond false, and emit an 1687 increment in the true case. This results in smaller code. 
*/ 1688 1689 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1690 1691 label_over = gen_new_label(); 1692 new_args[4] = tcg_invert_cond(new_args[4]); 1693 new_args[5] = label_arg(label_over); 1694 tcg_out_brcond2(s, new_args, const_args+1, 1); 1695 1696 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1697 tcg_out_label(s, label_over); 1698 } 1699} 1700#endif 1701 1702static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw, 1703 TCGReg dest, TCGReg v1) 1704{ 1705 if (have_cmov) { 1706 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1); 1707 } else { 1708 TCGLabel *over = gen_new_label(); 1709 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1); 1710 tcg_out_mov(s, TCG_TYPE_I32, dest, v1); 1711 tcg_out_label(s, over); 1712 } 1713} 1714 1715static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1716 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1717 TCGReg v1) 1718{ 1719 tcg_out_cmp(s, c1, c2, const_c2, rexw); 1720 tcg_out_cmov(s, cond, rexw, dest, v1); 1721} 1722 1723static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1724 TCGArg arg2, bool const_a2) 1725{ 1726 if (have_bmi1) { 1727 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1); 1728 if (const_a2) { 1729 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1730 } else { 1731 tcg_debug_assert(dest != arg2); 1732 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2); 1733 } 1734 } else { 1735 tcg_debug_assert(dest != arg2); 1736 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1); 1737 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2); 1738 } 1739} 1740 1741static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1742 TCGArg arg2, bool const_a2) 1743{ 1744 if (have_lzcnt) { 1745 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1); 1746 if (const_a2) { 1747 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1748 } else { 1749 tcg_debug_assert(dest != arg2); 1750 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2); 1751 } 1752 } else { 1753 tcg_debug_assert(!const_a2); 1754 tcg_debug_assert(dest != arg1); 1755 tcg_debug_assert(dest != arg2); 1756 1757 /* Recall that the output of BSR is the index not the count. */ 1758 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1); 1759 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0); 1760 1761 /* Since we have destroyed the flags from BSR, we have to re-test. */ 1762 tcg_out_cmp(s, arg1, 0, 1, rexw); 1763 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2); 1764 } 1765} 1766 1767static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1768{ 1769 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1770 1771 if (disp == (int32_t)disp) { 1772 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1773 tcg_out32(s, disp); 1774 } else { 1775 /* rip-relative addressing into the constant pool. 1776 This is 6 + 8 = 14 bytes, as compared to using an 1777 immediate load 10 + 6 = 16 bytes, plus we may 1778 be able to re-use the pool constant for more calls. */ 1779 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1780 tcg_out8(s, (call ? 
EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1781 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1782 tcg_out32(s, 0); 1783 } 1784} 1785 1786static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1787 const TCGHelperInfo *info) 1788{ 1789 tcg_out_branch(s, 1, dest); 1790 1791#ifndef _WIN32 1792 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1793 /* 1794 * The sysv i386 abi for struct return places a reference as the 1795 * first argument of the stack, and pops that argument with the 1796 * return statement. Since we want to retain the aligned stack 1797 * pointer for the callee, we do not want to actually push that 1798 * argument before the call but rely on the normal store to the 1799 * stack slot. But we do need to compensate for the pop in order 1800 * to reset our correct stack pointer value. 1801 * Pushing a garbage value back onto the stack is quickest. 1802 */ 1803 tcg_out_push(s, TCG_REG_EAX); 1804 } 1805#endif 1806} 1807 1808static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1809{ 1810 tcg_out_branch(s, 0, dest); 1811} 1812 1813static void tcg_out_nopn(TCGContext *s, int n) 1814{ 1815 int i; 1816 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1817 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1818 * duplicate prefix, and all of the interesting recent cores can 1819 * decode and discard the duplicates in a single cycle. 1820 */ 1821 tcg_debug_assert(n >= 1); 1822 for (i = 1; i < n; ++i) { 1823 tcg_out8(s, 0x66); 1824 } 1825 tcg_out8(s, 0x90); 1826} 1827 1828/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */ 1829static void __attribute__((unused)) 1830tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i) 1831{ 1832 /* 1833 * This is used for testing alignment, so we can usually use testb. 1834 * For i686, we have to use testl for %esi/%edi. 1835 */ 1836 if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) { 1837 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r); 1838 tcg_out8(s, i); 1839 } else { 1840 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r); 1841 tcg_out32(s, i); 1842 } 1843} 1844 1845typedef struct { 1846 TCGReg base; 1847 int index; 1848 int ofs; 1849 int seg; 1850 TCGAtomAlign aa; 1851} HostAddress; 1852 1853bool tcg_target_has_memory_bswap(MemOp memop) 1854{ 1855 TCGAtomAlign aa; 1856 1857 if (!have_movbe) { 1858 return false; 1859 } 1860 if ((memop & MO_SIZE) < MO_128) { 1861 return true; 1862 } 1863 1864 /* 1865 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 1866 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 1867 */ 1868 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 1869 return aa.atom < MO_128; 1870} 1871 1872/* 1873 * Because i686 has no register parameters and because x86_64 has xchg 1874 * to handle addr/data register overlap, we have placed all input arguments 1875 * before we need might need a scratch reg. 1876 * 1877 * Even then, a scratch is only needed for l->raddr. Rather than expose 1878 * a general-purpose scratch when we don't actually know it's available, 1879 * use the ra_gen hook to load into RAX if needed. 
1880 */ 1881#if TCG_TARGET_REG_BITS == 64 1882static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 1883{ 1884 if (arg < 0) { 1885 arg = TCG_REG_RAX; 1886 } 1887 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 1888 return arg; 1889} 1890static const TCGLdstHelperParam ldst_helper_param = { 1891 .ra_gen = ldst_ra_gen 1892}; 1893#else 1894static const TCGLdstHelperParam ldst_helper_param = { }; 1895#endif 1896 1897static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 1898 TCGReg l, TCGReg h, TCGReg v) 1899{ 1900 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1901 1902 /* vpmov{d,q} %v, %l */ 1903 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 1904 /* vpextr{d,q} $1, %v, %h */ 1905 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 1906 tcg_out8(s, 1); 1907} 1908 1909static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 1910 TCGReg v, TCGReg l, TCGReg h) 1911{ 1912 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1913 1914 /* vmov{d,q} %l, %v */ 1915 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 1916 /* vpinsr{d,q} $1, %h, %v, %v */ 1917 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 1918 tcg_out8(s, 1); 1919} 1920 1921/* 1922 * Generate code for the slow path for a load at the end of block 1923 */ 1924static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1925{ 1926 MemOp opc = get_memop(l->oi); 1927 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1928 1929 /* resolve label address */ 1930 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1931 if (label_ptr[1]) { 1932 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1933 } 1934 1935 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 1936 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 1937 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 1938 1939 tcg_out_jmp(s, l->raddr); 1940 return true; 1941} 1942 1943/* 1944 * Generate code for the slow path for a store at the end of block 1945 */ 1946static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1947{ 1948 MemOp opc = get_memop(l->oi); 1949 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1950 1951 /* resolve label address */ 1952 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1953 if (label_ptr[1]) { 1954 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1955 } 1956 1957 tcg_out_st_helper_args(s, l, &ldst_helper_param); 1958 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 1959 1960 tcg_out_jmp(s, l->raddr); 1961 return true; 1962} 1963 1964#ifdef CONFIG_USER_ONLY 1965static HostAddress x86_guest_base = { 1966 .index = -1 1967}; 1968 1969#if defined(__x86_64__) && defined(__linux__) 1970# include <asm/prctl.h> 1971# include <sys/prctl.h> 1972int arch_prctl(int code, unsigned long addr); 1973static inline int setup_guest_base_seg(void) 1974{ 1975 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 1976 return P_GS; 1977 } 1978 return 0; 1979} 1980#define setup_guest_base_seg setup_guest_base_seg 1981#elif defined(__x86_64__) && \ 1982 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 1983# include <machine/sysarch.h> 1984static inline int setup_guest_base_seg(void) 1985{ 1986 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 1987 return P_GS; 1988 } 1989 return 0; 1990} 1991#define setup_guest_base_seg setup_guest_base_seg 1992#endif 1993#else 1994# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 1995#endif /* CONFIG_USER_ONLY */ 1996#ifndef setup_guest_base_seg 1997# define setup_guest_base_seg() 0 1998#endif 1999 
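/*
 * If one of the setup_guest_base_seg() variants above succeeds, the
 * returned P_GS prefix can be carried in x86_guest_base.seg, and the
 * direct load/store emitters below simply add h.seg to each opcode.
 * The CPU then adds the %gs base implicitly, so the user-only fast
 * path needs no explicit addition of guest_base.
 */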
2000#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2001 2002/* 2003 * For softmmu, perform the TLB load and compare. 2004 * For useronly, perform any required alignment tests. 2005 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2006 * is required and fill in @h with the host address for the fast path. 2007 */ 2008static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2009 TCGReg addrlo, TCGReg addrhi, 2010 MemOpIdx oi, bool is_ld) 2011{ 2012 TCGLabelQemuLdst *ldst = NULL; 2013 MemOp opc = get_memop(oi); 2014 MemOp s_bits = opc & MO_SIZE; 2015 unsigned a_mask; 2016 2017 if (tcg_use_softmmu) { 2018 h->index = TCG_REG_L0; 2019 h->ofs = 0; 2020 h->seg = 0; 2021 } else { 2022 *h = x86_guest_base; 2023 } 2024 h->base = addrlo; 2025 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2026 a_mask = (1 << h->aa.align) - 1; 2027 2028 if (tcg_use_softmmu) { 2029 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2030 : offsetof(CPUTLBEntry, addr_write); 2031 TCGType ttype = TCG_TYPE_I32; 2032 TCGType tlbtype = TCG_TYPE_I32; 2033 int trexw = 0, hrexw = 0, tlbrexw = 0; 2034 unsigned mem_index = get_mmuidx(oi); 2035 unsigned s_mask = (1 << s_bits) - 1; 2036 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2037 int tlb_mask; 2038 2039 ldst = new_ldst_label(s); 2040 ldst->is_ld = is_ld; 2041 ldst->oi = oi; 2042 ldst->addrlo_reg = addrlo; 2043 ldst->addrhi_reg = addrhi; 2044 2045 if (TCG_TARGET_REG_BITS == 64) { 2046 ttype = s->addr_type; 2047 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2048 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2049 hrexw = P_REXW; 2050 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2051 tlbtype = TCG_TYPE_I64; 2052 tlbrexw = P_REXW; 2053 } 2054 } 2055 } 2056 2057 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 2058 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2059 s->page_bits - CPU_TLB_ENTRY_BITS); 2060 2061 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2062 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2063 2064 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2065 fast_ofs + offsetof(CPUTLBDescFast, table)); 2066 2067 /* 2068 * If the required alignment is at least as large as the access, 2069 * simply copy the address and mask. For lesser alignments, 2070 * check that we don't cross pages for the complete access. 2071 */ 2072 if (a_mask >= s_mask) { 2073 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 2074 } else { 2075 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2076 addrlo, s_mask - a_mask); 2077 } 2078 tlb_mask = s->page_mask | a_mask; 2079 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2080 2081 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2082 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2083 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2084 2085 /* jne slow_path */ 2086 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2087 ldst->label_ptr[0] = s->code_ptr; 2088 s->code_ptr += 4; 2089 2090 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { 2091 /* cmp 4(TCG_REG_L0), addrhi */ 2092 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, 2093 TCG_REG_L0, cmp_ofs + 4); 2094 2095 /* jne slow_path */ 2096 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2097 ldst->label_ptr[1] = s->code_ptr; 2098 s->code_ptr += 4; 2099 } 2100 2101 /* TLB Hit. 
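         * The addend loaded just below, added to the guest address via
         * SIB addressing (base = addrlo, index = TCG_REG_L0, as set up
         * at the top of this function), yields the host address with no
         * separate add instruction on the fast path.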
*/ 2102 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2103 offsetof(CPUTLBEntry, addend)); 2104 } else if (a_mask) { 2105 ldst = new_ldst_label(s); 2106 2107 ldst->is_ld = is_ld; 2108 ldst->oi = oi; 2109 ldst->addrlo_reg = addrlo; 2110 ldst->addrhi_reg = addrhi; 2111 2112 tcg_out_testi(s, addrlo, a_mask); 2113 /* jne slow_path */ 2114 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2115 ldst->label_ptr[0] = s->code_ptr; 2116 s->code_ptr += 4; 2117 } 2118 2119 return ldst; 2120} 2121 2122static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2123 HostAddress h, TCGType type, MemOp memop) 2124{ 2125 bool use_movbe = false; 2126 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2127 int movop = OPC_MOVL_GvEv; 2128 2129 /* Do big-endian loads with movbe. */ 2130 if (memop & MO_BSWAP) { 2131 tcg_debug_assert(have_movbe); 2132 use_movbe = true; 2133 movop = OPC_MOVBE_GyMy; 2134 } 2135 2136 switch (memop & MO_SSIZE) { 2137 case MO_UB: 2138 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2139 h.base, h.index, 0, h.ofs); 2140 break; 2141 case MO_SB: 2142 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2143 h.base, h.index, 0, h.ofs); 2144 break; 2145 case MO_UW: 2146 if (use_movbe) { 2147 /* There is no extending movbe; only low 16-bits are modified. */ 2148 if (datalo != h.base && datalo != h.index) { 2149 /* XOR breaks dependency chains. */ 2150 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2151 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2152 datalo, h.base, h.index, 0, h.ofs); 2153 } else { 2154 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2155 datalo, h.base, h.index, 0, h.ofs); 2156 tcg_out_ext16u(s, datalo, datalo); 2157 } 2158 } else { 2159 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2160 h.base, h.index, 0, h.ofs); 2161 } 2162 break; 2163 case MO_SW: 2164 if (use_movbe) { 2165 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2166 datalo, h.base, h.index, 0, h.ofs); 2167 tcg_out_ext16s(s, type, datalo, datalo); 2168 } else { 2169 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2170 datalo, h.base, h.index, 0, h.ofs); 2171 } 2172 break; 2173 case MO_UL: 2174 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2175 h.base, h.index, 0, h.ofs); 2176 break; 2177#if TCG_TARGET_REG_BITS == 64 2178 case MO_SL: 2179 if (use_movbe) { 2180 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2181 h.base, h.index, 0, h.ofs); 2182 tcg_out_ext32s(s, datalo, datalo); 2183 } else { 2184 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2185 h.base, h.index, 0, h.ofs); 2186 } 2187 break; 2188#endif 2189 case MO_UQ: 2190 if (TCG_TARGET_REG_BITS == 64) { 2191 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2192 h.base, h.index, 0, h.ofs); 2193 break; 2194 } 2195 if (use_movbe) { 2196 TCGReg t = datalo; 2197 datalo = datahi; 2198 datahi = t; 2199 } 2200 if (h.base == datalo || h.index == datalo) { 2201 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2202 h.base, h.index, 0, h.ofs); 2203 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2204 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2205 } else { 2206 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2207 h.base, h.index, 0, h.ofs); 2208 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2209 h.base, h.index, 0, h.ofs + 4); 2210 } 2211 break; 2212 2213 case MO_128: 2214 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2215 2216 /* 2217 * Without 16-byte atomicity, use integer regs. 
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}

static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addrlo, TCGReg addrhi,
                            MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}

static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or system-mode.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32.
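         * On i386, only %eax..%edx have addressable low-byte registers
         * (%al..%dl), so the 's' constraint used for the qemu_st8 ops
         * keeps the store data in one of those; hence the assert below:
         * on 32-bit hosts the data register must be one of the four,
         * while on x86_64 the REX prefix (P_REXB_R) makes any register
         * usable.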
         */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}

static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addrlo, TCGReg addrhi,
                            MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}

static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
    /* Reuse the zeroing that exists for goto_ptr.
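     * The goto_ptr epilogue path already leaves 0 in TCG_REG_EAX, so
     * exit_tb(0) can be a single jmp there; only a nonzero return value
     * needs the movi into EAX before jumping to tb_ret_addr.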
*/ 2431 if (a0 == 0) { 2432 tcg_out_jmp(s, tcg_code_gen_epilogue); 2433 } else { 2434 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2435 tcg_out_jmp(s, tb_ret_addr); 2436 } 2437} 2438 2439static void tcg_out_goto_tb(TCGContext *s, int which) 2440{ 2441 /* 2442 * Jump displacement must be aligned for atomic patching; 2443 * see if we need to add extra nops before jump 2444 */ 2445 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2446 if (gap != 1) { 2447 tcg_out_nopn(s, gap - 1); 2448 } 2449 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2450 set_jmp_insn_offset(s, which); 2451 tcg_out32(s, 0); 2452 set_jmp_reset_offset(s, which); 2453} 2454 2455void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2456 uintptr_t jmp_rx, uintptr_t jmp_rw) 2457{ 2458 /* patch the branch destination */ 2459 uintptr_t addr = tb->jmp_target_addr[n]; 2460 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2461 /* no need to flush icache explicitly */ 2462} 2463 2464static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2465 const TCGArg args[TCG_MAX_OP_ARGS], 2466 const int const_args[TCG_MAX_OP_ARGS]) 2467{ 2468 TCGArg a0, a1, a2; 2469 int c, const_a2, vexop, rexw = 0; 2470 2471#if TCG_TARGET_REG_BITS == 64 2472# define OP_32_64(x) \ 2473 case glue(glue(INDEX_op_, x), _i64): \ 2474 rexw = P_REXW; /* FALLTHRU */ \ 2475 case glue(glue(INDEX_op_, x), _i32) 2476#else 2477# define OP_32_64(x) \ 2478 case glue(glue(INDEX_op_, x), _i32) 2479#endif 2480 2481 /* Hoist the loads of the most common arguments. */ 2482 a0 = args[0]; 2483 a1 = args[1]; 2484 a2 = args[2]; 2485 const_a2 = const_args[2]; 2486 2487 switch (opc) { 2488 case INDEX_op_goto_ptr: 2489 /* jmp to the given host address (could be epilogue) */ 2490 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2491 break; 2492 case INDEX_op_br: 2493 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2494 break; 2495 OP_32_64(ld8u): 2496 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2497 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2498 break; 2499 OP_32_64(ld8s): 2500 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2501 break; 2502 OP_32_64(ld16u): 2503 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2504 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2505 break; 2506 OP_32_64(ld16s): 2507 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2508 break; 2509#if TCG_TARGET_REG_BITS == 64 2510 case INDEX_op_ld32u_i64: 2511#endif 2512 case INDEX_op_ld_i32: 2513 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2514 break; 2515 2516 OP_32_64(st8): 2517 if (const_args[0]) { 2518 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2519 tcg_out8(s, a0); 2520 } else { 2521 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2522 } 2523 break; 2524 OP_32_64(st16): 2525 if (const_args[0]) { 2526 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2527 tcg_out16(s, a0); 2528 } else { 2529 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2530 } 2531 break; 2532#if TCG_TARGET_REG_BITS == 64 2533 case INDEX_op_st32_i64: 2534#endif 2535 case INDEX_op_st_i32: 2536 if (const_args[0]) { 2537 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2538 tcg_out32(s, a0); 2539 } else { 2540 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2541 } 2542 break; 2543 2544 OP_32_64(add): 2545 /* For 3-operand addition, use LEA. 
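         * e.g. add a0,a1,a2 with all-distinct registers becomes
         * lea (%a1,%a2), %a0, and add a0,a1,$imm becomes lea imm(%a1), %a0;
         * only when the destination aliases an input do we fall back to
         * the two-operand ADD forms below.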
*/ 2546 if (a0 != a1) { 2547 TCGArg c3 = 0; 2548 if (const_a2) { 2549 c3 = a2, a2 = -1; 2550 } else if (a0 == a2) { 2551 /* Watch out for dest = src + dest, since we've removed 2552 the matching constraint on the add. */ 2553 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2554 break; 2555 } 2556 2557 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2558 break; 2559 } 2560 c = ARITH_ADD; 2561 goto gen_arith; 2562 OP_32_64(sub): 2563 c = ARITH_SUB; 2564 goto gen_arith; 2565 OP_32_64(and): 2566 c = ARITH_AND; 2567 goto gen_arith; 2568 OP_32_64(or): 2569 c = ARITH_OR; 2570 goto gen_arith; 2571 OP_32_64(xor): 2572 c = ARITH_XOR; 2573 goto gen_arith; 2574 gen_arith: 2575 if (const_a2) { 2576 tgen_arithi(s, c + rexw, a0, a2, 0); 2577 } else { 2578 tgen_arithr(s, c + rexw, a0, a2); 2579 } 2580 break; 2581 2582 OP_32_64(andc): 2583 if (const_a2) { 2584 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2585 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2586 } else { 2587 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2588 } 2589 break; 2590 2591 OP_32_64(mul): 2592 if (const_a2) { 2593 int32_t val; 2594 val = a2; 2595 if (val == (int8_t)val) { 2596 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2597 tcg_out8(s, val); 2598 } else { 2599 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2600 tcg_out32(s, val); 2601 } 2602 } else { 2603 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2604 } 2605 break; 2606 2607 OP_32_64(div2): 2608 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2609 break; 2610 OP_32_64(divu2): 2611 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2612 break; 2613 2614 OP_32_64(shl): 2615 /* For small constant 3-operand shift, use LEA. */ 2616 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2617 if (a2 - 1 == 0) { 2618 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2619 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2620 } else { 2621 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2622 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2623 } 2624 break; 2625 } 2626 c = SHIFT_SHL; 2627 vexop = OPC_SHLX; 2628 goto gen_shift_maybe_vex; 2629 OP_32_64(shr): 2630 c = SHIFT_SHR; 2631 vexop = OPC_SHRX; 2632 goto gen_shift_maybe_vex; 2633 OP_32_64(sar): 2634 c = SHIFT_SAR; 2635 vexop = OPC_SARX; 2636 goto gen_shift_maybe_vex; 2637 OP_32_64(rotl): 2638 c = SHIFT_ROL; 2639 goto gen_shift; 2640 OP_32_64(rotr): 2641 c = SHIFT_ROR; 2642 goto gen_shift; 2643 gen_shift_maybe_vex: 2644 if (have_bmi2) { 2645 if (!const_a2) { 2646 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2647 break; 2648 } 2649 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2650 } 2651 /* FALLTHRU */ 2652 gen_shift: 2653 if (const_a2) { 2654 tcg_out_shifti(s, c + rexw, a0, a2); 2655 } else { 2656 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2657 } 2658 break; 2659 2660 OP_32_64(ctz): 2661 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2662 break; 2663 OP_32_64(clz): 2664 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2665 break; 2666 OP_32_64(ctpop): 2667 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2668 break; 2669 2670 OP_32_64(brcond): 2671 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 2672 arg_label(args[3]), 0); 2673 break; 2674 OP_32_64(setcond): 2675 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false); 2676 break; 2677 OP_32_64(negsetcond): 2678 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true); 2679 break; 2680 OP_32_64(movcond): 2681 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 2682 break; 2683 2684 OP_32_64(bswap16): 2685 if (a2 & TCG_BSWAP_OS) { 2686 /* Output must be sign-extended. */ 2687 if (rexw) { 2688 tcg_out_bswap64(s, a0); 2689 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2690 } else { 2691 tcg_out_bswap32(s, a0); 2692 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2693 } 2694 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2695 /* Output must be zero-extended, but input isn't. */ 2696 tcg_out_bswap32(s, a0); 2697 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2698 } else { 2699 tcg_out_rolw_8(s, a0); 2700 } 2701 break; 2702 OP_32_64(bswap32): 2703 tcg_out_bswap32(s, a0); 2704 if (rexw && (a2 & TCG_BSWAP_OS)) { 2705 tcg_out_ext32s(s, a0, a0); 2706 } 2707 break; 2708 2709 OP_32_64(neg): 2710 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2711 break; 2712 OP_32_64(not): 2713 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2714 break; 2715 2716 case INDEX_op_qemu_ld_a64_i32: 2717 if (TCG_TARGET_REG_BITS == 32) { 2718 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2719 break; 2720 } 2721 /* fall through */ 2722 case INDEX_op_qemu_ld_a32_i32: 2723 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2724 break; 2725 case INDEX_op_qemu_ld_a32_i64: 2726 if (TCG_TARGET_REG_BITS == 64) { 2727 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2728 } else { 2729 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2730 } 2731 break; 2732 case INDEX_op_qemu_ld_a64_i64: 2733 if (TCG_TARGET_REG_BITS == 64) { 2734 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2735 } else { 2736 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2737 } 2738 break; 2739 case INDEX_op_qemu_ld_a32_i128: 2740 case INDEX_op_qemu_ld_a64_i128: 2741 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2742 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2743 break; 2744 2745 case INDEX_op_qemu_st_a64_i32: 2746 case INDEX_op_qemu_st8_a64_i32: 2747 if (TCG_TARGET_REG_BITS == 32) { 2748 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2749 break; 2750 } 2751 /* fall through */ 2752 case INDEX_op_qemu_st_a32_i32: 2753 case INDEX_op_qemu_st8_a32_i32: 2754 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2755 break; 2756 case INDEX_op_qemu_st_a32_i64: 2757 if (TCG_TARGET_REG_BITS == 64) { 2758 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2759 } else { 2760 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2761 } 2762 break; 2763 case INDEX_op_qemu_st_a64_i64: 2764 if (TCG_TARGET_REG_BITS == 64) { 2765 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2766 } else { 2767 
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2768 } 2769 break; 2770 case INDEX_op_qemu_st_a32_i128: 2771 case INDEX_op_qemu_st_a64_i128: 2772 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2773 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2774 break; 2775 2776 OP_32_64(mulu2): 2777 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2778 break; 2779 OP_32_64(muls2): 2780 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2781 break; 2782 OP_32_64(add2): 2783 if (const_args[4]) { 2784 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2785 } else { 2786 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2787 } 2788 if (const_args[5]) { 2789 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2790 } else { 2791 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2792 } 2793 break; 2794 OP_32_64(sub2): 2795 if (const_args[4]) { 2796 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2797 } else { 2798 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2799 } 2800 if (const_args[5]) { 2801 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2802 } else { 2803 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2804 } 2805 break; 2806 2807#if TCG_TARGET_REG_BITS == 32 2808 case INDEX_op_brcond2_i32: 2809 tcg_out_brcond2(s, args, const_args, 0); 2810 break; 2811 case INDEX_op_setcond2_i32: 2812 tcg_out_setcond2(s, args, const_args); 2813 break; 2814#else /* TCG_TARGET_REG_BITS == 64 */ 2815 case INDEX_op_ld32s_i64: 2816 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2817 break; 2818 case INDEX_op_ld_i64: 2819 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2820 break; 2821 case INDEX_op_st_i64: 2822 if (const_args[0]) { 2823 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2824 tcg_out32(s, a0); 2825 } else { 2826 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2827 } 2828 break; 2829 2830 case INDEX_op_bswap64_i64: 2831 tcg_out_bswap64(s, a0); 2832 break; 2833 case INDEX_op_extrh_i64_i32: 2834 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2835 break; 2836#endif 2837 2838 OP_32_64(deposit): 2839 if (args[3] == 0 && args[4] == 8) { 2840 /* load bits 0..7 */ 2841 if (const_a2) { 2842 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 2843 0, a0, 0); 2844 tcg_out8(s, a2); 2845 } else { 2846 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2847 } 2848 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 2849 /* load bits 8..15 */ 2850 if (const_a2) { 2851 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 2852 tcg_out8(s, a2); 2853 } else { 2854 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2855 } 2856 } else if (args[3] == 0 && args[4] == 16) { 2857 /* load bits 0..15 */ 2858 if (const_a2) { 2859 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 2860 0, a0, 0); 2861 tcg_out16(s, a2); 2862 } else { 2863 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2864 } 2865 } else { 2866 g_assert_not_reached(); 2867 } 2868 break; 2869 2870 case INDEX_op_extract_i64: 2871 if (a2 + args[3] == 32) { 2872 /* This is a 32-bit zero-extending right shift. */ 2873 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2874 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2875 break; 2876 } 2877 /* FALLTHRU */ 2878 case INDEX_op_extract_i32: 2879 /* On the off-chance that we can use the high-byte registers. 2880 Otherwise we emit the same ext16 + shift pattern that we 2881 would have gotten from the normal tcg-op.c expansion. 
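           For example, extracting bits 8..15 of %eax into %ecx can be a
           single "movzbl %ah, %ecx" when the high-byte register encoding
           is available (a1 < 4, no REX needed); otherwise it is the
           movzwl-then-shr-by-8 sequence below.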
*/ 2882 tcg_debug_assert(a2 == 8 && args[3] == 8); 2883 if (a1 < 4 && a0 < 8) { 2884 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2885 } else { 2886 tcg_out_ext16u(s, a0, a1); 2887 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2888 } 2889 break; 2890 2891 case INDEX_op_sextract_i32: 2892 /* We don't implement sextract_i64, as we cannot sign-extend to 2893 64-bits without using the REX prefix that explicitly excludes 2894 access to the high-byte registers. */ 2895 tcg_debug_assert(a2 == 8 && args[3] == 8); 2896 if (a1 < 4 && a0 < 8) { 2897 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2898 } else { 2899 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 2900 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2901 } 2902 break; 2903 2904 OP_32_64(extract2): 2905 /* Note that SHRD outputs to the r/m operand. */ 2906 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 2907 tcg_out8(s, args[3]); 2908 break; 2909 2910 case INDEX_op_mb: 2911 tcg_out_mb(s, a0); 2912 break; 2913 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 2914 case INDEX_op_mov_i64: 2915 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 2916 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 2917 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 2918 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */ 2919 case INDEX_op_ext8s_i64: 2920 case INDEX_op_ext8u_i32: 2921 case INDEX_op_ext8u_i64: 2922 case INDEX_op_ext16s_i32: 2923 case INDEX_op_ext16s_i64: 2924 case INDEX_op_ext16u_i32: 2925 case INDEX_op_ext16u_i64: 2926 case INDEX_op_ext32s_i64: 2927 case INDEX_op_ext32u_i64: 2928 case INDEX_op_ext_i32_i64: 2929 case INDEX_op_extu_i32_i64: 2930 case INDEX_op_extrl_i64_i32: 2931 default: 2932 g_assert_not_reached(); 2933 } 2934 2935#undef OP_32_64 2936} 2937 2938static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 2939 unsigned vecl, unsigned vece, 2940 const TCGArg args[TCG_MAX_OP_ARGS], 2941 const int const_args[TCG_MAX_OP_ARGS]) 2942{ 2943 static int const add_insn[4] = { 2944 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 2945 }; 2946 static int const ssadd_insn[4] = { 2947 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 2948 }; 2949 static int const usadd_insn[4] = { 2950 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 2951 }; 2952 static int const sub_insn[4] = { 2953 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 2954 }; 2955 static int const sssub_insn[4] = { 2956 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 2957 }; 2958 static int const ussub_insn[4] = { 2959 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 2960 }; 2961 static int const mul_insn[4] = { 2962 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 2963 }; 2964 static int const shift_imm_insn[4] = { 2965 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 2966 }; 2967 static int const cmpeq_insn[4] = { 2968 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 2969 }; 2970 static int const cmpgt_insn[4] = { 2971 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 2972 }; 2973 static int const punpckl_insn[4] = { 2974 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 2975 }; 2976 static int const punpckh_insn[4] = { 2977 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 2978 }; 2979 static int const packss_insn[4] = { 2980 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 2981 }; 2982 static int const packus_insn[4] = { 2983 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 2984 }; 2985 static int const smin_insn[4] = { 2986 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 2987 }; 2988 static int const smax_insn[4] = { 2989 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 
2990 }; 2991 static int const umin_insn[4] = { 2992 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 2993 }; 2994 static int const umax_insn[4] = { 2995 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 2996 }; 2997 static int const rotlv_insn[4] = { 2998 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 2999 }; 3000 static int const rotrv_insn[4] = { 3001 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3002 }; 3003 static int const shlv_insn[4] = { 3004 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3005 }; 3006 static int const shrv_insn[4] = { 3007 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3008 }; 3009 static int const sarv_insn[4] = { 3010 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3011 }; 3012 static int const shls_insn[4] = { 3013 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3014 }; 3015 static int const shrs_insn[4] = { 3016 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3017 }; 3018 static int const sars_insn[4] = { 3019 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3020 }; 3021 static int const vpshldi_insn[4] = { 3022 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3023 }; 3024 static int const vpshldv_insn[4] = { 3025 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3026 }; 3027 static int const vpshrdv_insn[4] = { 3028 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3029 }; 3030 static int const abs_insn[4] = { 3031 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3032 }; 3033 3034 TCGType type = vecl + TCG_TYPE_V64; 3035 int insn, sub; 3036 TCGArg a0, a1, a2, a3; 3037 3038 a0 = args[0]; 3039 a1 = args[1]; 3040 a2 = args[2]; 3041 3042 switch (opc) { 3043 case INDEX_op_add_vec: 3044 insn = add_insn[vece]; 3045 goto gen_simd; 3046 case INDEX_op_ssadd_vec: 3047 insn = ssadd_insn[vece]; 3048 goto gen_simd; 3049 case INDEX_op_usadd_vec: 3050 insn = usadd_insn[vece]; 3051 goto gen_simd; 3052 case INDEX_op_sub_vec: 3053 insn = sub_insn[vece]; 3054 goto gen_simd; 3055 case INDEX_op_sssub_vec: 3056 insn = sssub_insn[vece]; 3057 goto gen_simd; 3058 case INDEX_op_ussub_vec: 3059 insn = ussub_insn[vece]; 3060 goto gen_simd; 3061 case INDEX_op_mul_vec: 3062 insn = mul_insn[vece]; 3063 goto gen_simd; 3064 case INDEX_op_and_vec: 3065 insn = OPC_PAND; 3066 goto gen_simd; 3067 case INDEX_op_or_vec: 3068 insn = OPC_POR; 3069 goto gen_simd; 3070 case INDEX_op_xor_vec: 3071 insn = OPC_PXOR; 3072 goto gen_simd; 3073 case INDEX_op_smin_vec: 3074 insn = smin_insn[vece]; 3075 goto gen_simd; 3076 case INDEX_op_umin_vec: 3077 insn = umin_insn[vece]; 3078 goto gen_simd; 3079 case INDEX_op_smax_vec: 3080 insn = smax_insn[vece]; 3081 goto gen_simd; 3082 case INDEX_op_umax_vec: 3083 insn = umax_insn[vece]; 3084 goto gen_simd; 3085 case INDEX_op_shlv_vec: 3086 insn = shlv_insn[vece]; 3087 goto gen_simd; 3088 case INDEX_op_shrv_vec: 3089 insn = shrv_insn[vece]; 3090 goto gen_simd; 3091 case INDEX_op_sarv_vec: 3092 insn = sarv_insn[vece]; 3093 goto gen_simd; 3094 case INDEX_op_rotlv_vec: 3095 insn = rotlv_insn[vece]; 3096 goto gen_simd; 3097 case INDEX_op_rotrv_vec: 3098 insn = rotrv_insn[vece]; 3099 goto gen_simd; 3100 case INDEX_op_shls_vec: 3101 insn = shls_insn[vece]; 3102 goto gen_simd; 3103 case INDEX_op_shrs_vec: 3104 insn = shrs_insn[vece]; 3105 goto gen_simd; 3106 case INDEX_op_sars_vec: 3107 insn = sars_insn[vece]; 3108 goto gen_simd; 3109 case INDEX_op_x86_punpckl_vec: 3110 insn = punpckl_insn[vece]; 3111 goto gen_simd; 3112 case INDEX_op_x86_punpckh_vec: 3113 insn = punpckh_insn[vece]; 3114 goto gen_simd; 3115 case INDEX_op_x86_packss_vec: 3116 insn = packss_insn[vece]; 3117 goto gen_simd; 3118 case 
INDEX_op_x86_packus_vec: 3119 insn = packus_insn[vece]; 3120 goto gen_simd; 3121 case INDEX_op_x86_vpshldv_vec: 3122 insn = vpshldv_insn[vece]; 3123 a1 = a2; 3124 a2 = args[3]; 3125 goto gen_simd; 3126 case INDEX_op_x86_vpshrdv_vec: 3127 insn = vpshrdv_insn[vece]; 3128 a1 = a2; 3129 a2 = args[3]; 3130 goto gen_simd; 3131#if TCG_TARGET_REG_BITS == 32 3132 case INDEX_op_dup2_vec: 3133 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3134 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3135 /* Then replicate the 64-bit elements across the rest of the vector. */ 3136 if (type != TCG_TYPE_V64) { 3137 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3138 } 3139 break; 3140#endif 3141 case INDEX_op_abs_vec: 3142 insn = abs_insn[vece]; 3143 a2 = a1; 3144 a1 = 0; 3145 goto gen_simd; 3146 gen_simd: 3147 tcg_debug_assert(insn != OPC_UD2); 3148 if (type == TCG_TYPE_V256) { 3149 insn |= P_VEXL; 3150 } 3151 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3152 break; 3153 3154 case INDEX_op_cmp_vec: 3155 sub = args[3]; 3156 if (sub == TCG_COND_EQ) { 3157 insn = cmpeq_insn[vece]; 3158 } else if (sub == TCG_COND_GT) { 3159 insn = cmpgt_insn[vece]; 3160 } else { 3161 g_assert_not_reached(); 3162 } 3163 goto gen_simd; 3164 3165 case INDEX_op_andc_vec: 3166 insn = OPC_PANDN; 3167 if (type == TCG_TYPE_V256) { 3168 insn |= P_VEXL; 3169 } 3170 tcg_out_vex_modrm(s, insn, a0, a2, a1); 3171 break; 3172 3173 case INDEX_op_shli_vec: 3174 insn = shift_imm_insn[vece]; 3175 sub = 6; 3176 goto gen_shift; 3177 case INDEX_op_shri_vec: 3178 insn = shift_imm_insn[vece]; 3179 sub = 2; 3180 goto gen_shift; 3181 case INDEX_op_sari_vec: 3182 if (vece == MO_64) { 3183 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3184 } else { 3185 insn = shift_imm_insn[vece]; 3186 } 3187 sub = 4; 3188 goto gen_shift; 3189 case INDEX_op_rotli_vec: 3190 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3191 if (vece == MO_64) { 3192 insn |= P_VEXW; 3193 } 3194 sub = 1; 3195 goto gen_shift; 3196 gen_shift: 3197 tcg_debug_assert(vece != MO_8); 3198 if (type == TCG_TYPE_V256) { 3199 insn |= P_VEXL; 3200 } 3201 tcg_out_vex_modrm(s, insn, sub, a0, a1); 3202 tcg_out8(s, a2); 3203 break; 3204 3205 case INDEX_op_ld_vec: 3206 tcg_out_ld(s, type, a0, a1, a2); 3207 break; 3208 case INDEX_op_st_vec: 3209 tcg_out_st(s, type, a0, a1, a2); 3210 break; 3211 case INDEX_op_dupm_vec: 3212 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3213 break; 3214 3215 case INDEX_op_x86_shufps_vec: 3216 insn = OPC_SHUFPS; 3217 sub = args[3]; 3218 goto gen_simd_imm8; 3219 case INDEX_op_x86_blend_vec: 3220 if (vece == MO_16) { 3221 insn = OPC_PBLENDW; 3222 } else if (vece == MO_32) { 3223 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 3224 } else { 3225 g_assert_not_reached(); 3226 } 3227 sub = args[3]; 3228 goto gen_simd_imm8; 3229 case INDEX_op_x86_vperm2i128_vec: 3230 insn = OPC_VPERM2I128; 3231 sub = args[3]; 3232 goto gen_simd_imm8; 3233 case INDEX_op_x86_vpshldi_vec: 3234 insn = vpshldi_insn[vece]; 3235 sub = args[3]; 3236 goto gen_simd_imm8; 3237 3238 case INDEX_op_not_vec: 3239 insn = OPC_VPTERNLOGQ; 3240 a2 = a1; 3241 sub = 0x33; /* !B */ 3242 goto gen_simd_imm8; 3243 case INDEX_op_nor_vec: 3244 insn = OPC_VPTERNLOGQ; 3245 sub = 0x11; /* norCB */ 3246 goto gen_simd_imm8; 3247 case INDEX_op_nand_vec: 3248 insn = OPC_VPTERNLOGQ; 3249 sub = 0x77; /* nandCB */ 3250 goto gen_simd_imm8; 3251 case INDEX_op_eqv_vec: 3252 insn = OPC_VPTERNLOGQ; 3253 sub = 0x99; /* xnorCB */ 3254 goto gen_simd_imm8; 3255 case INDEX_op_orc_vec: 3256 insn = OPC_VPTERNLOGQ; 3257 sub = 0xdd; /* orB!C */ 3258 goto gen_simd_imm8; 3259 3260 case INDEX_op_bitsel_vec: 3261 insn = OPC_VPTERNLOGQ; 3262 a3 = args[3]; 3263 if (a0 == a1) { 3264 a1 = a2; 3265 a2 = a3; 3266 sub = 0xca; /* A?B:C */ 3267 } else if (a0 == a2) { 3268 a2 = a3; 3269 sub = 0xe2; /* B?A:C */ 3270 } else { 3271 tcg_out_mov(s, type, a0, a3); 3272 sub = 0xb8; /* B?C:A */ 3273 } 3274 goto gen_simd_imm8; 3275 3276 gen_simd_imm8: 3277 tcg_debug_assert(insn != OPC_UD2); 3278 if (type == TCG_TYPE_V256) { 3279 insn |= P_VEXL; 3280 } 3281 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3282 tcg_out8(s, sub); 3283 break; 3284 3285 case INDEX_op_x86_vpblendvb_vec: 3286 insn = OPC_VPBLENDVB; 3287 if (type == TCG_TYPE_V256) { 3288 insn |= P_VEXL; 3289 } 3290 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3291 tcg_out8(s, args[3] << 4); 3292 break; 3293 3294 case INDEX_op_x86_psrldq_vec: 3295 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3296 tcg_out8(s, a2); 3297 break; 3298 3299 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3300 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 3301 default: 3302 g_assert_not_reached(); 3303 } 3304} 3305 3306static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3307{ 3308 switch (op) { 3309 case INDEX_op_goto_ptr: 3310 return C_O0_I1(r); 3311 3312 case INDEX_op_ld8u_i32: 3313 case INDEX_op_ld8u_i64: 3314 case INDEX_op_ld8s_i32: 3315 case INDEX_op_ld8s_i64: 3316 case INDEX_op_ld16u_i32: 3317 case INDEX_op_ld16u_i64: 3318 case INDEX_op_ld16s_i32: 3319 case INDEX_op_ld16s_i64: 3320 case INDEX_op_ld_i32: 3321 case INDEX_op_ld32u_i64: 3322 case INDEX_op_ld32s_i64: 3323 case INDEX_op_ld_i64: 3324 return C_O1_I1(r, r); 3325 3326 case INDEX_op_st8_i32: 3327 case INDEX_op_st8_i64: 3328 return C_O0_I2(qi, r); 3329 3330 case INDEX_op_st16_i32: 3331 case INDEX_op_st16_i64: 3332 case INDEX_op_st_i32: 3333 case INDEX_op_st32_i64: 3334 return C_O0_I2(ri, r); 3335 3336 case INDEX_op_st_i64: 3337 return C_O0_I2(re, r); 3338 3339 case INDEX_op_add_i32: 3340 case INDEX_op_add_i64: 3341 return C_O1_I2(r, r, re); 3342 3343 case INDEX_op_sub_i32: 3344 case INDEX_op_sub_i64: 3345 case INDEX_op_mul_i32: 3346 case INDEX_op_mul_i64: 3347 case INDEX_op_or_i32: 3348 case INDEX_op_or_i64: 3349 case INDEX_op_xor_i32: 3350 case INDEX_op_xor_i64: 3351 return C_O1_I2(r, 0, re); 3352 3353 case INDEX_op_and_i32: 3354 case INDEX_op_and_i64: 3355 return C_O1_I2(r, 0, reZ); 3356 3357 case INDEX_op_andc_i32: 3358 case INDEX_op_andc_i64: 3359 return C_O1_I2(r, r, rI); 3360 3361 case INDEX_op_shl_i32: 3362 case INDEX_op_shl_i64: 3363 case INDEX_op_shr_i32: 3364 case INDEX_op_shr_i64: 3365 case INDEX_op_sar_i32: 3366 case INDEX_op_sar_i64: 3367 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3368 3369 case INDEX_op_rotl_i32: 3370 case INDEX_op_rotl_i64: 3371 case INDEX_op_rotr_i32: 3372 case INDEX_op_rotr_i64: 3373 return C_O1_I2(r, 0, ci); 3374 3375 case INDEX_op_brcond_i32: 3376 case INDEX_op_brcond_i64: 3377 return C_O0_I2(r, re); 3378 3379 case INDEX_op_bswap16_i32: 3380 case INDEX_op_bswap16_i64: 3381 case INDEX_op_bswap32_i32: 3382 case INDEX_op_bswap32_i64: 3383 case INDEX_op_bswap64_i64: 3384 case INDEX_op_neg_i32: 3385 case INDEX_op_neg_i64: 3386 case INDEX_op_not_i32: 3387 case INDEX_op_not_i64: 3388 case INDEX_op_extrh_i64_i32: 3389 return C_O1_I1(r, 0); 3390 3391 case INDEX_op_ext8s_i32: 3392 case INDEX_op_ext8s_i64: 3393 case INDEX_op_ext8u_i32: 3394 case INDEX_op_ext8u_i64: 3395 return C_O1_I1(r, q); 3396 3397 case INDEX_op_ext16s_i32: 3398 case INDEX_op_ext16s_i64: 3399 case INDEX_op_ext16u_i32: 3400 case INDEX_op_ext16u_i64: 3401 case INDEX_op_ext32s_i64: 3402 case INDEX_op_ext32u_i64: 3403 case INDEX_op_ext_i32_i64: 3404 case INDEX_op_extu_i32_i64: 3405 case INDEX_op_extrl_i64_i32: 3406 case INDEX_op_extract_i32: 3407 case INDEX_op_extract_i64: 3408 case INDEX_op_sextract_i32: 3409 case INDEX_op_ctpop_i32: 3410 case INDEX_op_ctpop_i64: 3411 return C_O1_I1(r, r); 3412 3413 case INDEX_op_extract2_i32: 3414 case INDEX_op_extract2_i64: 3415 return C_O1_I2(r, 0, r); 3416 3417 case INDEX_op_deposit_i32: 3418 case INDEX_op_deposit_i64: 3419 return C_O1_I2(q, 0, qi); 3420 3421 case INDEX_op_setcond_i32: 3422 case INDEX_op_setcond_i64: 3423 case INDEX_op_negsetcond_i32: 3424 case INDEX_op_negsetcond_i64: 3425 return C_O1_I2(q, r, re); 3426 3427 case INDEX_op_movcond_i32: 3428 case INDEX_op_movcond_i64: 3429 return C_O1_I4(r, r, re, r, 0); 3430 3431 case INDEX_op_div2_i32: 3432 case INDEX_op_div2_i64: 3433 case INDEX_op_divu2_i32: 3434 case INDEX_op_divu2_i64: 3435 return C_O2_I3(a, d, 0, 1, r); 3436 3437 case INDEX_op_mulu2_i32: 3438 case 
INDEX_op_mulu2_i64: 3439 case INDEX_op_muls2_i32: 3440 case INDEX_op_muls2_i64: 3441 return C_O2_I2(a, d, a, r); 3442 3443 case INDEX_op_add2_i32: 3444 case INDEX_op_add2_i64: 3445 case INDEX_op_sub2_i32: 3446 case INDEX_op_sub2_i64: 3447 return C_N1_O1_I4(r, r, 0, 1, re, re); 3448 3449 case INDEX_op_ctz_i32: 3450 case INDEX_op_ctz_i64: 3451 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3452 3453 case INDEX_op_clz_i32: 3454 case INDEX_op_clz_i64: 3455 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3456 3457 case INDEX_op_qemu_ld_a32_i32: 3458 return C_O1_I1(r, L); 3459 case INDEX_op_qemu_ld_a64_i32: 3460 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L); 3461 3462 case INDEX_op_qemu_st_a32_i32: 3463 return C_O0_I2(L, L); 3464 case INDEX_op_qemu_st_a64_i32: 3465 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3466 case INDEX_op_qemu_st8_a32_i32: 3467 return C_O0_I2(s, L); 3468 case INDEX_op_qemu_st8_a64_i32: 3469 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); 3470 3471 case INDEX_op_qemu_ld_a32_i64: 3472 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3473 case INDEX_op_qemu_ld_a64_i64: 3474 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); 3475 3476 case INDEX_op_qemu_st_a32_i64: 3477 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3478 case INDEX_op_qemu_st_a64_i64: 3479 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); 3480 3481 case INDEX_op_qemu_ld_a32_i128: 3482 case INDEX_op_qemu_ld_a64_i128: 3483 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3484 return C_O2_I1(r, r, L); 3485 case INDEX_op_qemu_st_a32_i128: 3486 case INDEX_op_qemu_st_a64_i128: 3487 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3488 return C_O0_I3(L, L, L); 3489 3490 case INDEX_op_brcond2_i32: 3491 return C_O0_I4(r, r, ri, ri); 3492 3493 case INDEX_op_setcond2_i32: 3494 return C_O1_I4(r, r, r, ri, ri); 3495 3496 case INDEX_op_ld_vec: 3497 case INDEX_op_dupm_vec: 3498 return C_O1_I1(x, r); 3499 3500 case INDEX_op_st_vec: 3501 return C_O0_I2(x, r); 3502 3503 case INDEX_op_add_vec: 3504 case INDEX_op_sub_vec: 3505 case INDEX_op_mul_vec: 3506 case INDEX_op_and_vec: 3507 case INDEX_op_or_vec: 3508 case INDEX_op_xor_vec: 3509 case INDEX_op_andc_vec: 3510 case INDEX_op_orc_vec: 3511 case INDEX_op_nand_vec: 3512 case INDEX_op_nor_vec: 3513 case INDEX_op_eqv_vec: 3514 case INDEX_op_ssadd_vec: 3515 case INDEX_op_usadd_vec: 3516 case INDEX_op_sssub_vec: 3517 case INDEX_op_ussub_vec: 3518 case INDEX_op_smin_vec: 3519 case INDEX_op_umin_vec: 3520 case INDEX_op_smax_vec: 3521 case INDEX_op_umax_vec: 3522 case INDEX_op_shlv_vec: 3523 case INDEX_op_shrv_vec: 3524 case INDEX_op_sarv_vec: 3525 case INDEX_op_rotlv_vec: 3526 case INDEX_op_rotrv_vec: 3527 case INDEX_op_shls_vec: 3528 case INDEX_op_shrs_vec: 3529 case INDEX_op_sars_vec: 3530 case INDEX_op_cmp_vec: 3531 case INDEX_op_x86_shufps_vec: 3532 case INDEX_op_x86_blend_vec: 3533 case INDEX_op_x86_packss_vec: 3534 case INDEX_op_x86_packus_vec: 3535 case INDEX_op_x86_vperm2i128_vec: 3536 case INDEX_op_x86_punpckl_vec: 3537 case INDEX_op_x86_punpckh_vec: 3538 case INDEX_op_x86_vpshldi_vec: 3539#if TCG_TARGET_REG_BITS == 32 3540 case INDEX_op_dup2_vec: 3541#endif 3542 return C_O1_I2(x, x, x); 3543 3544 case INDEX_op_abs_vec: 3545 case INDEX_op_dup_vec: 3546 case INDEX_op_not_vec: 3547 case INDEX_op_shli_vec: 3548 case INDEX_op_shri_vec: 3549 case INDEX_op_sari_vec: 3550 case INDEX_op_rotli_vec: 3551 case 
INDEX_op_x86_psrldq_vec: 3552 return C_O1_I1(x, x); 3553 3554 case INDEX_op_x86_vpshldv_vec: 3555 case INDEX_op_x86_vpshrdv_vec: 3556 return C_O1_I3(x, 0, x, x); 3557 3558 case INDEX_op_bitsel_vec: 3559 case INDEX_op_x86_vpblendvb_vec: 3560 return C_O1_I3(x, x, x, x); 3561 3562 default: 3563 g_assert_not_reached(); 3564 } 3565} 3566 3567int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3568{ 3569 switch (opc) { 3570 case INDEX_op_add_vec: 3571 case INDEX_op_sub_vec: 3572 case INDEX_op_and_vec: 3573 case INDEX_op_or_vec: 3574 case INDEX_op_xor_vec: 3575 case INDEX_op_andc_vec: 3576 case INDEX_op_orc_vec: 3577 case INDEX_op_nand_vec: 3578 case INDEX_op_nor_vec: 3579 case INDEX_op_eqv_vec: 3580 case INDEX_op_not_vec: 3581 case INDEX_op_bitsel_vec: 3582 return 1; 3583 case INDEX_op_cmp_vec: 3584 case INDEX_op_cmpsel_vec: 3585 return -1; 3586 3587 case INDEX_op_rotli_vec: 3588 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3589 3590 case INDEX_op_shli_vec: 3591 case INDEX_op_shri_vec: 3592 /* We must expand the operation for MO_8. */ 3593 return vece == MO_8 ? -1 : 1; 3594 3595 case INDEX_op_sari_vec: 3596 switch (vece) { 3597 case MO_8: 3598 return -1; 3599 case MO_16: 3600 case MO_32: 3601 return 1; 3602 case MO_64: 3603 if (have_avx512vl) { 3604 return 1; 3605 } 3606 /* 3607 * We can emulate this for MO_64, but it does not pay off 3608 * unless we're producing at least 4 values. 3609 */ 3610 return type >= TCG_TYPE_V256 ? -1 : 0; 3611 } 3612 return 0; 3613 3614 case INDEX_op_shls_vec: 3615 case INDEX_op_shrs_vec: 3616 return vece >= MO_16; 3617 case INDEX_op_sars_vec: 3618 switch (vece) { 3619 case MO_16: 3620 case MO_32: 3621 return 1; 3622 case MO_64: 3623 return have_avx512vl; 3624 } 3625 return 0; 3626 case INDEX_op_rotls_vec: 3627 return vece >= MO_16 ? -1 : 0; 3628 3629 case INDEX_op_shlv_vec: 3630 case INDEX_op_shrv_vec: 3631 switch (vece) { 3632 case MO_16: 3633 return have_avx512bw; 3634 case MO_32: 3635 case MO_64: 3636 return have_avx2; 3637 } 3638 return 0; 3639 case INDEX_op_sarv_vec: 3640 switch (vece) { 3641 case MO_16: 3642 return have_avx512bw; 3643 case MO_32: 3644 return have_avx2; 3645 case MO_64: 3646 return have_avx512vl; 3647 } 3648 return 0; 3649 case INDEX_op_rotlv_vec: 3650 case INDEX_op_rotrv_vec: 3651 switch (vece) { 3652 case MO_16: 3653 return have_avx512vbmi2 ? -1 : 0; 3654 case MO_32: 3655 case MO_64: 3656 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 3657 } 3658 return 0; 3659 3660 case INDEX_op_mul_vec: 3661 switch (vece) { 3662 case MO_8: 3663 return -1; 3664 case MO_64: 3665 return have_avx512dq; 3666 } 3667 return 1; 3668 3669 case INDEX_op_ssadd_vec: 3670 case INDEX_op_usadd_vec: 3671 case INDEX_op_sssub_vec: 3672 case INDEX_op_ussub_vec: 3673 return vece <= MO_16; 3674 case INDEX_op_smin_vec: 3675 case INDEX_op_smax_vec: 3676 case INDEX_op_umin_vec: 3677 case INDEX_op_umax_vec: 3678 case INDEX_op_abs_vec: 3679 return vece <= MO_32 || have_avx512vl; 3680 3681 default: 3682 return 0; 3683 } 3684} 3685 3686static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc, 3687 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3688{ 3689 TCGv_vec t1, t2; 3690 3691 tcg_debug_assert(vece == MO_8); 3692 3693 t1 = tcg_temp_new_vec(type); 3694 t2 = tcg_temp_new_vec(type); 3695 3696 /* 3697 * Unpack to W, shift, and repack. Tricky bits: 3698 * (1) Use punpck*bw x,x to produce DDCCBBAA, 3699 * i.e. duplicate in other half of the 16-bit lane. 3700 * (2) For right-shift, add 8 so that the high half of the lane 3701 * becomes zero. 
For left-shift, and left-rotate, we must 3702 * shift up and down again. 3703 * (3) Step 2 leaves high half zero such that PACKUSWB 3704 * (pack with unsigned saturation) does not modify 3705 * the quantity. 3706 */ 3707 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3708 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3709 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3710 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3711 3712 if (opc != INDEX_op_rotli_vec) { 3713 imm += 8; 3714 } 3715 if (opc == INDEX_op_shri_vec) { 3716 tcg_gen_shri_vec(MO_16, t1, t1, imm); 3717 tcg_gen_shri_vec(MO_16, t2, t2, imm); 3718 } else { 3719 tcg_gen_shli_vec(MO_16, t1, t1, imm); 3720 tcg_gen_shli_vec(MO_16, t2, t2, imm); 3721 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3722 tcg_gen_shri_vec(MO_16, t2, t2, 8); 3723 } 3724 3725 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3726 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3727 tcg_temp_free_vec(t1); 3728 tcg_temp_free_vec(t2); 3729} 3730 3731static void expand_vec_sari(TCGType type, unsigned vece, 3732 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3733{ 3734 TCGv_vec t1, t2; 3735 3736 switch (vece) { 3737 case MO_8: 3738 /* Unpack to W, shift, and repack, as in expand_vec_shi. */ 3739 t1 = tcg_temp_new_vec(type); 3740 t2 = tcg_temp_new_vec(type); 3741 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3742 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3743 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3744 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3745 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3746 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3747 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3748 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3749 tcg_temp_free_vec(t1); 3750 tcg_temp_free_vec(t2); 3751 break; 3752 3753 case MO_64: 3754 t1 = tcg_temp_new_vec(type); 3755 if (imm <= 32) { 3756 /* 3757 * We can emulate a small sign extend by performing an arithmetic 3758 * 32-bit shift and overwriting the high half of a 64-bit logical 3759 * shift. Note that the ISA says shift of 32 is valid, but TCG 3760 * does not, so we have to bound the smaller shift -- we get the 3761 * same result in the high half either way. 3762 */ 3763 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3764 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3765 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3766 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3767 tcgv_vec_arg(t1), 0xaa); 3768 } else { 3769 /* Otherwise we will need to use a compare vs 0 to produce 3770 * the sign-extend, shift and merge. 
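             * For example, with imm = 48 each 64-bit lane computes
             *   t1 = (0 > v1) ? -1 : 0, v0 = v1 >> 48 (logical),
             *   t1 <<= 16, v0 |= t1,
             * which matches an arithmetic shift right by 48.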
3771 */ 3772 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 3773 tcg_constant_vec(type, MO_64, 0), v1); 3774 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3775 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3776 tcg_gen_or_vec(MO_64, v0, v0, t1); 3777 } 3778 tcg_temp_free_vec(t1); 3779 break; 3780 3781 default: 3782 g_assert_not_reached(); 3783 } 3784} 3785 3786static void expand_vec_rotli(TCGType type, unsigned vece, 3787 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3788{ 3789 TCGv_vec t; 3790 3791 if (vece == MO_8) { 3792 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm); 3793 return; 3794 } 3795 3796 if (have_avx512vbmi2) { 3797 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 3798 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 3799 return; 3800 } 3801 3802 t = tcg_temp_new_vec(type); 3803 tcg_gen_shli_vec(vece, t, v1, imm); 3804 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3805 tcg_gen_or_vec(vece, v0, v0, t); 3806 tcg_temp_free_vec(t); 3807} 3808 3809static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3810 TCGv_vec v1, TCGv_vec sh, bool right) 3811{ 3812 TCGv_vec t; 3813 3814 if (have_avx512vbmi2) { 3815 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 3816 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 3817 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 3818 return; 3819 } 3820 3821 t = tcg_temp_new_vec(type); 3822 tcg_gen_dupi_vec(vece, t, 8 << vece); 3823 tcg_gen_sub_vec(vece, t, t, sh); 3824 if (right) { 3825 tcg_gen_shlv_vec(vece, t, v1, t); 3826 tcg_gen_shrv_vec(vece, v0, v1, sh); 3827 } else { 3828 tcg_gen_shrv_vec(vece, t, v1, t); 3829 tcg_gen_shlv_vec(vece, v0, v1, sh); 3830 } 3831 tcg_gen_or_vec(vece, v0, v0, t); 3832 tcg_temp_free_vec(t); 3833} 3834 3835static void expand_vec_rotls(TCGType type, unsigned vece, 3836 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3837{ 3838 TCGv_vec t = tcg_temp_new_vec(type); 3839 3840 tcg_debug_assert(vece != MO_8); 3841 3842 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 3843 tcg_gen_dup_i32_vec(vece, t, lsh); 3844 if (vece >= MO_32) { 3845 tcg_gen_rotlv_vec(vece, v0, v1, t); 3846 } else { 3847 expand_vec_rotv(type, vece, v0, v1, t, false); 3848 } 3849 } else { 3850 TCGv_i32 rsh = tcg_temp_new_i32(); 3851 3852 tcg_gen_neg_i32(rsh, lsh); 3853 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3854 tcg_gen_shls_vec(vece, t, v1, lsh); 3855 tcg_gen_shrs_vec(vece, v0, v1, rsh); 3856 tcg_gen_or_vec(vece, v0, v0, t); 3857 3858 tcg_temp_free_i32(rsh); 3859 } 3860 3861 tcg_temp_free_vec(t); 3862} 3863 3864static void expand_vec_mul(TCGType type, unsigned vece, 3865 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3866{ 3867 TCGv_vec t1, t2, t3, t4, zero; 3868 3869 tcg_debug_assert(vece == MO_8); 3870 3871 /* 3872 * Unpack v1 bytes to words, 0 | x. 3873 * Unpack v2 bytes to words, y | 0. 3874 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3875 * Shift logical right by 8 bits to clear the high 8 bytes before 3876 * using an unsigned saturated pack. 3877 * 3878 * The difference between the V64, V128 and V256 cases is merely how 3879 * we distribute the expansion between temporaries. 
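     *
     * A worked lane, x = 5 and y = 7: the unpacked words are 0x0005 and
     * 0x0700, the 16-bit product is 0x2300, and the shift right by 8
     * leaves 0x0023 = 35 in the low byte, with the high byte clear,
     * ready for the final unsigned-saturating pack.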
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        /* The zero constant must match the vector type being expanded. */
        zero = tcg_constant_vec(type, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}

static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    enum {
        NEED_INV = 1,
        NEED_SWAP = 2,
        NEED_BIAS = 4,
        NEED_UMIN = 8,
        NEED_UMAX = 16,
    };
    TCGv_vec t1, t2, t3;
    uint8_t fixup;

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_GT:
        fixup = 0;
        break;
    case TCG_COND_NE:
    case TCG_COND_LE:
        fixup = NEED_INV;
        break;
    case TCG_COND_LT:
        fixup = NEED_SWAP;
        break;
    case TCG_COND_GE:
        fixup = NEED_SWAP | NEED_INV;
        break;
    case TCG_COND_LEU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN;
        } else {
            fixup = NEED_BIAS | NEED_INV;
        }
        break;
    case TCG_COND_GTU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN | NEED_INV;
        } else {
            fixup = NEED_BIAS;
        }
        break;
    case TCG_COND_GEU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX;
        } else {
            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
        }
        break;
    case TCG_COND_LTU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX | NEED_INV;
        } else {
            fixup = NEED_BIAS | NEED_SWAP;
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }

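    /*
     * A note on the fixups chosen above: the packed-integer compares used
     * here (pcmpeq*/pcmpgt*) provide only signed EQ and GT, so unsigned
     * orderings are reduced either to an unsigned min/max followed by EQ
     * (e.g. LEU becomes v1 == umin(v1, v2)), or else to a signed compare
     * after biasing both operands by the sign bit.
     */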
    t1 = t2 = NULL;
    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        t1 = tcg_temp_new_vec(type);
        if (fixup & NEED_UMIN) {
            tcg_gen_umin_vec(vece, t1, v1, v2);
        } else {
            tcg_gen_umax_vec(vece, t1, v1, v2);
        }
        v2 = t1;
        cond = TCG_COND_EQ;
    } else if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse. */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    return fixup & NEED_INV;
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
        tcg_gen_not_vec(vece, v0, v0);
    }
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
                              TCGv_vec c1, TCGv_vec c2,
                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
        /* Invert the sense of the compare by swapping arguments. */
        TCGv_vec x;
        x = v3, v3 = v4, v4 = x;
    }
    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
              tcgv_vec_arg(v3), tcgv_vec_arg(t));
    tcg_temp_free_vec(t);
}
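
/*
 * Entry point for vector ops that the backend reported as expandable
 * (tcg_can_emit_vec_op() returned a negative value): rewrite the
 * operation in terms of the helper expansions above.
 */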
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2, v3, v4;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, opc, v0, v1, a2);
        break;

    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    case INDEX_op_cmpsel_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte.
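             * (Like %rsp, r12 always needs a SIB byte when used as a base
             * register, so it is already the least convenient base for the
             * register allocator and is presumably the cheapest register to
             * give up here.)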
             */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb. */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are callee-saved, and we don't save them, so don't use them.
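     * (The Win64 calling convention treats xmm6-xmm15 as non-volatile, so
     * using them would require saving and restoring them in the prologue
     * and epilogue.)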
     */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE) - 4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,               /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                              /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,         /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                            /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                            /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                            /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                            /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                            /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                            /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                            /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE) - 4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,               /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                              /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,         /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                            /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                            /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                            /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                            /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                            /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif