/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}
/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif

/* The host compiler should supply <cpuid.h> to enable runtime features
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed. */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_avx512bw;
bool have_avx512dq;
bool have_avx512vbmi2;
bool have_avx512vl;
bool have_movbe;

#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        tcg_abort();
    }
    return true;
}
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100      /* 0x0f opcode prefix */
#define P_EXT38         0x200      /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400      /* 0x66 opcode prefix */
#define P_VEXW          0x1000     /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW     /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000     /* REG field as byte register */
# define P_REXB_RM      0x4000     /* R/M field as byte register */
# define P_GS           0x8000     /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000    /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000    /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000    /* 0xf2 opcode prefix */
#define P_VEXL          0x80000    /* Set VEX.L = 1 */
#define P_EVEX          0x100000   /* Requires EVEX encoding */

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7
/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev   0
#define EXT5_DEC_Ev   1
#define EXT5_CALLN_Ev 2
#define EXT5_JMPN_Ev  4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
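
/*
 * Worked example (illustrative only, not emitted verbatim anywhere): on
 * x86_64, tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, TCG_REG_RAX, TCG_REG_R8)
 * should produce "49 8b c0" -- REX.W|REX.B (0x48 | 0x01), the 0x8b opcode,
 * then ModRM 0xc0 with reg=%rax and rm=%r8 -- i.e. "movq %r8, %rax".
 */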
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                                /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                                /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                                /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);    /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);     /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
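
/*
 * Worked example (illustrative only): tcg_out_vex_modrm(s, OPC_PADDD, 0, 1, 2),
 * i.e. "vpaddd %xmm2, %xmm1, %xmm0", qualifies for the two-byte VEX form and
 * should come out as "c5 f1 fe c2": 0xc5, then ~R=1/vvvv=~1/L=0/pp=01 giving
 * 0xf1, the 0xfe opcode, and ModRM 0xc2.
 */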
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}
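
/*
 * Worked example (illustrative only): tcg_out_modrm_offset(s, OPC_MOVL_GvEv,
 * TCG_REG_EAX, TCG_REG_EBP, 8) should produce "8b 45 08", i.e.
 * "movl 8(%ebp), %eax": mod=01 for the disp8, reg=%eax, rm=%ebp.
 */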
/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}
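
/*
 * Note on the non-AVX2 path in tcg_out_dup_vec above: the MO_8 broadcast is
 * built by widening in steps -- PUNPCKLBW replicates the byte into 16-bit
 * lanes, PUNPCKLWD widens those into 32-bit lanes, and PSHUFD with imm8 0
 * then copies lane 0 across the vector; each case deliberately falls
 * through to the next wider one.
 */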
static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}
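
/*
 * Size notes for tcg_out_movi_int (illustrative): zero becomes a 2-byte xor
 * via tgen_arithr (e.g. "33 c0" for %eax); a value that fits in 32 bits
 * unsigned uses the 5-byte 0xb8+reg/imm32 form; a sign-extended 32-bit
 * value uses the 7-byte REX.W 0xc7 form; otherwise the 7-byte pc-relative
 * LEA is tried before falling back to the 10-byte movabs.
 */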
static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference.  */
    tcg_debug_assert(TCG_TARGET_REG_BITS == 32);
    tcg_out_modrm_offset(s, OPC_LEA, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
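
/*
 * For reference (illustrative): with TCG_MO_ST_LD set in a0, the sequence
 * above assembles to "f0 83 0c 24 00", i.e. "lock orl $0, (%esp)" -- the
 * lock prefix, 0x83 /1 with a SIB byte for %esp, and an imm8 of zero.
 */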
static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}
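
/*
 * Worked example (illustrative only): tgen_arithi(s, ARITH_ADD + P_REXW,
 * r, 8, 0) with r == %rax should emit "48 83 c0 08" ("addq $8, %rax"),
 * using the sign-extended imm8 form of opcode 0x83.
 */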
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                tcg_abort();
            }
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle temporaries that cross basic blocks */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}
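
/*
 * Note: when arg2 is the constant operand width, TZCNT alone suffices in the
 * BMI1 path above, since TZCNT is defined to return the operand size (32 or
 * 64) for a zero input; the CMOV is only needed for a variable fallback.
 */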
static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument on the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
#endif
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}
static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

#if defined(CONFIG_SOFTMMU)
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEUQ] = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEUQ] = helper_be_ldq_mmu,
};

/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEUQ] = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEUQ] = helper_be_stq_mmu,
};
*/ 1822 if (a_bits >= s_bits) { 1823 tcg_out_mov(s, ttype, r1, addrlo); 1824 } else { 1825 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask); 1826 } 1827 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask; 1828 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0); 1829 1830 /* cmp 0(r0), r1 */ 1831 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which); 1832 1833 /* Prepare for both the fast path add of the tlb addend, and the slow 1834 path function argument setup. */ 1835 tcg_out_mov(s, ttype, r1, addrlo); 1836 1837 /* jne slow_path */ 1838 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1839 label_ptr[0] = s->code_ptr; 1840 s->code_ptr += 4; 1841 1842 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1843 /* cmp 4(r0), addrhi */ 1844 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4); 1845 1846 /* jne slow_path */ 1847 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1848 label_ptr[1] = s->code_ptr; 1849 s->code_ptr += 4; 1850 } 1851 1852 /* TLB Hit. */ 1853 1854 /* add addend(r0), r1 */ 1855 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0, 1856 offsetof(CPUTLBEntry, addend)); 1857} 1858 1859/* 1860 * Record the context of a call to the out of line helper code for the slow path 1861 * for a load or store, so that we can later generate the correct helper code 1862 */ 1863static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64, 1864 MemOpIdx oi, 1865 TCGReg datalo, TCGReg datahi, 1866 TCGReg addrlo, TCGReg addrhi, 1867 tcg_insn_unit *raddr, 1868 tcg_insn_unit **label_ptr) 1869{ 1870 TCGLabelQemuLdst *label = new_ldst_label(s); 1871 1872 label->is_ld = is_ld; 1873 label->oi = oi; 1874 label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32; 1875 label->datalo_reg = datalo; 1876 label->datahi_reg = datahi; 1877 label->addrlo_reg = addrlo; 1878 label->addrhi_reg = addrhi; 1879 label->raddr = tcg_splitwx_to_rx(raddr); 1880 label->label_ptr[0] = label_ptr[0]; 1881 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1882 label->label_ptr[1] = label_ptr[1]; 1883 } 1884} 1885 1886/* 1887 * Generate code for the slow path for a load at the end of block 1888 */ 1889static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1890{ 1891 MemOpIdx oi = l->oi; 1892 MemOp opc = get_memop(oi); 1893 TCGReg data_reg; 1894 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1895 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0); 1896 1897 /* resolve label address */ 1898 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1899 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1900 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1901 } 1902 1903 if (TCG_TARGET_REG_BITS == 32) { 1904 int ofs = 0; 1905 1906 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs); 1907 ofs += 4; 1908 1909 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs); 1910 ofs += 4; 1911 1912 if (TARGET_LONG_BITS == 64) { 1913 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs); 1914 ofs += 4; 1915 } 1916 1917 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs); 1918 ofs += 4; 1919 1920 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs); 1921 } else { 1922 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); 1923 /* The second argument is already loaded with addrlo. 
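   tcg_out_tlb_load leaves the guest address in the second argument register (see its block comment above), so only the oi and return-address arguments remain to be loaded here.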
*/ 1924 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi); 1925 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3], 1926 (uintptr_t)l->raddr); 1927 } 1928 1929 tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]); 1930 1931 data_reg = l->datalo_reg; 1932 switch (opc & MO_SSIZE) { 1933 case MO_SB: 1934 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw); 1935 break; 1936 case MO_SW: 1937 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw); 1938 break; 1939#if TCG_TARGET_REG_BITS == 64 1940 case MO_SL: 1941 tcg_out_ext32s(s, data_reg, TCG_REG_EAX); 1942 break; 1943#endif 1944 case MO_UB: 1945 case MO_UW: 1946 /* Note that the helpers have zero-extended to tcg_target_long. */ 1947 case MO_UL: 1948 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); 1949 break; 1950 case MO_UQ: 1951 if (TCG_TARGET_REG_BITS == 64) { 1952 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX); 1953 } else if (data_reg == TCG_REG_EDX) { 1954 /* xchg %edx, %eax */ 1955 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); 1956 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX); 1957 } else { 1958 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); 1959 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX); 1960 } 1961 break; 1962 default: 1963 tcg_abort(); 1964 } 1965 1966 /* Jump to the code corresponding to next IR of qemu_st */ 1967 tcg_out_jmp(s, l->raddr); 1968 return true; 1969} 1970 1971/* 1972 * Generate code for the slow path for a store at the end of block 1973 */ 1974static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1975{ 1976 MemOpIdx oi = l->oi; 1977 MemOp opc = get_memop(oi); 1978 MemOp s_bits = opc & MO_SIZE; 1979 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1980 TCGReg retaddr; 1981 1982 /* resolve label address */ 1983 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1984 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1985 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1986 } 1987 1988 if (TCG_TARGET_REG_BITS == 32) { 1989 int ofs = 0; 1990 1991 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs); 1992 ofs += 4; 1993 1994 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs); 1995 ofs += 4; 1996 1997 if (TARGET_LONG_BITS == 64) { 1998 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs); 1999 ofs += 4; 2000 } 2001 2002 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs); 2003 ofs += 4; 2004 2005 if (s_bits == MO_64) { 2006 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs); 2007 ofs += 4; 2008 } 2009 2010 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs); 2011 ofs += 4; 2012 2013 retaddr = TCG_REG_EAX; 2014 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 2015 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs); 2016 } else { 2017 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); 2018 /* The second argument is already loaded with addrlo. */ 2019 tcg_out_mov(s, (s_bits == MO_64 ? 
TCG_TYPE_I64 : TCG_TYPE_I32), 2020 tcg_target_call_iarg_regs[2], l->datalo_reg); 2021 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi); 2022 2023 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) { 2024 retaddr = tcg_target_call_iarg_regs[4]; 2025 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 2026 } else { 2027 retaddr = TCG_REG_RAX; 2028 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 2029 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, 2030 TCG_TARGET_CALL_STACK_OFFSET); 2031 } 2032 } 2033 2034 /* "Tail call" to the helper, with the return address back inline. */ 2035 tcg_out_push(s, retaddr); 2036 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]); 2037 return true; 2038} 2039#else 2040 2041static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo, 2042 TCGReg addrhi, unsigned a_bits) 2043{ 2044 unsigned a_mask = (1 << a_bits) - 1; 2045 TCGLabelQemuLdst *label; 2046 2047 /* 2048 * We are expecting a_bits to max out at 7, so we can usually use testb. 2049 * For i686, we have to use testl for %esi/%edi. 2050 */ 2051 if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) { 2052 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo); 2053 tcg_out8(s, a_mask); 2054 } else { 2055 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo); 2056 tcg_out32(s, a_mask); 2057 } 2058 2059 /* jne slow_path */ 2060 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2061 2062 label = new_ldst_label(s); 2063 label->is_ld = is_ld; 2064 label->addrlo_reg = addrlo; 2065 label->addrhi_reg = addrhi; 2066 label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4); 2067 label->label_ptr[0] = s->code_ptr; 2068 2069 s->code_ptr += 4; 2070} 2071 2072static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) 2073{ 2074 /* resolve label address */ 2075 tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4); 2076 2077 if (TCG_TARGET_REG_BITS == 32) { 2078 int ofs = 0; 2079 2080 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs); 2081 ofs += 4; 2082 2083 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs); 2084 ofs += 4; 2085 if (TARGET_LONG_BITS == 64) { 2086 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs); 2087 ofs += 4; 2088 } 2089 2090 tcg_out_pushi(s, (uintptr_t)l->raddr); 2091 } else { 2092 tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1], 2093 l->addrlo_reg); 2094 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); 2095 2096 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr); 2097 tcg_out_push(s, TCG_REG_RAX); 2098 } 2099 2100 /* "Tail call" to the helper, with the return address back inline. */ 2101 tcg_out_jmp(s, (const void *)(l->is_ld ? 
helper_unaligned_ld 2102 : helper_unaligned_st)); 2103 return true; 2104} 2105 2106static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2107{ 2108 return tcg_out_fail_alignment(s, l); 2109} 2110 2111static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2112{ 2113 return tcg_out_fail_alignment(s, l); 2114} 2115 2116#if TCG_TARGET_REG_BITS == 32 2117# define x86_guest_base_seg 0 2118# define x86_guest_base_index -1 2119# define x86_guest_base_offset guest_base 2120#else 2121static int x86_guest_base_seg; 2122static int x86_guest_base_index = -1; 2123static int32_t x86_guest_base_offset; 2124# if defined(__x86_64__) && defined(__linux__) 2125# include <asm/prctl.h> 2126# include <sys/prctl.h> 2127int arch_prctl(int code, unsigned long addr); 2128static inline int setup_guest_base_seg(void) 2129{ 2130 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2131 return P_GS; 2132 } 2133 return 0; 2134} 2135# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__) 2136# include <machine/sysarch.h> 2137static inline int setup_guest_base_seg(void) 2138{ 2139 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2140 return P_GS; 2141 } 2142 return 0; 2143} 2144# else 2145static inline int setup_guest_base_seg(void) 2146{ 2147 return 0; 2148} 2149# endif 2150#endif 2151#endif /* SOFTMMU */ 2152 2153static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2154 TCGReg base, int index, intptr_t ofs, 2155 int seg, bool is64, MemOp memop) 2156{ 2157 bool use_movbe = false; 2158 int rexw = is64 * P_REXW; 2159 int movop = OPC_MOVL_GvEv; 2160 2161 /* Do big-endian loads with movbe. */ 2162 if (memop & MO_BSWAP) { 2163 tcg_debug_assert(have_movbe); 2164 use_movbe = true; 2165 movop = OPC_MOVBE_GyMy; 2166 } 2167 2168 switch (memop & MO_SSIZE) { 2169 case MO_UB: 2170 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo, 2171 base, index, 0, ofs); 2172 break; 2173 case MO_SB: 2174 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo, 2175 base, index, 0, ofs); 2176 break; 2177 case MO_UW: 2178 if (use_movbe) { 2179 /* There is no extending movbe; only low 16-bits are modified. */ 2180 if (datalo != base && datalo != index) { 2181 /* XOR breaks dependency chains. 
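   Zeroing the full register up front also means the 16-bit MOVBE below yields a zero-extended result without a separate MOVZWL.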
*/ 2182 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2183 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2184 datalo, base, index, 0, ofs); 2185 } else { 2186 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2187 datalo, base, index, 0, ofs); 2188 tcg_out_ext16u(s, datalo, datalo); 2189 } 2190 } else { 2191 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo, 2192 base, index, 0, ofs); 2193 } 2194 break; 2195 case MO_SW: 2196 if (use_movbe) { 2197 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2198 datalo, base, index, 0, ofs); 2199 tcg_out_ext16s(s, datalo, datalo, rexw); 2200 } else { 2201 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg, 2202 datalo, base, index, 0, ofs); 2203 } 2204 break; 2205 case MO_UL: 2206 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs); 2207 break; 2208#if TCG_TARGET_REG_BITS == 64 2209 case MO_SL: 2210 if (use_movbe) { 2211 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo, 2212 base, index, 0, ofs); 2213 tcg_out_ext32s(s, datalo, datalo); 2214 } else { 2215 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo, 2216 base, index, 0, ofs); 2217 } 2218 break; 2219#endif 2220 case MO_UQ: 2221 if (TCG_TARGET_REG_BITS == 64) { 2222 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo, 2223 base, index, 0, ofs); 2224 } else { 2225 if (use_movbe) { 2226 TCGReg t = datalo; 2227 datalo = datahi; 2228 datahi = t; 2229 } 2230 if (base != datalo) { 2231 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2232 base, index, 0, ofs); 2233 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2234 base, index, 0, ofs + 4); 2235 } else { 2236 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2237 base, index, 0, ofs + 4); 2238 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2239 base, index, 0, ofs); 2240 } 2241 } 2242 break; 2243 default: 2244 g_assert_not_reached(); 2245 } 2246} 2247 2248/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and 2249 EAX. It will be useful once fixed registers globals are less 2250 common. */ 2251static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64) 2252{ 2253 TCGReg datalo, datahi, addrlo; 2254 TCGReg addrhi __attribute__((unused)); 2255 MemOpIdx oi; 2256 MemOp opc; 2257#if defined(CONFIG_SOFTMMU) 2258 int mem_index; 2259 tcg_insn_unit *label_ptr[2]; 2260#else 2261 unsigned a_bits; 2262#endif 2263 2264 datalo = *args++; 2265 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0); 2266 addrlo = *args++; 2267 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0); 2268 oi = *args++; 2269 opc = get_memop(oi); 2270 2271#if defined(CONFIG_SOFTMMU) 2272 mem_index = get_mmuidx(oi); 2273 2274 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, 2275 label_ptr, offsetof(CPUTLBEntry, addr_read)); 2276 2277 /* TLB Hit. 
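   TCG_REG_L1 now holds the host address (guest address plus the TLB addend), so it is used below as the base register with no index.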
*/ 2278 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc); 2279 2280 /* Record the current context of a load into ldst label */ 2281 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi, 2282 s->code_ptr, label_ptr); 2283#else 2284 a_bits = get_alignment_bits(opc); 2285 if (a_bits) { 2286 tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits); 2287 } 2288 2289 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index, 2290 x86_guest_base_offset, x86_guest_base_seg, 2291 is64, opc); 2292#endif 2293} 2294 2295static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2296 TCGReg base, int index, intptr_t ofs, 2297 int seg, MemOp memop) 2298{ 2299 bool use_movbe = false; 2300 int movop = OPC_MOVL_EvGv; 2301 2302 /* 2303 * Do big-endian stores with movbe or softmmu. 2304 * User-only without movbe will have its swapping done generically. 2305 */ 2306 if (memop & MO_BSWAP) { 2307 tcg_debug_assert(have_movbe); 2308 use_movbe = true; 2309 movop = OPC_MOVBE_MyGy; 2310 } 2311 2312 switch (memop & MO_SIZE) { 2313 case MO_8: 2314 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2315 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2316 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg, 2317 datalo, base, index, 0, ofs); 2318 break; 2319 case MO_16: 2320 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo, 2321 base, index, 0, ofs); 2322 break; 2323 case MO_32: 2324 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs); 2325 break; 2326 case MO_64: 2327 if (TCG_TARGET_REG_BITS == 64) { 2328 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo, 2329 base, index, 0, ofs); 2330 } else { 2331 if (use_movbe) { 2332 TCGReg t = datalo; 2333 datalo = datahi; 2334 datahi = t; 2335 } 2336 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2337 base, index, 0, ofs); 2338 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2339 base, index, 0, ofs + 4); 2340 } 2341 break; 2342 default: 2343 g_assert_not_reached(); 2344 } 2345} 2346 2347static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) 2348{ 2349 TCGReg datalo, datahi, addrlo; 2350 TCGReg addrhi __attribute__((unused)); 2351 MemOpIdx oi; 2352 MemOp opc; 2353#if defined(CONFIG_SOFTMMU) 2354 int mem_index; 2355 tcg_insn_unit *label_ptr[2]; 2356#else 2357 unsigned a_bits; 2358#endif 2359 2360 datalo = *args++; 2361 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0); 2362 addrlo = *args++; 2363 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0); 2364 oi = *args++; 2365 opc = get_memop(oi); 2366 2367#if defined(CONFIG_SOFTMMU) 2368 mem_index = get_mmuidx(oi); 2369 2370 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, 2371 label_ptr, offsetof(CPUTLBEntry, addr_write)); 2372 2373 /* TLB Hit. */ 2374 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc); 2375 2376 /* Record the current context of a store into ldst label */ 2377 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi, 2378 s->code_ptr, label_ptr); 2379#else 2380 a_bits = get_alignment_bits(opc); 2381 if (a_bits) { 2382 tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits); 2383 } 2384 2385 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index, 2386 x86_guest_base_offset, x86_guest_base_seg, opc); 2387#endif 2388} 2389 2390static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2391{ 2392 /* Reuse the zeroing that exists for goto_ptr. 
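   The goto_ptr epilogue entry clears EAX on its way out, so returning 0 needs no explicit movi here.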
*/ 2393 if (a0 == 0) { 2394 tcg_out_jmp(s, tcg_code_gen_epilogue); 2395 } else { 2396 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2397 tcg_out_jmp(s, tb_ret_addr); 2398 } 2399} 2400 2401static void tcg_out_goto_tb(TCGContext *s, int which) 2402{ 2403 /* 2404 * Jump displacement must be aligned for atomic patching; 2405 * see if we need to add extra nops before jump 2406 */ 2407 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2408 if (gap != 1) { 2409 tcg_out_nopn(s, gap - 1); 2410 } 2411 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2412 set_jmp_insn_offset(s, which); 2413 tcg_out32(s, 0); 2414 set_jmp_reset_offset(s, which); 2415} 2416 2417void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2418 uintptr_t jmp_rx, uintptr_t jmp_rw) 2419{ 2420 /* patch the branch destination */ 2421 uintptr_t addr = tb->jmp_target_addr[n]; 2422 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2423 /* no need to flush icache explicitly */ 2424} 2425 2426static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2427 const TCGArg args[TCG_MAX_OP_ARGS], 2428 const int const_args[TCG_MAX_OP_ARGS]) 2429{ 2430 TCGArg a0, a1, a2; 2431 int c, const_a2, vexop, rexw = 0; 2432 2433#if TCG_TARGET_REG_BITS == 64 2434# define OP_32_64(x) \ 2435 case glue(glue(INDEX_op_, x), _i64): \ 2436 rexw = P_REXW; /* FALLTHRU */ \ 2437 case glue(glue(INDEX_op_, x), _i32) 2438#else 2439# define OP_32_64(x) \ 2440 case glue(glue(INDEX_op_, x), _i32) 2441#endif 2442 2443 /* Hoist the loads of the most common arguments. */ 2444 a0 = args[0]; 2445 a1 = args[1]; 2446 a2 = args[2]; 2447 const_a2 = const_args[2]; 2448 2449 switch (opc) { 2450 case INDEX_op_goto_ptr: 2451 /* jmp to the given host address (could be epilogue) */ 2452 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2453 break; 2454 case INDEX_op_br: 2455 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2456 break; 2457 OP_32_64(ld8u): 2458 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2459 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2460 break; 2461 OP_32_64(ld8s): 2462 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2463 break; 2464 OP_32_64(ld16u): 2465 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2466 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2467 break; 2468 OP_32_64(ld16s): 2469 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2470 break; 2471#if TCG_TARGET_REG_BITS == 64 2472 case INDEX_op_ld32u_i64: 2473#endif 2474 case INDEX_op_ld_i32: 2475 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2476 break; 2477 2478 OP_32_64(st8): 2479 if (const_args[0]) { 2480 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2481 tcg_out8(s, a0); 2482 } else { 2483 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2484 } 2485 break; 2486 OP_32_64(st16): 2487 if (const_args[0]) { 2488 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2489 tcg_out16(s, a0); 2490 } else { 2491 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2492 } 2493 break; 2494#if TCG_TARGET_REG_BITS == 64 2495 case INDEX_op_st32_i64: 2496#endif 2497 case INDEX_op_st_i32: 2498 if (const_args[0]) { 2499 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2500 tcg_out32(s, a0); 2501 } else { 2502 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2503 } 2504 break; 2505 2506 OP_32_64(add): 2507 /* For 3-operand addition, use LEA. 
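   LEA computes a0 = a1 + a2 (or a1 + imm) without clobbering either source and without touching the flags.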
*/ 2508 if (a0 != a1) { 2509 TCGArg c3 = 0; 2510 if (const_a2) { 2511 c3 = a2, a2 = -1; 2512 } else if (a0 == a2) { 2513 /* Watch out for dest = src + dest, since we've removed 2514 the matching constraint on the add. */ 2515 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2516 break; 2517 } 2518 2519 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2520 break; 2521 } 2522 c = ARITH_ADD; 2523 goto gen_arith; 2524 OP_32_64(sub): 2525 c = ARITH_SUB; 2526 goto gen_arith; 2527 OP_32_64(and): 2528 c = ARITH_AND; 2529 goto gen_arith; 2530 OP_32_64(or): 2531 c = ARITH_OR; 2532 goto gen_arith; 2533 OP_32_64(xor): 2534 c = ARITH_XOR; 2535 goto gen_arith; 2536 gen_arith: 2537 if (const_a2) { 2538 tgen_arithi(s, c + rexw, a0, a2, 0); 2539 } else { 2540 tgen_arithr(s, c + rexw, a0, a2); 2541 } 2542 break; 2543 2544 OP_32_64(andc): 2545 if (const_a2) { 2546 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2547 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2548 } else { 2549 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2550 } 2551 break; 2552 2553 OP_32_64(mul): 2554 if (const_a2) { 2555 int32_t val; 2556 val = a2; 2557 if (val == (int8_t)val) { 2558 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2559 tcg_out8(s, val); 2560 } else { 2561 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2562 tcg_out32(s, val); 2563 } 2564 } else { 2565 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2566 } 2567 break; 2568 2569 OP_32_64(div2): 2570 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2571 break; 2572 OP_32_64(divu2): 2573 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2574 break; 2575 2576 OP_32_64(shl): 2577 /* For small constant 3-operand shift, use LEA. */ 2578 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2579 if (a2 - 1 == 0) { 2580 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2581 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2582 } else { 2583 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2584 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2585 } 2586 break; 2587 } 2588 c = SHIFT_SHL; 2589 vexop = OPC_SHLX; 2590 goto gen_shift_maybe_vex; 2591 OP_32_64(shr): 2592 c = SHIFT_SHR; 2593 vexop = OPC_SHRX; 2594 goto gen_shift_maybe_vex; 2595 OP_32_64(sar): 2596 c = SHIFT_SAR; 2597 vexop = OPC_SARX; 2598 goto gen_shift_maybe_vex; 2599 OP_32_64(rotl): 2600 c = SHIFT_ROL; 2601 goto gen_shift; 2602 OP_32_64(rotr): 2603 c = SHIFT_ROR; 2604 goto gen_shift; 2605 gen_shift_maybe_vex: 2606 if (have_bmi2) { 2607 if (!const_a2) { 2608 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2609 break; 2610 } 2611 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2612 } 2613 /* FALLTHRU */ 2614 gen_shift: 2615 if (const_a2) { 2616 tcg_out_shifti(s, c + rexw, a0, a2); 2617 } else { 2618 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2619 } 2620 break; 2621 2622 OP_32_64(ctz): 2623 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2624 break; 2625 OP_32_64(clz): 2626 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2627 break; 2628 OP_32_64(ctpop): 2629 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2630 break; 2631 2632 case INDEX_op_brcond_i32: 2633 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2634 break; 2635 case INDEX_op_setcond_i32: 2636 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2); 2637 break; 2638 case INDEX_op_movcond_i32: 2639 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]); 2640 break; 2641 2642 OP_32_64(bswap16): 2643 if (a2 & TCG_BSWAP_OS) { 2644 /* Output must be sign-extended. */ 2645 if (rexw) { 2646 tcg_out_bswap64(s, a0); 2647 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2648 } else { 2649 tcg_out_bswap32(s, a0); 2650 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2651 } 2652 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2653 /* Output must be zero-extended, but input isn't. */ 2654 tcg_out_bswap32(s, a0); 2655 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2656 } else { 2657 tcg_out_rolw_8(s, a0); 2658 } 2659 break; 2660 OP_32_64(bswap32): 2661 tcg_out_bswap32(s, a0); 2662 if (rexw && (a2 & TCG_BSWAP_OS)) { 2663 tcg_out_ext32s(s, a0, a0); 2664 } 2665 break; 2666 2667 OP_32_64(neg): 2668 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2669 break; 2670 OP_32_64(not): 2671 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2672 break; 2673 2674 OP_32_64(ext8s): 2675 tcg_out_ext8s(s, a0, a1, rexw); 2676 break; 2677 OP_32_64(ext16s): 2678 tcg_out_ext16s(s, a0, a1, rexw); 2679 break; 2680 OP_32_64(ext8u): 2681 tcg_out_ext8u(s, a0, a1); 2682 break; 2683 OP_32_64(ext16u): 2684 tcg_out_ext16u(s, a0, a1); 2685 break; 2686 2687 case INDEX_op_qemu_ld_i32: 2688 tcg_out_qemu_ld(s, args, 0); 2689 break; 2690 case INDEX_op_qemu_ld_i64: 2691 tcg_out_qemu_ld(s, args, 1); 2692 break; 2693 case INDEX_op_qemu_st_i32: 2694 case INDEX_op_qemu_st8_i32: 2695 tcg_out_qemu_st(s, args, 0); 2696 break; 2697 case INDEX_op_qemu_st_i64: 2698 tcg_out_qemu_st(s, args, 1); 2699 break; 2700 2701 OP_32_64(mulu2): 2702 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2703 break; 2704 OP_32_64(muls2): 2705 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2706 break; 2707 OP_32_64(add2): 2708 if (const_args[4]) { 2709 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2710 } else { 2711 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2712 } 2713 if (const_args[5]) { 2714 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2715 } else { 2716 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2717 } 2718 break; 2719 OP_32_64(sub2): 2720 if (const_args[4]) { 2721 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2722 } else { 2723 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2724 } 2725 if (const_args[5]) { 2726 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2727 } else { 2728 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2729 } 2730 break; 2731 2732#if TCG_TARGET_REG_BITS == 32 2733 case INDEX_op_brcond2_i32: 2734 tcg_out_brcond2(s, args, const_args, 0); 2735 break; 2736 case INDEX_op_setcond2_i32: 2737 tcg_out_setcond2(s, args, const_args); 2738 break; 2739#else /* TCG_TARGET_REG_BITS == 64 */ 2740 case INDEX_op_ld32s_i64: 2741 
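        /* Sign-extending 32-bit load: MOVSLQ does the widening in one insn. */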
tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2742 break; 2743 case INDEX_op_ld_i64: 2744 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2745 break; 2746 case INDEX_op_st_i64: 2747 if (const_args[0]) { 2748 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2749 tcg_out32(s, a0); 2750 } else { 2751 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2752 } 2753 break; 2754 2755 case INDEX_op_brcond_i64: 2756 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2757 break; 2758 case INDEX_op_setcond_i64: 2759 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2); 2760 break; 2761 case INDEX_op_movcond_i64: 2762 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]); 2763 break; 2764 2765 case INDEX_op_bswap64_i64: 2766 tcg_out_bswap64(s, a0); 2767 break; 2768 case INDEX_op_extu_i32_i64: 2769 case INDEX_op_ext32u_i64: 2770 case INDEX_op_extrl_i64_i32: 2771 tcg_out_ext32u(s, a0, a1); 2772 break; 2773 case INDEX_op_ext_i32_i64: 2774 case INDEX_op_ext32s_i64: 2775 tcg_out_ext32s(s, a0, a1); 2776 break; 2777 case INDEX_op_extrh_i64_i32: 2778 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2779 break; 2780#endif 2781 2782 OP_32_64(deposit): 2783 if (args[3] == 0 && args[4] == 8) { 2784 /* load bits 0..7 */ 2785 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2786 } else if (args[3] == 8 && args[4] == 8) { 2787 /* load bits 8..15 */ 2788 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2789 } else if (args[3] == 0 && args[4] == 16) { 2790 /* load bits 0..15 */ 2791 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2792 } else { 2793 tcg_abort(); 2794 } 2795 break; 2796 2797 case INDEX_op_extract_i64: 2798 if (a2 + args[3] == 32) { 2799 /* This is a 32-bit zero-extending right shift. */ 2800 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2801 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2802 break; 2803 } 2804 /* FALLTHRU */ 2805 case INDEX_op_extract_i32: 2806 /* On the off-chance that we can use the high-byte registers. 2807 Otherwise we emit the same ext16 + shift pattern that we 2808 would have gotten from the normal tcg-op.c expansion. */ 2809 tcg_debug_assert(a2 == 8 && args[3] == 8); 2810 if (a1 < 4 && a0 < 8) { 2811 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2812 } else { 2813 tcg_out_ext16u(s, a0, a1); 2814 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2815 } 2816 break; 2817 2818 case INDEX_op_sextract_i32: 2819 /* We don't implement sextract_i64, as we cannot sign-extend to 2820 64-bits without using the REX prefix that explicitly excludes 2821 access to the high-byte registers. */ 2822 tcg_debug_assert(a2 == 8 && args[3] == 8); 2823 if (a1 < 4 && a0 < 8) { 2824 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2825 } else { 2826 tcg_out_ext16s(s, a0, a1, 0); 2827 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2828 } 2829 break; 2830 2831 OP_32_64(extract2): 2832 /* Note that SHRD outputs to the r/m operand. */ 2833 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 2834 tcg_out8(s, args[3]); 2835 break; 2836 2837 case INDEX_op_mb: 2838 tcg_out_mb(s, a0); 2839 break; 2840 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 2841 case INDEX_op_mov_i64: 2842 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 2843 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 2844 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. 
*/ 2845 default: 2846 tcg_abort(); 2847 } 2848 2849#undef OP_32_64 2850} 2851 2852static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 2853 unsigned vecl, unsigned vece, 2854 const TCGArg args[TCG_MAX_OP_ARGS], 2855 const int const_args[TCG_MAX_OP_ARGS]) 2856{ 2857 static int const add_insn[4] = { 2858 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 2859 }; 2860 static int const ssadd_insn[4] = { 2861 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 2862 }; 2863 static int const usadd_insn[4] = { 2864 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 2865 }; 2866 static int const sub_insn[4] = { 2867 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 2868 }; 2869 static int const sssub_insn[4] = { 2870 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 2871 }; 2872 static int const ussub_insn[4] = { 2873 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 2874 }; 2875 static int const mul_insn[4] = { 2876 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 2877 }; 2878 static int const shift_imm_insn[4] = { 2879 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 2880 }; 2881 static int const cmpeq_insn[4] = { 2882 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 2883 }; 2884 static int const cmpgt_insn[4] = { 2885 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 2886 }; 2887 static int const punpckl_insn[4] = { 2888 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 2889 }; 2890 static int const punpckh_insn[4] = { 2891 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 2892 }; 2893 static int const packss_insn[4] = { 2894 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 2895 }; 2896 static int const packus_insn[4] = { 2897 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 2898 }; 2899 static int const smin_insn[4] = { 2900 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 2901 }; 2902 static int const smax_insn[4] = { 2903 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 2904 }; 2905 static int const umin_insn[4] = { 2906 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 2907 }; 2908 static int const umax_insn[4] = { 2909 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 2910 }; 2911 static int const rotlv_insn[4] = { 2912 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 2913 }; 2914 static int const rotrv_insn[4] = { 2915 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 2916 }; 2917 static int const shlv_insn[4] = { 2918 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 2919 }; 2920 static int const shrv_insn[4] = { 2921 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 2922 }; 2923 static int const sarv_insn[4] = { 2924 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 2925 }; 2926 static int const shls_insn[4] = { 2927 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 2928 }; 2929 static int const shrs_insn[4] = { 2930 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 2931 }; 2932 static int const sars_insn[4] = { 2933 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 2934 }; 2935 static int const vpshldi_insn[4] = { 2936 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 2937 }; 2938 static int const vpshldv_insn[4] = { 2939 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 2940 }; 2941 static int const vpshrdv_insn[4] = { 2942 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 2943 }; 2944 static int const abs_insn[4] = { 2945 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 2946 }; 2947 2948 TCGType type = vecl + TCG_TYPE_V64; 2949 int insn, sub; 2950 TCGArg a0, a1, a2, a3; 2951 2952 a0 = args[0]; 2953 a1 = args[1]; 2954 a2 = args[2]; 2955 2956 switch (opc) { 2957 case INDEX_op_add_vec: 2958 insn = add_insn[vece]; 2959 goto gen_simd; 2960 case INDEX_op_ssadd_vec: 2961 insn = 
ssadd_insn[vece]; 2962 goto gen_simd; 2963 case INDEX_op_usadd_vec: 2964 insn = usadd_insn[vece]; 2965 goto gen_simd; 2966 case INDEX_op_sub_vec: 2967 insn = sub_insn[vece]; 2968 goto gen_simd; 2969 case INDEX_op_sssub_vec: 2970 insn = sssub_insn[vece]; 2971 goto gen_simd; 2972 case INDEX_op_ussub_vec: 2973 insn = ussub_insn[vece]; 2974 goto gen_simd; 2975 case INDEX_op_mul_vec: 2976 insn = mul_insn[vece]; 2977 goto gen_simd; 2978 case INDEX_op_and_vec: 2979 insn = OPC_PAND; 2980 goto gen_simd; 2981 case INDEX_op_or_vec: 2982 insn = OPC_POR; 2983 goto gen_simd; 2984 case INDEX_op_xor_vec: 2985 insn = OPC_PXOR; 2986 goto gen_simd; 2987 case INDEX_op_smin_vec: 2988 insn = smin_insn[vece]; 2989 goto gen_simd; 2990 case INDEX_op_umin_vec: 2991 insn = umin_insn[vece]; 2992 goto gen_simd; 2993 case INDEX_op_smax_vec: 2994 insn = smax_insn[vece]; 2995 goto gen_simd; 2996 case INDEX_op_umax_vec: 2997 insn = umax_insn[vece]; 2998 goto gen_simd; 2999 case INDEX_op_shlv_vec: 3000 insn = shlv_insn[vece]; 3001 goto gen_simd; 3002 case INDEX_op_shrv_vec: 3003 insn = shrv_insn[vece]; 3004 goto gen_simd; 3005 case INDEX_op_sarv_vec: 3006 insn = sarv_insn[vece]; 3007 goto gen_simd; 3008 case INDEX_op_rotlv_vec: 3009 insn = rotlv_insn[vece]; 3010 goto gen_simd; 3011 case INDEX_op_rotrv_vec: 3012 insn = rotrv_insn[vece]; 3013 goto gen_simd; 3014 case INDEX_op_shls_vec: 3015 insn = shls_insn[vece]; 3016 goto gen_simd; 3017 case INDEX_op_shrs_vec: 3018 insn = shrs_insn[vece]; 3019 goto gen_simd; 3020 case INDEX_op_sars_vec: 3021 insn = sars_insn[vece]; 3022 goto gen_simd; 3023 case INDEX_op_x86_punpckl_vec: 3024 insn = punpckl_insn[vece]; 3025 goto gen_simd; 3026 case INDEX_op_x86_punpckh_vec: 3027 insn = punpckh_insn[vece]; 3028 goto gen_simd; 3029 case INDEX_op_x86_packss_vec: 3030 insn = packss_insn[vece]; 3031 goto gen_simd; 3032 case INDEX_op_x86_packus_vec: 3033 insn = packus_insn[vece]; 3034 goto gen_simd; 3035 case INDEX_op_x86_vpshldv_vec: 3036 insn = vpshldv_insn[vece]; 3037 a1 = a2; 3038 a2 = args[3]; 3039 goto gen_simd; 3040 case INDEX_op_x86_vpshrdv_vec: 3041 insn = vpshrdv_insn[vece]; 3042 a1 = a2; 3043 a2 = args[3]; 3044 goto gen_simd; 3045#if TCG_TARGET_REG_BITS == 32 3046 case INDEX_op_dup2_vec: 3047 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3048 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3049 /* Then replicate the 64-bit elements across the rest of the vector. 
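   For V64 nothing further is needed; otherwise tcg_out_dup_vec broadcasts that element to the full width of the vector type.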
*/ 3050 if (type != TCG_TYPE_V64) { 3051 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3052 } 3053 break; 3054#endif 3055 case INDEX_op_abs_vec: 3056 insn = abs_insn[vece]; 3057 a2 = a1; 3058 a1 = 0; 3059 goto gen_simd; 3060 gen_simd: 3061 tcg_debug_assert(insn != OPC_UD2); 3062 if (type == TCG_TYPE_V256) { 3063 insn |= P_VEXL; 3064 } 3065 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3066 break; 3067 3068 case INDEX_op_cmp_vec: 3069 sub = args[3]; 3070 if (sub == TCG_COND_EQ) { 3071 insn = cmpeq_insn[vece]; 3072 } else if (sub == TCG_COND_GT) { 3073 insn = cmpgt_insn[vece]; 3074 } else { 3075 g_assert_not_reached(); 3076 } 3077 goto gen_simd; 3078 3079 case INDEX_op_andc_vec: 3080 insn = OPC_PANDN; 3081 if (type == TCG_TYPE_V256) { 3082 insn |= P_VEXL; 3083 } 3084 tcg_out_vex_modrm(s, insn, a0, a2, a1); 3085 break; 3086 3087 case INDEX_op_shli_vec: 3088 insn = shift_imm_insn[vece]; 3089 sub = 6; 3090 goto gen_shift; 3091 case INDEX_op_shri_vec: 3092 insn = shift_imm_insn[vece]; 3093 sub = 2; 3094 goto gen_shift; 3095 case INDEX_op_sari_vec: 3096 if (vece == MO_64) { 3097 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3098 } else { 3099 insn = shift_imm_insn[vece]; 3100 } 3101 sub = 4; 3102 goto gen_shift; 3103 case INDEX_op_rotli_vec: 3104 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3105 if (vece == MO_64) { 3106 insn |= P_VEXW; 3107 } 3108 sub = 1; 3109 goto gen_shift; 3110 gen_shift: 3111 tcg_debug_assert(vece != MO_8); 3112 if (type == TCG_TYPE_V256) { 3113 insn |= P_VEXL; 3114 } 3115 tcg_out_vex_modrm(s, insn, sub, a0, a1); 3116 tcg_out8(s, a2); 3117 break; 3118 3119 case INDEX_op_ld_vec: 3120 tcg_out_ld(s, type, a0, a1, a2); 3121 break; 3122 case INDEX_op_st_vec: 3123 tcg_out_st(s, type, a0, a1, a2); 3124 break; 3125 case INDEX_op_dupm_vec: 3126 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3127 break; 3128 3129 case INDEX_op_x86_shufps_vec: 3130 insn = OPC_SHUFPS; 3131 sub = args[3]; 3132 goto gen_simd_imm8; 3133 case INDEX_op_x86_blend_vec: 3134 if (vece == MO_16) { 3135 insn = OPC_PBLENDW; 3136 } else if (vece == MO_32) { 3137 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 3138 } else { 3139 g_assert_not_reached(); 3140 } 3141 sub = args[3]; 3142 goto gen_simd_imm8; 3143 case INDEX_op_x86_vperm2i128_vec: 3144 insn = OPC_VPERM2I128; 3145 sub = args[3]; 3146 goto gen_simd_imm8; 3147 case INDEX_op_x86_vpshldi_vec: 3148 insn = vpshldi_insn[vece]; 3149 sub = args[3]; 3150 goto gen_simd_imm8; 3151 3152 case INDEX_op_not_vec: 3153 insn = OPC_VPTERNLOGQ; 3154 a2 = a1; 3155 sub = 0x33; /* !B */ 3156 goto gen_simd_imm8; 3157 case INDEX_op_nor_vec: 3158 insn = OPC_VPTERNLOGQ; 3159 sub = 0x11; /* norCB */ 3160 goto gen_simd_imm8; 3161 case INDEX_op_nand_vec: 3162 insn = OPC_VPTERNLOGQ; 3163 sub = 0x77; /* nandCB */ 3164 goto gen_simd_imm8; 3165 case INDEX_op_eqv_vec: 3166 insn = OPC_VPTERNLOGQ; 3167 sub = 0x99; /* xnorCB */ 3168 goto gen_simd_imm8; 3169 case INDEX_op_orc_vec: 3170 insn = OPC_VPTERNLOGQ; 3171 sub = 0xdd; /* orB!C */ 3172 goto gen_simd_imm8; 3173 3174 case INDEX_op_bitsel_vec: 3175 insn = OPC_VPTERNLOGQ; 3176 a3 = args[3]; 3177 if (a0 == a1) { 3178 a1 = a2; 3179 a2 = a3; 3180 sub = 0xca; /* A?B:C */ 3181 } else if (a0 == a2) { 3182 a2 = a3; 3183 sub = 0xe2; /* B?A:C */ 3184 } else { 3185 tcg_out_mov(s, type, a0, a3); 3186 sub = 0xb8; /* B?C:A */ 3187 } 3188 goto gen_simd_imm8; 3189 3190 gen_simd_imm8: 3191 tcg_debug_assert(insn != OPC_UD2); 3192 if (type == TCG_TYPE_V256) { 3193 insn |= P_VEXL; 3194 } 3195 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3196 tcg_out8(s, sub); 3197 break; 3198 3199 case INDEX_op_x86_vpblendvb_vec: 3200 insn = OPC_VPBLENDVB; 3201 if (type == TCG_TYPE_V256) { 3202 insn |= P_VEXL; 3203 } 3204 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3205 tcg_out8(s, args[3] << 4); 3206 break; 3207 3208 case INDEX_op_x86_psrldq_vec: 3209 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3210 tcg_out8(s, a2); 3211 break; 3212 3213 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3214 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 3215 default: 3216 g_assert_not_reached(); 3217 } 3218} 3219 3220static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3221{ 3222 switch (op) { 3223 case INDEX_op_goto_ptr: 3224 return C_O0_I1(r); 3225 3226 case INDEX_op_ld8u_i32: 3227 case INDEX_op_ld8u_i64: 3228 case INDEX_op_ld8s_i32: 3229 case INDEX_op_ld8s_i64: 3230 case INDEX_op_ld16u_i32: 3231 case INDEX_op_ld16u_i64: 3232 case INDEX_op_ld16s_i32: 3233 case INDEX_op_ld16s_i64: 3234 case INDEX_op_ld_i32: 3235 case INDEX_op_ld32u_i64: 3236 case INDEX_op_ld32s_i64: 3237 case INDEX_op_ld_i64: 3238 return C_O1_I1(r, r); 3239 3240 case INDEX_op_st8_i32: 3241 case INDEX_op_st8_i64: 3242 return C_O0_I2(qi, r); 3243 3244 case INDEX_op_st16_i32: 3245 case INDEX_op_st16_i64: 3246 case INDEX_op_st_i32: 3247 case INDEX_op_st32_i64: 3248 return C_O0_I2(ri, r); 3249 3250 case INDEX_op_st_i64: 3251 return C_O0_I2(re, r); 3252 3253 case INDEX_op_add_i32: 3254 case INDEX_op_add_i64: 3255 return C_O1_I2(r, r, re); 3256 3257 case INDEX_op_sub_i32: 3258 case INDEX_op_sub_i64: 3259 case INDEX_op_mul_i32: 3260 case INDEX_op_mul_i64: 3261 case INDEX_op_or_i32: 3262 case INDEX_op_or_i64: 3263 case INDEX_op_xor_i32: 3264 case INDEX_op_xor_i64: 3265 return C_O1_I2(r, 0, re); 3266 3267 case INDEX_op_and_i32: 3268 case INDEX_op_and_i64: 3269 return C_O1_I2(r, 0, reZ); 3270 3271 case INDEX_op_andc_i32: 3272 case INDEX_op_andc_i64: 3273 return C_O1_I2(r, r, rI); 3274 3275 case INDEX_op_shl_i32: 3276 case INDEX_op_shl_i64: 3277 case INDEX_op_shr_i32: 3278 case INDEX_op_shr_i64: 3279 case INDEX_op_sar_i32: 3280 case INDEX_op_sar_i64: 3281 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3282 3283 case INDEX_op_rotl_i32: 3284 case INDEX_op_rotl_i64: 3285 case INDEX_op_rotr_i32: 3286 case INDEX_op_rotr_i64: 3287 return C_O1_I2(r, 0, ci); 3288 3289 case INDEX_op_brcond_i32: 3290 case INDEX_op_brcond_i64: 3291 return C_O0_I2(r, re); 3292 3293 case INDEX_op_bswap16_i32: 3294 case INDEX_op_bswap16_i64: 3295 case INDEX_op_bswap32_i32: 3296 case INDEX_op_bswap32_i64: 3297 case INDEX_op_bswap64_i64: 3298 case INDEX_op_neg_i32: 3299 case INDEX_op_neg_i64: 3300 case INDEX_op_not_i32: 3301 case INDEX_op_not_i64: 3302 case INDEX_op_extrh_i64_i32: 3303 return C_O1_I1(r, 0); 3304 3305 case INDEX_op_ext8s_i32: 3306 case INDEX_op_ext8s_i64: 3307 case INDEX_op_ext8u_i32: 3308 case INDEX_op_ext8u_i64: 3309 return C_O1_I1(r, q); 3310 3311 case INDEX_op_ext16s_i32: 3312 case INDEX_op_ext16s_i64: 3313 case INDEX_op_ext16u_i32: 3314 case INDEX_op_ext16u_i64: 3315 case INDEX_op_ext32s_i64: 3316 case INDEX_op_ext32u_i64: 3317 case INDEX_op_ext_i32_i64: 3318 case INDEX_op_extu_i32_i64: 3319 case INDEX_op_extrl_i64_i32: 3320 case INDEX_op_extract_i32: 3321 case INDEX_op_extract_i64: 3322 case INDEX_op_sextract_i32: 3323 case INDEX_op_ctpop_i32: 3324 case INDEX_op_ctpop_i64: 3325 return C_O1_I1(r, r); 3326 3327 case INDEX_op_extract2_i32: 3328 case INDEX_op_extract2_i64: 3329 return C_O1_I2(r, 0, r); 3330 3331 case INDEX_op_deposit_i32: 3332 case INDEX_op_deposit_i64: 3333 return C_O1_I2(Q, 0, Q); 3334 3335 case INDEX_op_setcond_i32: 3336 case INDEX_op_setcond_i64: 3337 return C_O1_I2(q, r, re); 3338 3339 case INDEX_op_movcond_i32: 3340 case INDEX_op_movcond_i64: 3341 return C_O1_I4(r, r, re, r, 0); 3342 3343 case INDEX_op_div2_i32: 3344 case INDEX_op_div2_i64: 3345 case INDEX_op_divu2_i32: 3346 case INDEX_op_divu2_i64: 3347 return C_O2_I3(a, d, 0, 1, r); 3348 3349 case INDEX_op_mulu2_i32: 3350 case INDEX_op_mulu2_i64: 3351 case INDEX_op_muls2_i32: 3352 case 
INDEX_op_muls2_i64: 3353 return C_O2_I2(a, d, a, r); 3354 3355 case INDEX_op_add2_i32: 3356 case INDEX_op_add2_i64: 3357 case INDEX_op_sub2_i32: 3358 case INDEX_op_sub2_i64: 3359 return C_O2_I4(r, r, 0, 1, re, re); 3360 3361 case INDEX_op_ctz_i32: 3362 case INDEX_op_ctz_i64: 3363 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3364 3365 case INDEX_op_clz_i32: 3366 case INDEX_op_clz_i64: 3367 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3368 3369 case INDEX_op_qemu_ld_i32: 3370 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS 3371 ? C_O1_I1(r, L) : C_O1_I2(r, L, L)); 3372 3373 case INDEX_op_qemu_st_i32: 3374 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS 3375 ? C_O0_I2(L, L) : C_O0_I3(L, L, L)); 3376 case INDEX_op_qemu_st8_i32: 3377 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS 3378 ? C_O0_I2(s, L) : C_O0_I3(s, L, L)); 3379 3380 case INDEX_op_qemu_ld_i64: 3381 return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) 3382 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L) 3383 : C_O2_I2(r, r, L, L)); 3384 3385 case INDEX_op_qemu_st_i64: 3386 return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) 3387 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L) 3388 : C_O0_I4(L, L, L, L)); 3389 3390 case INDEX_op_brcond2_i32: 3391 return C_O0_I4(r, r, ri, ri); 3392 3393 case INDEX_op_setcond2_i32: 3394 return C_O1_I4(r, r, r, ri, ri); 3395 3396 case INDEX_op_ld_vec: 3397 case INDEX_op_dupm_vec: 3398 return C_O1_I1(x, r); 3399 3400 case INDEX_op_st_vec: 3401 return C_O0_I2(x, r); 3402 3403 case INDEX_op_add_vec: 3404 case INDEX_op_sub_vec: 3405 case INDEX_op_mul_vec: 3406 case INDEX_op_and_vec: 3407 case INDEX_op_or_vec: 3408 case INDEX_op_xor_vec: 3409 case INDEX_op_andc_vec: 3410 case INDEX_op_orc_vec: 3411 case INDEX_op_nand_vec: 3412 case INDEX_op_nor_vec: 3413 case INDEX_op_eqv_vec: 3414 case INDEX_op_ssadd_vec: 3415 case INDEX_op_usadd_vec: 3416 case INDEX_op_sssub_vec: 3417 case INDEX_op_ussub_vec: 3418 case INDEX_op_smin_vec: 3419 case INDEX_op_umin_vec: 3420 case INDEX_op_smax_vec: 3421 case INDEX_op_umax_vec: 3422 case INDEX_op_shlv_vec: 3423 case INDEX_op_shrv_vec: 3424 case INDEX_op_sarv_vec: 3425 case INDEX_op_rotlv_vec: 3426 case INDEX_op_rotrv_vec: 3427 case INDEX_op_shls_vec: 3428 case INDEX_op_shrs_vec: 3429 case INDEX_op_sars_vec: 3430 case INDEX_op_cmp_vec: 3431 case INDEX_op_x86_shufps_vec: 3432 case INDEX_op_x86_blend_vec: 3433 case INDEX_op_x86_packss_vec: 3434 case INDEX_op_x86_packus_vec: 3435 case INDEX_op_x86_vperm2i128_vec: 3436 case INDEX_op_x86_punpckl_vec: 3437 case INDEX_op_x86_punpckh_vec: 3438 case INDEX_op_x86_vpshldi_vec: 3439#if TCG_TARGET_REG_BITS == 32 3440 case INDEX_op_dup2_vec: 3441#endif 3442 return C_O1_I2(x, x, x); 3443 3444 case INDEX_op_abs_vec: 3445 case INDEX_op_dup_vec: 3446 case INDEX_op_not_vec: 3447 case INDEX_op_shli_vec: 3448 case INDEX_op_shri_vec: 3449 case INDEX_op_sari_vec: 3450 case INDEX_op_rotli_vec: 3451 case INDEX_op_x86_psrldq_vec: 3452 return C_O1_I1(x, x); 3453 3454 case INDEX_op_x86_vpshldv_vec: 3455 case INDEX_op_x86_vpshrdv_vec: 3456 return C_O1_I3(x, 0, x, x); 3457 3458 case INDEX_op_bitsel_vec: 3459 case INDEX_op_x86_vpblendvb_vec: 3460 return C_O1_I3(x, x, x, x); 3461 3462 default: 3463 g_assert_not_reached(); 3464 } 3465} 3466 3467int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3468{ 3469 switch (opc) { 3470 case INDEX_op_add_vec: 3471 case INDEX_op_sub_vec: 3472 case INDEX_op_and_vec: 3473 case INDEX_op_or_vec: 3474 case INDEX_op_xor_vec: 3475 case INDEX_op_andc_vec: 3476 case 
INDEX_op_orc_vec: 3477 case INDEX_op_nand_vec: 3478 case INDEX_op_nor_vec: 3479 case INDEX_op_eqv_vec: 3480 case INDEX_op_not_vec: 3481 case INDEX_op_bitsel_vec: 3482 return 1; 3483 case INDEX_op_cmp_vec: 3484 case INDEX_op_cmpsel_vec: 3485 return -1; 3486 3487 case INDEX_op_rotli_vec: 3488 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3489 3490 case INDEX_op_shli_vec: 3491 case INDEX_op_shri_vec: 3492 /* We must expand the operation for MO_8. */ 3493 return vece == MO_8 ? -1 : 1; 3494 3495 case INDEX_op_sari_vec: 3496 switch (vece) { 3497 case MO_8: 3498 return -1; 3499 case MO_16: 3500 case MO_32: 3501 return 1; 3502 case MO_64: 3503 if (have_avx512vl) { 3504 return 1; 3505 } 3506 /* 3507 * We can emulate this for MO_64, but it does not pay off 3508 * unless we're producing at least 4 values. 3509 */ 3510 return type >= TCG_TYPE_V256 ? -1 : 0; 3511 } 3512 return 0; 3513 3514 case INDEX_op_shls_vec: 3515 case INDEX_op_shrs_vec: 3516 return vece >= MO_16; 3517 case INDEX_op_sars_vec: 3518 switch (vece) { 3519 case MO_16: 3520 case MO_32: 3521 return 1; 3522 case MO_64: 3523 return have_avx512vl; 3524 } 3525 return 0; 3526 case INDEX_op_rotls_vec: 3527 return vece >= MO_16 ? -1 : 0; 3528 3529 case INDEX_op_shlv_vec: 3530 case INDEX_op_shrv_vec: 3531 switch (vece) { 3532 case MO_16: 3533 return have_avx512bw; 3534 case MO_32: 3535 case MO_64: 3536 return have_avx2; 3537 } 3538 return 0; 3539 case INDEX_op_sarv_vec: 3540 switch (vece) { 3541 case MO_16: 3542 return have_avx512bw; 3543 case MO_32: 3544 return have_avx2; 3545 case MO_64: 3546 return have_avx512vl; 3547 } 3548 return 0; 3549 case INDEX_op_rotlv_vec: 3550 case INDEX_op_rotrv_vec: 3551 switch (vece) { 3552 case MO_16: 3553 return have_avx512vbmi2 ? -1 : 0; 3554 case MO_32: 3555 case MO_64: 3556 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 3557 } 3558 return 0; 3559 3560 case INDEX_op_mul_vec: 3561 switch (vece) { 3562 case MO_8: 3563 return -1; 3564 case MO_64: 3565 return have_avx512dq; 3566 } 3567 return 1; 3568 3569 case INDEX_op_ssadd_vec: 3570 case INDEX_op_usadd_vec: 3571 case INDEX_op_sssub_vec: 3572 case INDEX_op_ussub_vec: 3573 return vece <= MO_16; 3574 case INDEX_op_smin_vec: 3575 case INDEX_op_smax_vec: 3576 case INDEX_op_umin_vec: 3577 case INDEX_op_umax_vec: 3578 case INDEX_op_abs_vec: 3579 return vece <= MO_32 || have_avx512vl; 3580 3581 default: 3582 return 0; 3583 } 3584} 3585 3586static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc, 3587 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3588{ 3589 TCGv_vec t1, t2; 3590 3591 tcg_debug_assert(vece == MO_8); 3592 3593 t1 = tcg_temp_new_vec(type); 3594 t2 = tcg_temp_new_vec(type); 3595 3596 /* 3597 * Unpack to W, shift, and repack. Tricky bits: 3598 * (1) Use punpck*bw x,x to produce DDCCBBAA, 3599 * i.e. duplicate in other half of the 16-bit lane. 3600 * (2) For right-shift, add 8 so that the high half of the lane 3601 * becomes zero. For left-shift, and left-rotate, we must 3602 * shift up and down again. 3603 * (3) Step 2 leaves high half zero such that PACKUSWB 3604 * (pack with unsigned saturation) does not modify 3605 * the quantity. 
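     * For example, a right shift by 3 becomes a 16-bit shift by 11, leaving x >> 3 in the low byte of each lane and zeros above it.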
3606 */ 3607 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3608 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3609 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3610 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3611 3612 if (opc != INDEX_op_rotli_vec) { 3613 imm += 8; 3614 } 3615 if (opc == INDEX_op_shri_vec) { 3616 tcg_gen_shri_vec(MO_16, t1, t1, imm); 3617 tcg_gen_shri_vec(MO_16, t2, t2, imm); 3618 } else { 3619 tcg_gen_shli_vec(MO_16, t1, t1, imm); 3620 tcg_gen_shli_vec(MO_16, t2, t2, imm); 3621 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3622 tcg_gen_shri_vec(MO_16, t2, t2, 8); 3623 } 3624 3625 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3626 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3627 tcg_temp_free_vec(t1); 3628 tcg_temp_free_vec(t2); 3629} 3630 3631static void expand_vec_sari(TCGType type, unsigned vece, 3632 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3633{ 3634 TCGv_vec t1, t2; 3635 3636 switch (vece) { 3637 case MO_8: 3638 /* Unpack to W, shift, and repack, as in expand_vec_shi. */ 3639 t1 = tcg_temp_new_vec(type); 3640 t2 = tcg_temp_new_vec(type); 3641 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3642 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3643 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3644 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3645 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3646 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3647 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3648 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3649 tcg_temp_free_vec(t1); 3650 tcg_temp_free_vec(t2); 3651 break; 3652 3653 case MO_64: 3654 if (imm <= 32) { 3655 /* 3656 * We can emulate a small sign extend by performing an arithmetic 3657 * 32-bit shift and overwriting the high half of a 64-bit logical 3658 * shift. Note that the ISA says shift of 32 is valid, but TCG 3659 * does not, so we have to bound the smaller shift -- we get the 3660 * same result in the high half either way. 3661 */ 3662 t1 = tcg_temp_new_vec(type); 3663 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3664 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3665 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3666 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3667 tcgv_vec_arg(t1), 0xaa); 3668 tcg_temp_free_vec(t1); 3669 } else { 3670 /* Otherwise we will need to use a compare vs 0 to produce 3671 * the sign-extend, shift and merge. 3672 */ 3673 t1 = tcg_const_zeros_vec(type); 3674 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1); 3675 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3676 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3677 tcg_gen_or_vec(MO_64, v0, v0, t1); 3678 tcg_temp_free_vec(t1); 3679 } 3680 break; 3681 3682 default: 3683 g_assert_not_reached(); 3684 } 3685} 3686 3687static void expand_vec_rotli(TCGType type, unsigned vece, 3688 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3689{ 3690 TCGv_vec t; 3691 3692 if (vece == MO_8) { 3693 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm); 3694 return; 3695 } 3696 3697 if (have_avx512vbmi2) { 3698 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 3699 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 3700 return; 3701 } 3702 3703 t = tcg_temp_new_vec(type); 3704 tcg_gen_shli_vec(vece, t, v1, imm); 3705 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3706 tcg_gen_or_vec(vece, v0, v0, t); 3707 tcg_temp_free_vec(t); 3708} 3709 3710static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3711 TCGv_vec v1, TCGv_vec sh, bool right) 3712{ 3713 TCGv_vec t; 3714 3715 if (have_avx512vbmi2) { 3716 vec_gen_4(right ? 
INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 3717                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 3718                  tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 3719        return; 3720    } 3721 3722    t = tcg_temp_new_vec(type); 3723    tcg_gen_dupi_vec(vece, t, 8 << vece); 3724    tcg_gen_sub_vec(vece, t, t, sh); 3725    if (right) { 3726        tcg_gen_shlv_vec(vece, t, v1, t); 3727        tcg_gen_shrv_vec(vece, v0, v1, sh); 3728    } else { 3729        tcg_gen_shrv_vec(vece, t, v1, t); 3730        tcg_gen_shlv_vec(vece, v0, v1, sh); 3731    } 3732    tcg_gen_or_vec(vece, v0, v0, t); 3733    tcg_temp_free_vec(t); 3734} 3735 3736static void expand_vec_rotls(TCGType type, unsigned vece, 3737                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3738{ 3739    TCGv_vec t = tcg_temp_new_vec(type); 3740 3741    tcg_debug_assert(vece != MO_8); 3742 3743    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 3744        tcg_gen_dup_i32_vec(vece, t, lsh); 3745        if (vece >= MO_32) { 3746            tcg_gen_rotlv_vec(vece, v0, v1, t); 3747        } else { 3748            expand_vec_rotv(type, vece, v0, v1, t, false); 3749        } 3750    } else { 3751        TCGv_i32 rsh = tcg_temp_new_i32(); 3752 3753        tcg_gen_neg_i32(rsh, lsh); 3754        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3755        tcg_gen_shls_vec(vece, t, v1, lsh); 3756        tcg_gen_shrs_vec(vece, v0, v1, rsh); 3757        tcg_gen_or_vec(vece, v0, v0, t); 3758 3759        tcg_temp_free_i32(rsh); 3760    } 3761 3762    tcg_temp_free_vec(t); 3763} 3764 3765static void expand_vec_mul(TCGType type, unsigned vece, 3766                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3767{ 3768    TCGv_vec t1, t2, t3, t4, zero; 3769 3770    tcg_debug_assert(vece == MO_8); 3771 3772    /* 3773     * Unpack v1 bytes to words, 0 | x. 3774     * Unpack v2 bytes to words, y | 0. 3775     * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3776     * Shift logical right by 8 bits to clear the high 8 bits before 3777     * using an unsigned saturated pack. 3778     * 3779     * The difference between the V64, V128 and V256 cases is merely how 3780     * we distribute the expansion between temporaries.
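     * The V64 case needs only the low-half unpack; V128 and V256 unpack both the low and high halves.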
3781 */ 3782 switch (type) { 3783 case TCG_TYPE_V64: 3784 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 3785 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 3786 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3787 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3788 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3789 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3790 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3791 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3792 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3793 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 3794 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 3795 tcg_temp_free_vec(t1); 3796 tcg_temp_free_vec(t2); 3797 break; 3798 3799 case TCG_TYPE_V128: 3800 case TCG_TYPE_V256: 3801 t1 = tcg_temp_new_vec(type); 3802 t2 = tcg_temp_new_vec(type); 3803 t3 = tcg_temp_new_vec(type); 3804 t4 = tcg_temp_new_vec(type); 3805 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3806 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3807 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3808 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3809 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3810 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3811 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3812 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3813 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3814 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3815 tcg_gen_mul_vec(MO_16, t3, t3, t4); 3816 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3817 tcg_gen_shri_vec(MO_16, t3, t3, 8); 3818 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3819 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 3820 tcg_temp_free_vec(t1); 3821 tcg_temp_free_vec(t2); 3822 tcg_temp_free_vec(t3); 3823 tcg_temp_free_vec(t4); 3824 break; 3825 3826 default: 3827 g_assert_not_reached(); 3828 } 3829} 3830 3831static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, 3832 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3833{ 3834 enum { 3835 NEED_INV = 1, 3836 NEED_SWAP = 2, 3837 NEED_BIAS = 4, 3838 NEED_UMIN = 8, 3839 NEED_UMAX = 16, 3840 }; 3841 TCGv_vec t1, t2, t3; 3842 uint8_t fixup; 3843 3844 switch (cond) { 3845 case TCG_COND_EQ: 3846 case TCG_COND_GT: 3847 fixup = 0; 3848 break; 3849 case TCG_COND_NE: 3850 case TCG_COND_LE: 3851 fixup = NEED_INV; 3852 break; 3853 case TCG_COND_LT: 3854 fixup = NEED_SWAP; 3855 break; 3856 case TCG_COND_GE: 3857 fixup = NEED_SWAP | NEED_INV; 3858 break; 3859 case TCG_COND_LEU: 3860 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3861 fixup = NEED_UMIN; 3862 } else { 3863 fixup = NEED_BIAS | NEED_INV; 3864 } 3865 break; 3866 case TCG_COND_GTU: 3867 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3868 fixup = NEED_UMIN | NEED_INV; 3869 } else { 3870 fixup = NEED_BIAS; 3871 } 3872 break; 3873 case TCG_COND_GEU: 3874 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3875 fixup = NEED_UMAX; 3876 } else { 3877 fixup = NEED_BIAS | NEED_SWAP | NEED_INV; 3878 } 3879 break; 3880 case TCG_COND_LTU: 3881 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3882 fixup = NEED_UMAX | NEED_INV; 3883 } else { 3884 fixup = NEED_BIAS | NEED_SWAP; 3885 } 3886 break; 3887 default: 3888 g_assert_not_reached(); 3889 } 3890 3891 if (fixup & NEED_INV) { 3892 cond = tcg_invert_cond(cond); 3893 } 3894 if (fixup & NEED_SWAP) { 3895 t1 = v1, v1 = v2, v2 = t1; 3896 cond = tcg_swap_cond(cond); 3897 } 3898 3899 t1 = t2 = NULL; 3900 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3901 t1 = 

static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    enum {
        NEED_INV = 1,
        NEED_SWAP = 2,
        NEED_BIAS = 4,
        NEED_UMIN = 8,
        NEED_UMAX = 16,
    };
    TCGv_vec t1, t2, t3;
    uint8_t fixup;

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_GT:
        fixup = 0;
        break;
    case TCG_COND_NE:
    case TCG_COND_LE:
        fixup = NEED_INV;
        break;
    case TCG_COND_LT:
        fixup = NEED_SWAP;
        break;
    case TCG_COND_GE:
        fixup = NEED_SWAP | NEED_INV;
        break;
    case TCG_COND_LEU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN;
        } else {
            fixup = NEED_BIAS | NEED_INV;
        }
        break;
    case TCG_COND_GTU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN | NEED_INV;
        } else {
            fixup = NEED_BIAS;
        }
        break;
    case TCG_COND_GEU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX;
        } else {
            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
        }
        break;
    case TCG_COND_LTU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX | NEED_INV;
        } else {
            fixup = NEED_BIAS | NEED_SWAP;
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }

    t1 = t2 = NULL;
    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        t1 = tcg_temp_new_vec(type);
        if (fixup & NEED_UMIN) {
            tcg_gen_umin_vec(vece, t1, v1, v2);
        } else {
            tcg_gen_umax_vec(vece, t1, v1, v2);
        }
        v2 = t1;
        cond = TCG_COND_EQ;
    } else if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    return fixup & NEED_INV;
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
        tcg_gen_not_vec(vece, v0, v0);
    }
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
                              TCGv_vec c1, TCGv_vec c2,
                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
        /* Invert the sense of the compare by swapping arguments.  */
        TCGv_vec x;
        x = v3, v3 = v4, v4 = x;
    }
    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
              tcgv_vec_arg(v3), tcgv_vec_arg(t));
    tcg_temp_free_vec(t);
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2, v3, v4;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, opc, v0, v1, a2);
        break;

    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    case INDEX_op_cmpsel_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base_seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base_offset = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base_index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
        }
    }
# endif
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
    unsigned a, b, c, d, b7 = 0, c7 = 0;
    unsigned max = __get_cpuid_max(0, 0);

    if (max >= 7) {
        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
        __cpuid_count(7, 0, a, b7, c7, d);
        have_bmi1 = (b7 & bit_BMI) != 0;
        have_bmi2 = (b7 & bit_BMI2) != 0;
    }

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch.  */
        have_cmov = (d & bit_CMOV) != 0;
#endif

        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;

        /* There are a number of things we must check before we can be
           sure of not hitting invalid opcode.  */
        if (c & bit_OSXSAVE) {
            unsigned xcrl, xcrh;
            /* The xgetbv instruction is not available to older versions of
             * the assembler, so we encode the instruction manually.
             */
            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
            if ((xcrl & 6) == 6) {
                have_avx1 = (c & bit_AVX) != 0;
                have_avx2 = (b7 & bit_AVX2) != 0;

                /*
                 * There are interesting instructions in AVX512, so long
                 * as we have AVX512VL, which indicates support for EVEX
                 * on sizes smaller than 512 bits.  We are required to
                 * check that OPMASK and all extended ZMM state are enabled
                 * even if we're not using them -- the insns will fault.
                 */
                if ((xcrl & 0xe0) == 0xe0
                    && (b7 & bit_AVX512F)
                    && (b7 & bit_AVX512VL)) {
                    have_avx512vl = true;
                    have_avx512bw = (b7 & bit_AVX512BW) != 0;
                    have_avx512dq = (b7 & bit_AVX512DQ) != 0;
                    have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
                }
            }
        }
    }

    max = __get_cpuid_max(0x80000000, 0);
    if (max >= 0x80000001) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif