/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept.
*/ 129#define TCG_CT_CONST_S32 0x100 130#define TCG_CT_CONST_U32 0x200 131#define TCG_CT_CONST_I32 0x400 132#define TCG_CT_CONST_WSZ 0x800 133 134/* Registers used with L constraint, which are the first argument 135 registers on x86_64, and two random call clobbered registers on 136 i386. */ 137#if TCG_TARGET_REG_BITS == 64 138# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 139# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 140#else 141# define TCG_REG_L0 TCG_REG_EAX 142# define TCG_REG_L1 TCG_REG_EDX 143#endif 144 145#define ALL_BYTEH_REGS 0x0000000fu 146#if TCG_TARGET_REG_BITS == 64 147# define ALL_GENERAL_REGS 0x0000ffffu 148# define ALL_VECTOR_REGS 0xffff0000u 149# define ALL_BYTEL_REGS ALL_GENERAL_REGS 150#else 151# define ALL_GENERAL_REGS 0x000000ffu 152# define ALL_VECTOR_REGS 0x00ff0000u 153# define ALL_BYTEL_REGS ALL_BYTEH_REGS 154#endif 155#ifdef CONFIG_SOFTMMU 156# define SOFTMMU_RESERVE_REGS ((1 << TCG_REG_L0) | (1 << TCG_REG_L1)) 157#else 158# define SOFTMMU_RESERVE_REGS 0 159#endif 160 161/* The host compiler should supply <cpuid.h> to enable runtime features 162 detection, as we're not going to go so far as our own inline assembly. 163 If not available, default values will be assumed. */ 164#if defined(CONFIG_CPUID_H) 165#include "qemu/cpuid.h" 166#endif 167 168/* For 64-bit, we always know that CMOV is available. */ 169#if TCG_TARGET_REG_BITS == 64 170# define have_cmov 1 171#elif defined(CONFIG_CPUID_H) 172static bool have_cmov; 173#else 174# define have_cmov 0 175#endif 176 177/* We need these symbols in tcg-target.h, and we can't properly conditionalize 178 it there. Therefore we always define the variable. */ 179bool have_bmi1; 180bool have_popcnt; 181bool have_avx1; 182bool have_avx2; 183bool have_avx512bw; 184bool have_avx512dq; 185bool have_avx512vbmi2; 186bool have_avx512vl; 187bool have_movbe; 188bool have_atomic16; 189 190#ifdef CONFIG_CPUID_H 191static bool have_bmi2; 192static bool have_lzcnt; 193#else 194# define have_bmi2 0 195# define have_lzcnt 0 196#endif 197 198static const tcg_insn_unit *tb_ret_addr; 199 200static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 201 intptr_t value, intptr_t addend) 202{ 203 value += addend; 204 switch(type) { 205 case R_386_PC32: 206 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 207 if (value != (int32_t)value) { 208 return false; 209 } 210 /* FALLTHRU */ 211 case R_386_32: 212 tcg_patch32(code_ptr, value); 213 break; 214 case R_386_PC8: 215 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 216 if (value != (int8_t)value) { 217 return false; 218 } 219 tcg_patch8(code_ptr, value); 220 break; 221 default: 222 g_assert_not_reached(); 223 } 224 return true; 225} 226 227/* test if a constant matches the constraint */ 228static bool tcg_target_const_match(int64_t val, TCGType type, int ct) 229{ 230 if (ct & TCG_CT_CONST) { 231 return 1; 232 } 233 if (type == TCG_TYPE_I32) { 234 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) { 235 return 1; 236 } 237 } else { 238 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 239 return 1; 240 } 241 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 242 return 1; 243 } 244 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 245 return 1; 246 } 247 } 248 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 
32 : 64)) { 249 return 1; 250 } 251 return 0; 252} 253 254# define LOWREGMASK(x) ((x) & 7) 255 256#define P_EXT 0x100 /* 0x0f opcode prefix */ 257#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 258#define P_DATA16 0x400 /* 0x66 opcode prefix */ 259#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 260#if TCG_TARGET_REG_BITS == 64 261# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 262# define P_REXB_R 0x2000 /* REG field as byte register */ 263# define P_REXB_RM 0x4000 /* R/M field as byte register */ 264# define P_GS 0x8000 /* gs segment override */ 265#else 266# define P_REXW 0 267# define P_REXB_R 0 268# define P_REXB_RM 0 269# define P_GS 0 270#endif 271#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 272#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 273#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 274#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 275#define P_EVEX 0x100000 /* Requires EVEX encoding */ 276 277#define OPC_ARITH_EvIz (0x81) 278#define OPC_ARITH_EvIb (0x83) 279#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 280#define OPC_ANDN (0xf2 | P_EXT38) 281#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 282#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 283#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 284#define OPC_BSF (0xbc | P_EXT) 285#define OPC_BSR (0xbd | P_EXT) 286#define OPC_BSWAP (0xc8 | P_EXT) 287#define OPC_CALL_Jz (0xe8) 288#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 289#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 290#define OPC_DEC_r32 (0x48) 291#define OPC_IMUL_GvEv (0xaf | P_EXT) 292#define OPC_IMUL_GvEvIb (0x6b) 293#define OPC_IMUL_GvEvIz (0x69) 294#define OPC_INC_r32 (0x40) 295#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 296#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 297#define OPC_JMP_long (0xe9) 298#define OPC_JMP_short (0xeb) 299#define OPC_LEA (0x8d) 300#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 301#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 302#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 303#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 304#define OPC_MOVB_EvIz (0xc6) 305#define OPC_MOVL_EvIz (0xc7) 306#define OPC_MOVL_Iv (0xb8) 307#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 308#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 309#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 310#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 311#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 312#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 313#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 314#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 315#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 316#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 317#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 318#define OPC_MOVSBL (0xbe | P_EXT) 319#define OPC_MOVSWL (0xbf | P_EXT) 320#define OPC_MOVSLQ (0x63 | P_REXW) 321#define OPC_MOVZBL (0xb6 | P_EXT) 322#define OPC_MOVZWL (0xb7 | P_EXT) 323#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 324#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 325#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 326#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 327#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 328#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 329#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 330#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 331#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 332#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 333#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 334#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 335#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 336#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 337#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 338#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 339#define OPC_PAND (0xdb | P_EXT | P_DATA16) 340#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 341#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 342#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 343#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 344#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 345#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 346#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 347#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 348#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 349#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 350#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 351#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 352#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 353#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 354#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 355#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 356#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 357#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 358#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 359#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 360#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 361#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 362#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 363#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 364#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 365#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 366#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 367#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 368#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 369#define OPC_PMOVZXBW 
(0x30 | P_EXT38 | P_DATA16) 370#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 371#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 372#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 373#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 374#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 375#define OPC_POR (0xeb | P_EXT | P_DATA16) 376#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 377#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 378#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 379#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 380#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 381#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 382#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 383#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 384#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 385#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 386#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 387#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 388#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 389#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 390#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 391#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 392#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 393#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 394#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 395#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 396#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 397#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 398#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 399#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 400#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 401#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 402#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 403#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 404#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 405#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 406#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 407#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 408#define OPC_PXOR (0xef | P_EXT | P_DATA16) 409#define OPC_POP_r32 (0x58) 410#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 411#define OPC_PUSH_r32 (0x50) 412#define OPC_PUSH_Iv (0x68) 413#define OPC_PUSH_Ib (0x6a) 414#define OPC_RET (0xc3) 415#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 416#define OPC_SHIFT_1 (0xd1) 417#define OPC_SHIFT_Ib (0xc1) 418#define OPC_SHIFT_cl (0xd3) 419#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 420#define OPC_SHUFPS (0xc6 | P_EXT) 421#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 422#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 423#define OPC_SHRD_Ib (0xac | P_EXT) 424#define OPC_TESTL (0x85) 425#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 426#define OPC_UD2 (0x0b | P_EXT) 427#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 428#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 429#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 430#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 431#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 432#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 433#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 434#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 435#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 436#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 437#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 438#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 439#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 440#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 441#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 442#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 443#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 444#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 445#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 446#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 447#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 448#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 449#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 450#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 451#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 452#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 453#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 454#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 455#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 456#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 457#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 458#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 459#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 460#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW) 461#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 462#define OPC_VZEROUPPER (0x77 | P_EXT) 463#define OPC_XCHG_ax_r32 (0x90) 464#define OPC_XCHG_EvGv (0x87) 465 466#define OPC_GRP3_Eb (0xf6) 467#define OPC_GRP3_Ev (0xf7) 468#define OPC_GRP5 (0xff) 469#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 470 471/* Group 1 opcode extensions for 0x80-0x83. 472 These are also used as modifiers for OPC_ARITH. */ 473#define ARITH_ADD 0 474#define ARITH_OR 1 475#define ARITH_ADC 2 476#define ARITH_SBB 3 477#define ARITH_AND 4 478#define ARITH_SUB 5 479#define ARITH_XOR 6 480#define ARITH_CMP 7 481 482/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 483#define SHIFT_ROL 0 484#define SHIFT_ROR 1 485#define SHIFT_SHL 4 486#define SHIFT_SHR 5 487#define SHIFT_SAR 7 488 489/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. 
*/ 490#define EXT3_TESTi 0 491#define EXT3_NOT 2 492#define EXT3_NEG 3 493#define EXT3_MUL 4 494#define EXT3_IMUL 5 495#define EXT3_DIV 6 496#define EXT3_IDIV 7 497 498/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 499#define EXT5_INC_Ev 0 500#define EXT5_DEC_Ev 1 501#define EXT5_CALLN_Ev 2 502#define EXT5_JMPN_Ev 4 503 504/* Condition codes to be added to OPC_JCC_{long,short}. */ 505#define JCC_JMP (-1) 506#define JCC_JO 0x0 507#define JCC_JNO 0x1 508#define JCC_JB 0x2 509#define JCC_JAE 0x3 510#define JCC_JE 0x4 511#define JCC_JNE 0x5 512#define JCC_JBE 0x6 513#define JCC_JA 0x7 514#define JCC_JS 0x8 515#define JCC_JNS 0x9 516#define JCC_JP 0xa 517#define JCC_JNP 0xb 518#define JCC_JL 0xc 519#define JCC_JGE 0xd 520#define JCC_JLE 0xe 521#define JCC_JG 0xf 522 523static const uint8_t tcg_cond_to_jcc[] = { 524 [TCG_COND_EQ] = JCC_JE, 525 [TCG_COND_NE] = JCC_JNE, 526 [TCG_COND_LT] = JCC_JL, 527 [TCG_COND_GE] = JCC_JGE, 528 [TCG_COND_LE] = JCC_JLE, 529 [TCG_COND_GT] = JCC_JG, 530 [TCG_COND_LTU] = JCC_JB, 531 [TCG_COND_GEU] = JCC_JAE, 532 [TCG_COND_LEU] = JCC_JBE, 533 [TCG_COND_GTU] = JCC_JA, 534}; 535 536#if TCG_TARGET_REG_BITS == 64 537static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 538{ 539 int rex; 540 541 if (opc & P_GS) { 542 tcg_out8(s, 0x65); 543 } 544 if (opc & P_DATA16) { 545 /* We should never be asking for both 16 and 64-bit operation. */ 546 tcg_debug_assert((opc & P_REXW) == 0); 547 tcg_out8(s, 0x66); 548 } 549 if (opc & P_SIMDF3) { 550 tcg_out8(s, 0xf3); 551 } else if (opc & P_SIMDF2) { 552 tcg_out8(s, 0xf2); 553 } 554 555 rex = 0; 556 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 557 rex |= (r & 8) >> 1; /* REX.R */ 558 rex |= (x & 8) >> 2; /* REX.X */ 559 rex |= (rm & 8) >> 3; /* REX.B */ 560 561 /* P_REXB_{R,RM} indicates that the given register is the low byte. 562 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 563 as otherwise the encoding indicates %[abcd]h. Note that the values 564 that are ORed in merely indicate that the REX byte must be present; 565 those bits get discarded in output. */ 566 rex |= opc & (r >= 4 ? P_REXB_R : 0); 567 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 568 569 if (rex) { 570 tcg_out8(s, (uint8_t)(rex | 0x40)); 571 } 572 573 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 574 tcg_out8(s, 0x0f); 575 if (opc & P_EXT38) { 576 tcg_out8(s, 0x38); 577 } else if (opc & P_EXT3A) { 578 tcg_out8(s, 0x3a); 579 } 580 } 581 582 tcg_out8(s, opc); 583} 584#else 585static void tcg_out_opc(TCGContext *s, int opc) 586{ 587 if (opc & P_DATA16) { 588 tcg_out8(s, 0x66); 589 } 590 if (opc & P_SIMDF3) { 591 tcg_out8(s, 0xf3); 592 } else if (opc & P_SIMDF2) { 593 tcg_out8(s, 0xf2); 594 } 595 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 596 tcg_out8(s, 0x0f); 597 if (opc & P_EXT38) { 598 tcg_out8(s, 0x38); 599 } else if (opc & P_EXT3A) { 600 tcg_out8(s, 0x3a); 601 } 602 } 603 tcg_out8(s, opc); 604} 605/* Discard the register arguments to tcg_out_opc early, so as not to penalize 606 the 32-bit compilation paths. This method works with all versions of gcc, 607 whereas relying on optimization may not be able to exclude them. 
*/ 608#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 609#endif 610 611static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 612{ 613 tcg_out_opc(s, opc, r, rm, 0); 614 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 615} 616 617static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 618 int rm, int index) 619{ 620 int tmp; 621 622 /* Use the two byte form if possible, which cannot encode 623 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 624 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 625 && ((rm | index) & 8) == 0) { 626 /* Two byte VEX prefix. */ 627 tcg_out8(s, 0xc5); 628 629 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 630 } else { 631 /* Three byte VEX prefix. */ 632 tcg_out8(s, 0xc4); 633 634 /* VEX.m-mmmm */ 635 if (opc & P_EXT3A) { 636 tmp = 3; 637 } else if (opc & P_EXT38) { 638 tmp = 2; 639 } else if (opc & P_EXT) { 640 tmp = 1; 641 } else { 642 g_assert_not_reached(); 643 } 644 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 645 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 646 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 647 tcg_out8(s, tmp); 648 649 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 650 } 651 652 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 653 /* VEX.pp */ 654 if (opc & P_DATA16) { 655 tmp |= 1; /* 0x66 */ 656 } else if (opc & P_SIMDF3) { 657 tmp |= 2; /* 0xf3 */ 658 } else if (opc & P_SIMDF2) { 659 tmp |= 3; /* 0xf2 */ 660 } 661 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 662 tcg_out8(s, tmp); 663 tcg_out8(s, opc); 664} 665 666static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 667 int rm, int index) 668{ 669 /* The entire 4-byte evex prefix; with R' and V' set. */ 670 uint32_t p = 0x08041062; 671 int mm, pp; 672 673 tcg_debug_assert(have_avx512vl); 674 675 /* EVEX.mm */ 676 if (opc & P_EXT3A) { 677 mm = 3; 678 } else if (opc & P_EXT38) { 679 mm = 2; 680 } else if (opc & P_EXT) { 681 mm = 1; 682 } else { 683 g_assert_not_reached(); 684 } 685 686 /* EVEX.pp */ 687 if (opc & P_DATA16) { 688 pp = 1; /* 0x66 */ 689 } else if (opc & P_SIMDF3) { 690 pp = 2; /* 0xf3 */ 691 } else if (opc & P_SIMDF2) { 692 pp = 3; /* 0xf2 */ 693 } else { 694 pp = 0; 695 } 696 697 p = deposit32(p, 8, 2, mm); 698 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 699 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 700 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 701 p = deposit32(p, 16, 2, pp); 702 p = deposit32(p, 19, 4, ~v); 703 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 704 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 705 706 tcg_out32(s, p); 707 tcg_out8(s, opc); 708} 709 710static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 711{ 712 if (opc & P_EVEX) { 713 tcg_out_evex_opc(s, opc, r, v, rm, 0); 714 } else { 715 tcg_out_vex_opc(s, opc, r, v, rm, 0); 716 } 717 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 718} 719 720/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 721 We handle either RM and INDEX missing with a negative value. In 64-bit 722 mode for absolute addresses, ~RM is the size of the immediate operand 723 that will follow the instruction. */ 724 725static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 726 int shift, intptr_t offset) 727{ 728 int mod, len; 729 730 if (index < 0 && rm < 0) { 731 if (TCG_TARGET_REG_BITS == 64) { 732 /* Try for a rip-relative addressing mode. This has replaced 733 the 32-bit-mode absolute addressing encoding. 
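       (In 64-bit mode, a ModRM byte with mod=00 and rm=101 selects
       disp32(%rip); the legacy absolute-disp32 form instead requires a
       SIB byte with base=101 and index=100, which is what the fallback
       just below emits.)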
*/ 734 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 735 intptr_t disp = offset - pc; 736 if (disp == (int32_t)disp) { 737 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 738 tcg_out32(s, disp); 739 return; 740 } 741 742 /* Try for an absolute address encoding. This requires the 743 use of the MODRM+SIB encoding and is therefore larger than 744 rip-relative addressing. */ 745 if (offset == (int32_t)offset) { 746 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 747 tcg_out8(s, (4 << 3) | 5); 748 tcg_out32(s, offset); 749 return; 750 } 751 752 /* ??? The memory isn't directly addressable. */ 753 g_assert_not_reached(); 754 } else { 755 /* Absolute address. */ 756 tcg_out8(s, (r << 3) | 5); 757 tcg_out32(s, offset); 758 return; 759 } 760 } 761 762 /* Find the length of the immediate addend. Note that the encoding 763 that would be used for (%ebp) indicates absolute addressing. */ 764 if (rm < 0) { 765 mod = 0, len = 4, rm = 5; 766 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 767 mod = 0, len = 0; 768 } else if (offset == (int8_t)offset) { 769 mod = 0x40, len = 1; 770 } else { 771 mod = 0x80, len = 4; 772 } 773 774 /* Use a single byte MODRM format if possible. Note that the encoding 775 that would be used for %esp is the escape to the two byte form. */ 776 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 777 /* Single byte MODRM format. */ 778 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 779 } else { 780 /* Two byte MODRM+SIB format. */ 781 782 /* Note that the encoding that would place %esp into the index 783 field indicates no index register. In 64-bit mode, the REX.X 784 bit counts, so %r12 can be used as the index. */ 785 if (index < 0) { 786 index = 4; 787 } else { 788 tcg_debug_assert(index != TCG_REG_ESP); 789 } 790 791 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 792 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 793 } 794 795 if (len == 1) { 796 tcg_out8(s, offset); 797 } else if (len == 4) { 798 tcg_out32(s, offset); 799 } 800} 801 802static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 803 int index, int shift, intptr_t offset) 804{ 805 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 806 tcg_out_sib_offset(s, r, rm, index, shift, offset); 807} 808 809static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 810 int rm, int index, int shift, 811 intptr_t offset) 812{ 813 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 814 tcg_out_sib_offset(s, r, rm, index, shift, offset); 815} 816 817/* A simplification of the above with no index or shift. */ 818static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 819 int rm, intptr_t offset) 820{ 821 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 822} 823 824static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 825 int v, int rm, intptr_t offset) 826{ 827 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 828} 829 830/* Output an opcode with an expected reference to the constant pool. */ 831static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 832{ 833 tcg_out_opc(s, opc, r, 0, 0); 834 /* Absolute for 32-bit, pc-relative for 64-bit. */ 835 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 836 tcg_out32(s, 0); 837} 838 839/* Output an opcode with an expected reference to the constant pool. */ 840static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 841{ 842 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 843 /* Absolute for 32-bit, pc-relative for 64-bit. 
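       Either way the ModRM byte below is mod=00/rm=101 with a zero
       32-bit displacement, later patched via the constant-pool
       relocation (R_386_32 or R_386_PC32) registered by the caller.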
*/ 844 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 845 tcg_out32(s, 0); 846} 847 848/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 849static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 850{ 851 /* Propagate an opcode prefix, such as P_REXW. */ 852 int ext = subop & ~0x7; 853 subop &= 0x7; 854 855 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 856} 857 858static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 859{ 860 int rexw = 0; 861 862 if (arg == ret) { 863 return true; 864 } 865 switch (type) { 866 case TCG_TYPE_I64: 867 rexw = P_REXW; 868 /* fallthru */ 869 case TCG_TYPE_I32: 870 if (ret < 16) { 871 if (arg < 16) { 872 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 873 } else { 874 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 875 } 876 } else { 877 if (arg < 16) { 878 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 879 } else { 880 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 881 } 882 } 883 break; 884 885 case TCG_TYPE_V64: 886 tcg_debug_assert(ret >= 16 && arg >= 16); 887 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 888 break; 889 case TCG_TYPE_V128: 890 tcg_debug_assert(ret >= 16 && arg >= 16); 891 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 892 break; 893 case TCG_TYPE_V256: 894 tcg_debug_assert(ret >= 16 && arg >= 16); 895 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 896 break; 897 898 default: 899 g_assert_not_reached(); 900 } 901 return true; 902} 903 904static const int avx2_dup_insn[4] = { 905 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 906 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 907}; 908 909static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 910 TCGReg r, TCGReg a) 911{ 912 if (have_avx2) { 913 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 914 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a); 915 } else { 916 switch (vece) { 917 case MO_8: 918 /* ??? With zero in a register, use PSHUFB. */ 919 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 920 a = r; 921 /* FALLTHRU */ 922 case MO_16: 923 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 924 a = r; 925 /* FALLTHRU */ 926 case MO_32: 927 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 928 /* imm8 operand: all output lanes selected from input lane 0. */ 929 tcg_out8(s, 0); 930 break; 931 case MO_64: 932 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 933 break; 934 default: 935 g_assert_not_reached(); 936 } 937 } 938 return true; 939} 940 941static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 942 TCGReg r, TCGReg base, intptr_t offset) 943{ 944 if (have_avx2) { 945 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 946 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 947 r, 0, base, offset); 948 } else { 949 switch (vece) { 950 case MO_64: 951 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 952 break; 953 case MO_32: 954 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 955 break; 956 case MO_16: 957 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 958 tcg_out8(s, 0); /* imm8 */ 959 tcg_out_dup_vec(s, type, vece, r, r); 960 break; 961 case MO_8: 962 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 963 tcg_out8(s, 0); /* imm8 */ 964 tcg_out_dup_vec(s, type, vece, r, r); 965 break; 966 default: 967 g_assert_not_reached(); 968 } 969 } 970 return true; 971} 972 973static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 974 TCGReg ret, int64_t arg) 975{ 976 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 977 978 if (arg == 0) { 979 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 980 return; 981 } 982 if (arg == -1) { 983 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 984 return; 985 } 986 987 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 988 if (have_avx2) { 989 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 990 } else { 991 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 992 } 993 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 994 } else { 995 if (type == TCG_TYPE_V64) { 996 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 997 } else if (have_avx2) { 998 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 999 } else { 1000 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1001 } 1002 if (TCG_TARGET_REG_BITS == 64) { 1003 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1004 } else { 1005 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1006 } 1007 } 1008} 1009 1010static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1011 TCGReg ret, tcg_target_long arg) 1012{ 1013 if (arg == 0) { 1014 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1015 return; 1016 } 1017 if (arg == -1) { 1018 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1019 return; 1020 } 1021 1022 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1023 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1024 if (TCG_TARGET_REG_BITS == 64) { 1025 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1026 } else { 1027 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1028 } 1029} 1030 1031static void tcg_out_movi_int(TCGContext *s, TCGType type, 1032 TCGReg ret, tcg_target_long arg) 1033{ 1034 tcg_target_long diff; 1035 1036 if (arg == 0) { 1037 tgen_arithr(s, ARITH_XOR, ret, ret); 1038 return; 1039 } 1040 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1041 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1042 tcg_out32(s, arg); 1043 return; 1044 } 1045 if (arg == (int32_t)arg) { 1046 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1047 tcg_out32(s, arg); 1048 return; 1049 } 1050 1051 /* Try a 7 byte pc-relative lea before the 10 byte movq. 
 */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need to care
       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
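         * (This is vmovdqu rather than vmovdqa, so a pointer that is
         * 16-byte but not 32-byte aligned is still accepted.)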
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16. */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ?
0 : P_REXW; 1281 /* movsbl */ 1282 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1283 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1284} 1285 1286static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1287{ 1288 /* movzwl */ 1289 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1290} 1291 1292static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1293{ 1294 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1295 /* movsw[lq] */ 1296 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1297} 1298 1299static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1300{ 1301 /* 32-bit mov zero extends. */ 1302 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1303} 1304 1305static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1306{ 1307 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1308 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1309} 1310 1311static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1312{ 1313 tcg_out_ext32s(s, dest, src); 1314} 1315 1316static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1317{ 1318 tcg_out_ext32u(s, dest, src); 1319} 1320 1321static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1322{ 1323 tcg_out_ext32u(s, dest, src); 1324} 1325 1326static inline void tcg_out_bswap64(TCGContext *s, int reg) 1327{ 1328 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1329} 1330 1331static void tgen_arithi(TCGContext *s, int c, int r0, 1332 tcg_target_long val, int cf) 1333{ 1334 int rexw = 0; 1335 1336 if (TCG_TARGET_REG_BITS == 64) { 1337 rexw = c & -8; 1338 c &= 7; 1339 } 1340 1341 /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1342 partial flags update stalls on Pentium4 and are not recommended 1343 by current Intel optimization manuals. */ 1344 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) { 1345 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1346 if (TCG_TARGET_REG_BITS == 64) { 1347 /* The single-byte increment encodings are re-tasked as the 1348 REX prefixes. Use the MODRM encoding. */ 1349 tcg_out_modrm(s, OPC_GRP5 + rexw, 1350 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1351 } else { 1352 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1353 } 1354 return; 1355 } 1356 1357 if (c == ARITH_AND) { 1358 if (TCG_TARGET_REG_BITS == 64) { 1359 if (val == 0xffffffffu) { 1360 tcg_out_ext32u(s, r0, r0); 1361 return; 1362 } 1363 if (val == (uint32_t)val) { 1364 /* AND with no high bits set can use a 32-bit operation. */ 1365 rexw = 0; 1366 } 1367 } 1368 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1369 tcg_out_ext8u(s, r0, r0); 1370 return; 1371 } 1372 if (val == 0xffffu) { 1373 tcg_out_ext16u(s, r0, r0); 1374 return; 1375 } 1376 } 1377 1378 if (val == (int8_t)val) { 1379 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1380 tcg_out8(s, val); 1381 return; 1382 } 1383 if (rexw == 0 || val == (int32_t)val) { 1384 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1385 tcg_out32(s, val); 1386 return; 1387 } 1388 1389 g_assert_not_reached(); 1390} 1391 1392static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1393{ 1394 if (val != 0) { 1395 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1396 } 1397} 1398 1399/* Set SMALL to force a short forward branch. 
*/ 1400static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1401{ 1402 int32_t val, val1; 1403 1404 if (l->has_value) { 1405 val = tcg_pcrel_diff(s, l->u.value_ptr); 1406 val1 = val - 2; 1407 if ((int8_t)val1 == val1) { 1408 if (opc == -1) { 1409 tcg_out8(s, OPC_JMP_short); 1410 } else { 1411 tcg_out8(s, OPC_JCC_short + opc); 1412 } 1413 tcg_out8(s, val1); 1414 } else { 1415 tcg_debug_assert(!small); 1416 if (opc == -1) { 1417 tcg_out8(s, OPC_JMP_long); 1418 tcg_out32(s, val - 5); 1419 } else { 1420 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1421 tcg_out32(s, val - 6); 1422 } 1423 } 1424 } else if (small) { 1425 if (opc == -1) { 1426 tcg_out8(s, OPC_JMP_short); 1427 } else { 1428 tcg_out8(s, OPC_JCC_short + opc); 1429 } 1430 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1431 s->code_ptr += 1; 1432 } else { 1433 if (opc == -1) { 1434 tcg_out8(s, OPC_JMP_long); 1435 } else { 1436 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1437 } 1438 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1439 s->code_ptr += 4; 1440 } 1441} 1442 1443static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2, 1444 int const_arg2, int rexw) 1445{ 1446 if (const_arg2) { 1447 if (arg2 == 0) { 1448 /* test r, r */ 1449 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1450 } else { 1451 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1452 } 1453 } else { 1454 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1455 } 1456} 1457 1458static void tcg_out_brcond32(TCGContext *s, TCGCond cond, 1459 TCGArg arg1, TCGArg arg2, int const_arg2, 1460 TCGLabel *label, int small) 1461{ 1462 tcg_out_cmp(s, arg1, arg2, const_arg2, 0); 1463 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small); 1464} 1465 1466#if TCG_TARGET_REG_BITS == 64 1467static void tcg_out_brcond64(TCGContext *s, TCGCond cond, 1468 TCGArg arg1, TCGArg arg2, int const_arg2, 1469 TCGLabel *label, int small) 1470{ 1471 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW); 1472 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small); 1473} 1474#else 1475/* XXX: we implement it at the target level to avoid having to 1476 handle cross basic blocks temporaries */ 1477static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1478 const int *const_args, int small) 1479{ 1480 TCGLabel *label_next = gen_new_label(); 1481 TCGLabel *label_this = arg_label(args[5]); 1482 1483 switch(args[4]) { 1484 case TCG_COND_EQ: 1485 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2], 1486 label_next, 1); 1487 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3], 1488 label_this, small); 1489 break; 1490 case TCG_COND_NE: 1491 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2], 1492 label_this, small); 1493 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3], 1494 label_this, small); 1495 break; 1496 case TCG_COND_LT: 1497 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3], 1498 label_this, small); 1499 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1500 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2], 1501 label_this, small); 1502 break; 1503 case TCG_COND_LE: 1504 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3], 1505 label_this, small); 1506 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1507 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2], 1508 label_this, small); 1509 break; 1510 case TCG_COND_GT: 1511 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3], 1512 label_this, small); 1513 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1514 tcg_out_brcond32(s, 
TCG_COND_GTU, args[0], args[2], const_args[2], 1515 label_this, small); 1516 break; 1517 case TCG_COND_GE: 1518 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3], 1519 label_this, small); 1520 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1521 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2], 1522 label_this, small); 1523 break; 1524 case TCG_COND_LTU: 1525 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3], 1526 label_this, small); 1527 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1528 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2], 1529 label_this, small); 1530 break; 1531 case TCG_COND_LEU: 1532 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3], 1533 label_this, small); 1534 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1535 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2], 1536 label_this, small); 1537 break; 1538 case TCG_COND_GTU: 1539 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3], 1540 label_this, small); 1541 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1542 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2], 1543 label_this, small); 1544 break; 1545 case TCG_COND_GEU: 1546 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3], 1547 label_this, small); 1548 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1549 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2], 1550 label_this, small); 1551 break; 1552 default: 1553 g_assert_not_reached(); 1554 } 1555 tcg_out_label(s, label_next); 1556} 1557#endif 1558 1559static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest, 1560 TCGArg arg1, TCGArg arg2, int const_arg2) 1561{ 1562 tcg_out_cmp(s, arg1, arg2, const_arg2, 0); 1563 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest); 1564 tcg_out_ext8u(s, dest, dest); 1565} 1566 1567#if TCG_TARGET_REG_BITS == 64 1568static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest, 1569 TCGArg arg1, TCGArg arg2, int const_arg2) 1570{ 1571 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW); 1572 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest); 1573 tcg_out_ext8u(s, dest, dest); 1574} 1575#else 1576static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1577 const int *const_args) 1578{ 1579 TCGArg new_args[6]; 1580 TCGLabel *label_true, *label_over; 1581 1582 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1583 1584 if (args[0] == args[1] || args[0] == args[2] 1585 || (!const_args[3] && args[0] == args[3]) 1586 || (!const_args[4] && args[0] == args[4])) { 1587 /* When the destination overlaps with one of the argument 1588 registers, don't do anything tricky. */ 1589 label_true = gen_new_label(); 1590 label_over = gen_new_label(); 1591 1592 new_args[5] = label_arg(label_true); 1593 tcg_out_brcond2(s, new_args, const_args+1, 1); 1594 1595 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1596 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1597 tcg_out_label(s, label_true); 1598 1599 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1600 tcg_out_label(s, label_over); 1601 } else { 1602 /* When the destination does not overlap one of the arguments, 1603 clear the destination first, jump if cond false, and emit an 1604 increment in the true case. This results in smaller code. 
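       (Compared to the overlapping case above, this avoids the
       unconditional jump over the store of 1 in the true path.)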
*/ 1605 1606 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1607 1608 label_over = gen_new_label(); 1609 new_args[4] = tcg_invert_cond(new_args[4]); 1610 new_args[5] = label_arg(label_over); 1611 tcg_out_brcond2(s, new_args, const_args+1, 1); 1612 1613 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1614 tcg_out_label(s, label_over); 1615 } 1616} 1617#endif 1618 1619static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw, 1620 TCGReg dest, TCGReg v1) 1621{ 1622 if (have_cmov) { 1623 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1); 1624 } else { 1625 TCGLabel *over = gen_new_label(); 1626 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1); 1627 tcg_out_mov(s, TCG_TYPE_I32, dest, v1); 1628 tcg_out_label(s, over); 1629 } 1630} 1631 1632static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest, 1633 TCGReg c1, TCGArg c2, int const_c2, 1634 TCGReg v1) 1635{ 1636 tcg_out_cmp(s, c1, c2, const_c2, 0); 1637 tcg_out_cmov(s, cond, 0, dest, v1); 1638} 1639 1640#if TCG_TARGET_REG_BITS == 64 1641static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest, 1642 TCGReg c1, TCGArg c2, int const_c2, 1643 TCGReg v1) 1644{ 1645 tcg_out_cmp(s, c1, c2, const_c2, P_REXW); 1646 tcg_out_cmov(s, cond, P_REXW, dest, v1); 1647} 1648#endif 1649 1650static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1651 TCGArg arg2, bool const_a2) 1652{ 1653 if (have_bmi1) { 1654 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1); 1655 if (const_a2) { 1656 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1657 } else { 1658 tcg_debug_assert(dest != arg2); 1659 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2); 1660 } 1661 } else { 1662 tcg_debug_assert(dest != arg2); 1663 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1); 1664 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2); 1665 } 1666} 1667 1668static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1669 TCGArg arg2, bool const_a2) 1670{ 1671 if (have_lzcnt) { 1672 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1); 1673 if (const_a2) { 1674 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1675 } else { 1676 tcg_debug_assert(dest != arg2); 1677 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2); 1678 } 1679 } else { 1680 tcg_debug_assert(!const_a2); 1681 tcg_debug_assert(dest != arg1); 1682 tcg_debug_assert(dest != arg2); 1683 1684 /* Recall that the output of BSR is the index not the count. */ 1685 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1); 1686 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0); 1687 1688 /* Since we have destroyed the flags from BSR, we have to re-test. */ 1689 tcg_out_cmp(s, arg1, 0, 1, rexw); 1690 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2); 1691 } 1692} 1693 1694static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1695{ 1696 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1697 1698 if (disp == (int32_t)disp) { 1699 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1700 tcg_out32(s, disp); 1701 } else { 1702 /* rip-relative addressing into the constant pool. 1703 This is 6 + 8 = 14 bytes, as compared to using an 1704 immediate load 10 + 6 = 16 bytes, plus we may 1705 be able to re-use the pool constant for more calls. */ 1706 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1707 tcg_out8(s, (call ? 
EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument on the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
#endif
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
static void __attribute__((unused))
tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
{
    /*
     * This is used for testing alignment, so we can usually use testb.
     * For i686, we have to use testl for %esi/%edi.
     */
    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
        tcg_out8(s, i);
    } else {
        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
        tcg_out32(s, i);
    }
}

typedef struct {
    TCGReg base;
    int index;
    int ofs;
    int seg;
} HostAddress;

bool tcg_target_has_memory_bswap(MemOp memop)
{
    return have_movbe;
}

/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
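 * (RAX is call-clobbered in both the SysV and Win64 ABIs and does not
 * carry any helper argument, so it is free to hold the return address.)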
1792 */ 1793#if TCG_TARGET_REG_BITS == 64 1794static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 1795{ 1796 if (arg < 0) { 1797 arg = TCG_REG_RAX; 1798 } 1799 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 1800 return arg; 1801} 1802static const TCGLdstHelperParam ldst_helper_param = { 1803 .ra_gen = ldst_ra_gen 1804}; 1805#else 1806static const TCGLdstHelperParam ldst_helper_param = { }; 1807#endif 1808 1809/* 1810 * Generate code for the slow path for a load at the end of block 1811 */ 1812static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1813{ 1814 MemOp opc = get_memop(l->oi); 1815 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1816 1817 /* resolve label address */ 1818 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1819 if (label_ptr[1]) { 1820 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1821 } 1822 1823 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 1824 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 1825 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 1826 1827 tcg_out_jmp(s, l->raddr); 1828 return true; 1829} 1830 1831/* 1832 * Generate code for the slow path for a store at the end of block 1833 */ 1834static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1835{ 1836 MemOp opc = get_memop(l->oi); 1837 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1838 1839 /* resolve label address */ 1840 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1841 if (label_ptr[1]) { 1842 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1843 } 1844 1845 tcg_out_st_helper_args(s, l, &ldst_helper_param); 1846 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 1847 1848 tcg_out_jmp(s, l->raddr); 1849 return true; 1850} 1851 1852#ifndef CONFIG_SOFTMMU 1853static HostAddress x86_guest_base = { 1854 .index = -1 1855}; 1856 1857#if defined(__x86_64__) && defined(__linux__) 1858# include <asm/prctl.h> 1859# include <sys/prctl.h> 1860int arch_prctl(int code, unsigned long addr); 1861static inline int setup_guest_base_seg(void) 1862{ 1863 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 1864 return P_GS; 1865 } 1866 return 0; 1867} 1868#elif defined(__x86_64__) && \ 1869 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 1870# include <machine/sysarch.h> 1871static inline int setup_guest_base_seg(void) 1872{ 1873 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 1874 return P_GS; 1875 } 1876 return 0; 1877} 1878#else 1879static inline int setup_guest_base_seg(void) 1880{ 1881 return 0; 1882} 1883#endif /* setup_guest_base_seg */ 1884#endif /* !SOFTMMU */ 1885 1886/* 1887 * For softmmu, perform the TLB load and compare. 1888 * For useronly, perform any required alignment tests. 1889 * In both cases, return a TCGLabelQemuLdst structure if the slow path 1890 * is required and fill in @h with the host address for the fast path. 1891 */ 1892static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 1893 TCGReg addrlo, TCGReg addrhi, 1894 MemOpIdx oi, bool is_ld) 1895{ 1896 TCGLabelQemuLdst *ldst = NULL; 1897 MemOp opc = get_memop(oi); 1898 unsigned a_bits = get_alignment_bits(opc); 1899 unsigned a_mask = (1 << a_bits) - 1; 1900 1901#ifdef CONFIG_SOFTMMU 1902 int cmp_ofs = is_ld ? 
offsetof(CPUTLBEntry, addr_read) 1903 : offsetof(CPUTLBEntry, addr_write); 1904 TCGType ttype = TCG_TYPE_I32; 1905 TCGType tlbtype = TCG_TYPE_I32; 1906 int trexw = 0, hrexw = 0, tlbrexw = 0; 1907 unsigned mem_index = get_mmuidx(oi); 1908 unsigned s_bits = opc & MO_SIZE; 1909 unsigned s_mask = (1 << s_bits) - 1; 1910 target_ulong tlb_mask; 1911 1912 ldst = new_ldst_label(s); 1913 ldst->is_ld = is_ld; 1914 ldst->oi = oi; 1915 ldst->addrlo_reg = addrlo; 1916 ldst->addrhi_reg = addrhi; 1917 1918 if (TCG_TARGET_REG_BITS == 64) { 1919 if (TARGET_LONG_BITS == 64) { 1920 ttype = TCG_TYPE_I64; 1921 trexw = P_REXW; 1922 } 1923 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 1924 hrexw = P_REXW; 1925 if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) { 1926 tlbtype = TCG_TYPE_I64; 1927 tlbrexw = P_REXW; 1928 } 1929 } 1930 } 1931 1932 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 1933 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 1934 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 1935 1936 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 1937 TLB_MASK_TABLE_OFS(mem_index) + 1938 offsetof(CPUTLBDescFast, mask)); 1939 1940 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 1941 TLB_MASK_TABLE_OFS(mem_index) + 1942 offsetof(CPUTLBDescFast, table)); 1943 1944 /* 1945 * If the required alignment is at least as large as the access, simply 1946 * copy the address and mask. For lesser alignments, check that we don't 1947 * cross pages for the complete access. 1948 */ 1949 if (a_bits >= s_bits) { 1950 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 1951 } else { 1952 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 1953 addrlo, s_mask - a_mask); 1954 } 1955 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask; 1956 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 1957 1958 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 1959 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 1960 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 1961 1962 /* jne slow_path */ 1963 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1964 ldst->label_ptr[0] = s->code_ptr; 1965 s->code_ptr += 4; 1966 1967 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1968 /* cmp 4(TCG_REG_L0), addrhi */ 1969 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4); 1970 1971 /* jne slow_path */ 1972 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1973 ldst->label_ptr[1] = s->code_ptr; 1974 s->code_ptr += 4; 1975 } 1976 1977 /* TLB Hit. */ 1978 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 1979 offsetof(CPUTLBEntry, addend)); 1980 1981 *h = (HostAddress) { 1982 .base = addrlo, 1983 .index = TCG_REG_L0, 1984 }; 1985#else 1986 if (a_bits) { 1987 ldst = new_ldst_label(s); 1988 1989 ldst->is_ld = is_ld; 1990 ldst->oi = oi; 1991 ldst->addrlo_reg = addrlo; 1992 ldst->addrhi_reg = addrhi; 1993 1994 tcg_out_testi(s, addrlo, a_mask); 1995 /* jne slow_path */ 1996 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1997 ldst->label_ptr[0] = s->code_ptr; 1998 s->code_ptr += 4; 1999 } 2000 2001 *h = x86_guest_base; 2002 h->base = addrlo; 2003#endif 2004 2005 return ldst; 2006} 2007 2008static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2009 HostAddress h, TCGType type, MemOp memop) 2010{ 2011 bool use_movbe = false; 2012 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2013 int movop = OPC_MOVL_GvEv; 2014 2015 /* Do big-endian loads with movbe. 
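 * For example, a MOVBE load of the memory bytes 0x11 0x22 0x33 0x44 yields 0x11223344 in the destination register, so no separate byte swap is needed.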
*/ 2016 if (memop & MO_BSWAP) { 2017 tcg_debug_assert(have_movbe); 2018 use_movbe = true; 2019 movop = OPC_MOVBE_GyMy; 2020 } 2021 2022 switch (memop & MO_SSIZE) { 2023 case MO_UB: 2024 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2025 h.base, h.index, 0, h.ofs); 2026 break; 2027 case MO_SB: 2028 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2029 h.base, h.index, 0, h.ofs); 2030 break; 2031 case MO_UW: 2032 if (use_movbe) { 2033 /* There is no extending movbe; only low 16-bits are modified. */ 2034 if (datalo != h.base && datalo != h.index) { 2035 /* XOR breaks dependency chains. */ 2036 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2037 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2038 datalo, h.base, h.index, 0, h.ofs); 2039 } else { 2040 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2041 datalo, h.base, h.index, 0, h.ofs); 2042 tcg_out_ext16u(s, datalo, datalo); 2043 } 2044 } else { 2045 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2046 h.base, h.index, 0, h.ofs); 2047 } 2048 break; 2049 case MO_SW: 2050 if (use_movbe) { 2051 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2052 datalo, h.base, h.index, 0, h.ofs); 2053 tcg_out_ext16s(s, type, datalo, datalo); 2054 } else { 2055 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2056 datalo, h.base, h.index, 0, h.ofs); 2057 } 2058 break; 2059 case MO_UL: 2060 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2061 h.base, h.index, 0, h.ofs); 2062 break; 2063#if TCG_TARGET_REG_BITS == 64 2064 case MO_SL: 2065 if (use_movbe) { 2066 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2067 h.base, h.index, 0, h.ofs); 2068 tcg_out_ext32s(s, datalo, datalo); 2069 } else { 2070 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2071 h.base, h.index, 0, h.ofs); 2072 } 2073 break; 2074#endif 2075 case MO_UQ: 2076 if (TCG_TARGET_REG_BITS == 64) { 2077 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2078 h.base, h.index, 0, h.ofs); 2079 break; 2080 } 2081 if (use_movbe) { 2082 TCGReg t = datalo; 2083 datalo = datahi; 2084 datahi = t; 2085 } 2086 if (h.base == datalo || h.index == datalo) { 2087 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2088 h.base, h.index, 0, h.ofs); 2089 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2090 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2091 } else { 2092 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2093 h.base, h.index, 0, h.ofs); 2094 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2095 h.base, h.index, 0, h.ofs + 4); 2096 } 2097 break; 2098 default: 2099 g_assert_not_reached(); 2100 } 2101} 2102 2103static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2104 TCGReg addrlo, TCGReg addrhi, 2105 MemOpIdx oi, TCGType data_type) 2106{ 2107 TCGLabelQemuLdst *ldst; 2108 HostAddress h; 2109 2110 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); 2111 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2112 2113 if (ldst) { 2114 ldst->type = data_type; 2115 ldst->datalo_reg = datalo; 2116 ldst->datahi_reg = datahi; 2117 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2118 } 2119} 2120 2121static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2122 HostAddress h, MemOp memop) 2123{ 2124 bool use_movbe = false; 2125 int movop = OPC_MOVL_EvGv; 2126 2127 /* 2128 * Do big-endian stores with movbe or softmmu. 2129 * User-only without movbe will have its swapping done generically. 
2130 */ 2131 if (memop & MO_BSWAP) { 2132 tcg_debug_assert(have_movbe); 2133 use_movbe = true; 2134 movop = OPC_MOVBE_MyGy; 2135 } 2136 2137 switch (memop & MO_SIZE) { 2138 case MO_8: 2139 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2140 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2141 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2142 datalo, h.base, h.index, 0, h.ofs); 2143 break; 2144 case MO_16: 2145 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2146 h.base, h.index, 0, h.ofs); 2147 break; 2148 case MO_32: 2149 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2150 h.base, h.index, 0, h.ofs); 2151 break; 2152 case MO_64: 2153 if (TCG_TARGET_REG_BITS == 64) { 2154 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2155 h.base, h.index, 0, h.ofs); 2156 } else { 2157 if (use_movbe) { 2158 TCGReg t = datalo; 2159 datalo = datahi; 2160 datahi = t; 2161 } 2162 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2163 h.base, h.index, 0, h.ofs); 2164 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2165 h.base, h.index, 0, h.ofs + 4); 2166 } 2167 break; 2168 default: 2169 g_assert_not_reached(); 2170 } 2171} 2172 2173static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2174 TCGReg addrlo, TCGReg addrhi, 2175 MemOpIdx oi, TCGType data_type) 2176{ 2177 TCGLabelQemuLdst *ldst; 2178 HostAddress h; 2179 2180 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); 2181 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2182 2183 if (ldst) { 2184 ldst->type = data_type; 2185 ldst->datalo_reg = datalo; 2186 ldst->datahi_reg = datahi; 2187 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2188 } 2189} 2190 2191static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2192{ 2193 /* Reuse the zeroing that exists for goto_ptr. */ 2194 if (a0 == 0) { 2195 tcg_out_jmp(s, tcg_code_gen_epilogue); 2196 } else { 2197 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2198 tcg_out_jmp(s, tb_ret_addr); 2199 } 2200} 2201 2202static void tcg_out_goto_tb(TCGContext *s, int which) 2203{ 2204 /* 2205 * Jump displacement must be aligned for atomic patching; 2206 * see if we need to add extra nops before jump 2207 */ 2208 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2209 if (gap != 1) { 2210 tcg_out_nopn(s, gap - 1); 2211 } 2212 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2213 set_jmp_insn_offset(s, which); 2214 tcg_out32(s, 0); 2215 set_jmp_reset_offset(s, which); 2216} 2217 2218void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2219 uintptr_t jmp_rx, uintptr_t jmp_rw) 2220{ 2221 /* patch the branch destination */ 2222 uintptr_t addr = tb->jmp_target_addr[n]; 2223 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2224 /* no need to flush icache explicitly */ 2225} 2226 2227static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2228 const TCGArg args[TCG_MAX_OP_ARGS], 2229 const int const_args[TCG_MAX_OP_ARGS]) 2230{ 2231 TCGArg a0, a1, a2; 2232 int c, const_a2, vexop, rexw = 0; 2233 2234#if TCG_TARGET_REG_BITS == 64 2235# define OP_32_64(x) \ 2236 case glue(glue(INDEX_op_, x), _i64): \ 2237 rexw = P_REXW; /* FALLTHRU */ \ 2238 case glue(glue(INDEX_op_, x), _i32) 2239#else 2240# define OP_32_64(x) \ 2241 case glue(glue(INDEX_op_, x), _i32) 2242#endif 2243 2244 /* Hoist the loads of the most common arguments. 
*/ 2245 a0 = args[0]; 2246 a1 = args[1]; 2247 a2 = args[2]; 2248 const_a2 = const_args[2]; 2249 2250 switch (opc) { 2251 case INDEX_op_goto_ptr: 2252 /* jmp to the given host address (could be epilogue) */ 2253 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2254 break; 2255 case INDEX_op_br: 2256 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2257 break; 2258 OP_32_64(ld8u): 2259 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2260 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2261 break; 2262 OP_32_64(ld8s): 2263 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2264 break; 2265 OP_32_64(ld16u): 2266 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2267 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2268 break; 2269 OP_32_64(ld16s): 2270 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2271 break; 2272#if TCG_TARGET_REG_BITS == 64 2273 case INDEX_op_ld32u_i64: 2274#endif 2275 case INDEX_op_ld_i32: 2276 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2277 break; 2278 2279 OP_32_64(st8): 2280 if (const_args[0]) { 2281 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2282 tcg_out8(s, a0); 2283 } else { 2284 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2285 } 2286 break; 2287 OP_32_64(st16): 2288 if (const_args[0]) { 2289 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2290 tcg_out16(s, a0); 2291 } else { 2292 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2293 } 2294 break; 2295#if TCG_TARGET_REG_BITS == 64 2296 case INDEX_op_st32_i64: 2297#endif 2298 case INDEX_op_st_i32: 2299 if (const_args[0]) { 2300 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2301 tcg_out32(s, a0); 2302 } else { 2303 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2304 } 2305 break; 2306 2307 OP_32_64(add): 2308 /* For 3-operand addition, use LEA. */ 2309 if (a0 != a1) { 2310 TCGArg c3 = 0; 2311 if (const_a2) { 2312 c3 = a2, a2 = -1; 2313 } else if (a0 == a2) { 2314 /* Watch out for dest = src + dest, since we've removed 2315 the matching constraint on the add. */ 2316 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2317 break; 2318 } 2319 2320 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2321 break; 2322 } 2323 c = ARITH_ADD; 2324 goto gen_arith; 2325 OP_32_64(sub): 2326 c = ARITH_SUB; 2327 goto gen_arith; 2328 OP_32_64(and): 2329 c = ARITH_AND; 2330 goto gen_arith; 2331 OP_32_64(or): 2332 c = ARITH_OR; 2333 goto gen_arith; 2334 OP_32_64(xor): 2335 c = ARITH_XOR; 2336 goto gen_arith; 2337 gen_arith: 2338 if (const_a2) { 2339 tgen_arithi(s, c + rexw, a0, a2, 0); 2340 } else { 2341 tgen_arithr(s, c + rexw, a0, a2); 2342 } 2343 break; 2344 2345 OP_32_64(andc): 2346 if (const_a2) { 2347 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2348 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2349 } else { 2350 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2351 } 2352 break; 2353 2354 OP_32_64(mul): 2355 if (const_a2) { 2356 int32_t val; 2357 val = a2; 2358 if (val == (int8_t)val) { 2359 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2360 tcg_out8(s, val); 2361 } else { 2362 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2363 tcg_out32(s, val); 2364 } 2365 } else { 2366 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2367 } 2368 break; 2369 2370 OP_32_64(div2): 2371 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2372 break; 2373 OP_32_64(divu2): 2374 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2375 break; 2376 2377 OP_32_64(shl): 2378 /* For small constant 3-operand shift, use LEA. 
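 * LEA can only scale the index by 2, 4 or 8, so this path covers shift counts 1-3; other counts fall through to the ordinary shift encodings below.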
*/ 2379 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2380 if (a2 - 1 == 0) { 2381 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2382 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2383 } else { 2384 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2385 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2386 } 2387 break; 2388 } 2389 c = SHIFT_SHL; 2390 vexop = OPC_SHLX; 2391 goto gen_shift_maybe_vex; 2392 OP_32_64(shr): 2393 c = SHIFT_SHR; 2394 vexop = OPC_SHRX; 2395 goto gen_shift_maybe_vex; 2396 OP_32_64(sar): 2397 c = SHIFT_SAR; 2398 vexop = OPC_SARX; 2399 goto gen_shift_maybe_vex; 2400 OP_32_64(rotl): 2401 c = SHIFT_ROL; 2402 goto gen_shift; 2403 OP_32_64(rotr): 2404 c = SHIFT_ROR; 2405 goto gen_shift; 2406 gen_shift_maybe_vex: 2407 if (have_bmi2) { 2408 if (!const_a2) { 2409 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2410 break; 2411 } 2412 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2413 } 2414 /* FALLTHRU */ 2415 gen_shift: 2416 if (const_a2) { 2417 tcg_out_shifti(s, c + rexw, a0, a2); 2418 } else { 2419 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2420 } 2421 break; 2422 2423 OP_32_64(ctz): 2424 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2425 break; 2426 OP_32_64(clz): 2427 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2428 break; 2429 OP_32_64(ctpop): 2430 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2431 break; 2432 2433 case INDEX_op_brcond_i32: 2434 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2435 break; 2436 case INDEX_op_setcond_i32: 2437 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2); 2438 break; 2439 case INDEX_op_movcond_i32: 2440 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]); 2441 break; 2442 2443 OP_32_64(bswap16): 2444 if (a2 & TCG_BSWAP_OS) { 2445 /* Output must be sign-extended. */ 2446 if (rexw) { 2447 tcg_out_bswap64(s, a0); 2448 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2449 } else { 2450 tcg_out_bswap32(s, a0); 2451 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2452 } 2453 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2454 /* Output must be zero-extended, but input isn't. 
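 * Worked example: with 0x????aabb in a0, bswap32 leaves 0xbbaa???? and the logical right shift by 16 then produces 0x0000bbaa, i.e. the swapped halfword zero-extended.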
*/ 2455 tcg_out_bswap32(s, a0); 2456 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2457 } else { 2458 tcg_out_rolw_8(s, a0); 2459 } 2460 break; 2461 OP_32_64(bswap32): 2462 tcg_out_bswap32(s, a0); 2463 if (rexw && (a2 & TCG_BSWAP_OS)) { 2464 tcg_out_ext32s(s, a0, a0); 2465 } 2466 break; 2467 2468 OP_32_64(neg): 2469 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2470 break; 2471 OP_32_64(not): 2472 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2473 break; 2474 2475 case INDEX_op_qemu_ld_i32: 2476 if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { 2477 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2478 } else { 2479 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2480 } 2481 break; 2482 case INDEX_op_qemu_ld_i64: 2483 if (TCG_TARGET_REG_BITS == 64) { 2484 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2485 } else if (TARGET_LONG_BITS == 32) { 2486 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2487 } else { 2488 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2489 } 2490 break; 2491 case INDEX_op_qemu_st_i32: 2492 case INDEX_op_qemu_st8_i32: 2493 if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { 2494 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2495 } else { 2496 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2497 } 2498 break; 2499 case INDEX_op_qemu_st_i64: 2500 if (TCG_TARGET_REG_BITS == 64) { 2501 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2502 } else if (TARGET_LONG_BITS == 32) { 2503 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2504 } else { 2505 tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2506 } 2507 break; 2508 2509 OP_32_64(mulu2): 2510 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2511 break; 2512 OP_32_64(muls2): 2513 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2514 break; 2515 OP_32_64(add2): 2516 if (const_args[4]) { 2517 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2518 } else { 2519 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2520 } 2521 if (const_args[5]) { 2522 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2523 } else { 2524 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2525 } 2526 break; 2527 OP_32_64(sub2): 2528 if (const_args[4]) { 2529 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2530 } else { 2531 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2532 } 2533 if (const_args[5]) { 2534 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2535 } else { 2536 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2537 } 2538 break; 2539 2540#if TCG_TARGET_REG_BITS == 32 2541 case INDEX_op_brcond2_i32: 2542 tcg_out_brcond2(s, args, const_args, 0); 2543 break; 2544 case INDEX_op_setcond2_i32: 2545 tcg_out_setcond2(s, args, const_args); 2546 break; 2547#else /* TCG_TARGET_REG_BITS == 64 */ 2548 case INDEX_op_ld32s_i64: 2549 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2550 break; 2551 case INDEX_op_ld_i64: 2552 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2553 break; 2554 case INDEX_op_st_i64: 2555 if (const_args[0]) { 2556 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2557 tcg_out32(s, a0); 2558 } else { 2559 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2560 } 2561 break; 2562 2563 case INDEX_op_brcond_i64: 2564 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2565 break; 2566 case INDEX_op_setcond_i64: 2567 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2); 2568 break; 2569 case INDEX_op_movcond_i64: 2570 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]); 2571 break; 2572 2573 case 
INDEX_op_bswap64_i64: 2574 tcg_out_bswap64(s, a0); 2575 break; 2576 case INDEX_op_extrh_i64_i32: 2577 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2578 break; 2579#endif 2580 2581 OP_32_64(deposit): 2582 if (args[3] == 0 && args[4] == 8) { 2583 /* load bits 0..7 */ 2584 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2585 } else if (args[3] == 8 && args[4] == 8) { 2586 /* load bits 8..15 */ 2587 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2588 } else if (args[3] == 0 && args[4] == 16) { 2589 /* load bits 0..15 */ 2590 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2591 } else { 2592 g_assert_not_reached(); 2593 } 2594 break; 2595 2596 case INDEX_op_extract_i64: 2597 if (a2 + args[3] == 32) { 2598 /* This is a 32-bit zero-extending right shift. */ 2599 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2600 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2601 break; 2602 } 2603 /* FALLTHRU */ 2604 case INDEX_op_extract_i32: 2605 /* On the off-chance that we can use the high-byte registers. 2606 Otherwise we emit the same ext16 + shift pattern that we 2607 would have gotten from the normal tcg-op.c expansion. */ 2608 tcg_debug_assert(a2 == 8 && args[3] == 8); 2609 if (a1 < 4 && a0 < 8) { 2610 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2611 } else { 2612 tcg_out_ext16u(s, a0, a1); 2613 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2614 } 2615 break; 2616 2617 case INDEX_op_sextract_i32: 2618 /* We don't implement sextract_i64, as we cannot sign-extend to 2619 64-bits without using the REX prefix that explicitly excludes 2620 access to the high-byte registers. */ 2621 tcg_debug_assert(a2 == 8 && args[3] == 8); 2622 if (a1 < 4 && a0 < 8) { 2623 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2624 } else { 2625 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 2626 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2627 } 2628 break; 2629 2630 OP_32_64(extract2): 2631 /* Note that SHRD outputs to the r/m operand. */ 2632 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 2633 tcg_out8(s, args[3]); 2634 break; 2635 2636 case INDEX_op_mb: 2637 tcg_out_mb(s, a0); 2638 break; 2639 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 2640 case INDEX_op_mov_i64: 2641 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 2642 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 2643 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 2644 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. 
*/ 2645 case INDEX_op_ext8s_i64: 2646 case INDEX_op_ext8u_i32: 2647 case INDEX_op_ext8u_i64: 2648 case INDEX_op_ext16s_i32: 2649 case INDEX_op_ext16s_i64: 2650 case INDEX_op_ext16u_i32: 2651 case INDEX_op_ext16u_i64: 2652 case INDEX_op_ext32s_i64: 2653 case INDEX_op_ext32u_i64: 2654 case INDEX_op_ext_i32_i64: 2655 case INDEX_op_extu_i32_i64: 2656 case INDEX_op_extrl_i64_i32: 2657 default: 2658 g_assert_not_reached(); 2659 } 2660 2661#undef OP_32_64 2662} 2663 2664static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 2665 unsigned vecl, unsigned vece, 2666 const TCGArg args[TCG_MAX_OP_ARGS], 2667 const int const_args[TCG_MAX_OP_ARGS]) 2668{ 2669 static int const add_insn[4] = { 2670 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 2671 }; 2672 static int const ssadd_insn[4] = { 2673 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 2674 }; 2675 static int const usadd_insn[4] = { 2676 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 2677 }; 2678 static int const sub_insn[4] = { 2679 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 2680 }; 2681 static int const sssub_insn[4] = { 2682 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 2683 }; 2684 static int const ussub_insn[4] = { 2685 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 2686 }; 2687 static int const mul_insn[4] = { 2688 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 2689 }; 2690 static int const shift_imm_insn[4] = { 2691 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 2692 }; 2693 static int const cmpeq_insn[4] = { 2694 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 2695 }; 2696 static int const cmpgt_insn[4] = { 2697 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 2698 }; 2699 static int const punpckl_insn[4] = { 2700 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 2701 }; 2702 static int const punpckh_insn[4] = { 2703 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 2704 }; 2705 static int const packss_insn[4] = { 2706 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 2707 }; 2708 static int const packus_insn[4] = { 2709 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 2710 }; 2711 static int const smin_insn[4] = { 2712 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 2713 }; 2714 static int const smax_insn[4] = { 2715 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 2716 }; 2717 static int const umin_insn[4] = { 2718 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 2719 }; 2720 static int const umax_insn[4] = { 2721 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 2722 }; 2723 static int const rotlv_insn[4] = { 2724 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 2725 }; 2726 static int const rotrv_insn[4] = { 2727 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 2728 }; 2729 static int const shlv_insn[4] = { 2730 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 2731 }; 2732 static int const shrv_insn[4] = { 2733 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 2734 }; 2735 static int const sarv_insn[4] = { 2736 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 2737 }; 2738 static int const shls_insn[4] = { 2739 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 2740 }; 2741 static int const shrs_insn[4] = { 2742 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 2743 }; 2744 static int const sars_insn[4] = { 2745 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 2746 }; 2747 static int const vpshldi_insn[4] = { 2748 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 2749 }; 2750 static int const vpshldv_insn[4] = { 2751 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 2752 }; 2753 static int const vpshrdv_insn[4] = { 2754 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 2755 }; 2756 static 
int const abs_insn[4] = { 2757 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 2758 }; 2759 2760 TCGType type = vecl + TCG_TYPE_V64; 2761 int insn, sub; 2762 TCGArg a0, a1, a2, a3; 2763 2764 a0 = args[0]; 2765 a1 = args[1]; 2766 a2 = args[2]; 2767 2768 switch (opc) { 2769 case INDEX_op_add_vec: 2770 insn = add_insn[vece]; 2771 goto gen_simd; 2772 case INDEX_op_ssadd_vec: 2773 insn = ssadd_insn[vece]; 2774 goto gen_simd; 2775 case INDEX_op_usadd_vec: 2776 insn = usadd_insn[vece]; 2777 goto gen_simd; 2778 case INDEX_op_sub_vec: 2779 insn = sub_insn[vece]; 2780 goto gen_simd; 2781 case INDEX_op_sssub_vec: 2782 insn = sssub_insn[vece]; 2783 goto gen_simd; 2784 case INDEX_op_ussub_vec: 2785 insn = ussub_insn[vece]; 2786 goto gen_simd; 2787 case INDEX_op_mul_vec: 2788 insn = mul_insn[vece]; 2789 goto gen_simd; 2790 case INDEX_op_and_vec: 2791 insn = OPC_PAND; 2792 goto gen_simd; 2793 case INDEX_op_or_vec: 2794 insn = OPC_POR; 2795 goto gen_simd; 2796 case INDEX_op_xor_vec: 2797 insn = OPC_PXOR; 2798 goto gen_simd; 2799 case INDEX_op_smin_vec: 2800 insn = smin_insn[vece]; 2801 goto gen_simd; 2802 case INDEX_op_umin_vec: 2803 insn = umin_insn[vece]; 2804 goto gen_simd; 2805 case INDEX_op_smax_vec: 2806 insn = smax_insn[vece]; 2807 goto gen_simd; 2808 case INDEX_op_umax_vec: 2809 insn = umax_insn[vece]; 2810 goto gen_simd; 2811 case INDEX_op_shlv_vec: 2812 insn = shlv_insn[vece]; 2813 goto gen_simd; 2814 case INDEX_op_shrv_vec: 2815 insn = shrv_insn[vece]; 2816 goto gen_simd; 2817 case INDEX_op_sarv_vec: 2818 insn = sarv_insn[vece]; 2819 goto gen_simd; 2820 case INDEX_op_rotlv_vec: 2821 insn = rotlv_insn[vece]; 2822 goto gen_simd; 2823 case INDEX_op_rotrv_vec: 2824 insn = rotrv_insn[vece]; 2825 goto gen_simd; 2826 case INDEX_op_shls_vec: 2827 insn = shls_insn[vece]; 2828 goto gen_simd; 2829 case INDEX_op_shrs_vec: 2830 insn = shrs_insn[vece]; 2831 goto gen_simd; 2832 case INDEX_op_sars_vec: 2833 insn = sars_insn[vece]; 2834 goto gen_simd; 2835 case INDEX_op_x86_punpckl_vec: 2836 insn = punpckl_insn[vece]; 2837 goto gen_simd; 2838 case INDEX_op_x86_punpckh_vec: 2839 insn = punpckh_insn[vece]; 2840 goto gen_simd; 2841 case INDEX_op_x86_packss_vec: 2842 insn = packss_insn[vece]; 2843 goto gen_simd; 2844 case INDEX_op_x86_packus_vec: 2845 insn = packus_insn[vece]; 2846 goto gen_simd; 2847 case INDEX_op_x86_vpshldv_vec: 2848 insn = vpshldv_insn[vece]; 2849 a1 = a2; 2850 a2 = args[3]; 2851 goto gen_simd; 2852 case INDEX_op_x86_vpshrdv_vec: 2853 insn = vpshrdv_insn[vece]; 2854 a1 = a2; 2855 a2 = args[3]; 2856 goto gen_simd; 2857#if TCG_TARGET_REG_BITS == 32 2858 case INDEX_op_dup2_vec: 2859 /* First merge the two 32-bit inputs to a single 64-bit element. */ 2860 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 2861 /* Then replicate the 64-bit elements across the rest of the vector. 
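 * (Replication is only needed for V128 and V256; a V64 vector is just that single 64-bit element.)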
*/ 2862 if (type != TCG_TYPE_V64) { 2863 tcg_out_dup_vec(s, type, MO_64, a0, a0); 2864 } 2865 break; 2866#endif 2867 case INDEX_op_abs_vec: 2868 insn = abs_insn[vece]; 2869 a2 = a1; 2870 a1 = 0; 2871 goto gen_simd; 2872 gen_simd: 2873 tcg_debug_assert(insn != OPC_UD2); 2874 if (type == TCG_TYPE_V256) { 2875 insn |= P_VEXL; 2876 } 2877 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2878 break; 2879 2880 case INDEX_op_cmp_vec: 2881 sub = args[3]; 2882 if (sub == TCG_COND_EQ) { 2883 insn = cmpeq_insn[vece]; 2884 } else if (sub == TCG_COND_GT) { 2885 insn = cmpgt_insn[vece]; 2886 } else { 2887 g_assert_not_reached(); 2888 } 2889 goto gen_simd; 2890 2891 case INDEX_op_andc_vec: 2892 insn = OPC_PANDN; 2893 if (type == TCG_TYPE_V256) { 2894 insn |= P_VEXL; 2895 } 2896 tcg_out_vex_modrm(s, insn, a0, a2, a1); 2897 break; 2898 2899 case INDEX_op_shli_vec: 2900 insn = shift_imm_insn[vece]; 2901 sub = 6; 2902 goto gen_shift; 2903 case INDEX_op_shri_vec: 2904 insn = shift_imm_insn[vece]; 2905 sub = 2; 2906 goto gen_shift; 2907 case INDEX_op_sari_vec: 2908 if (vece == MO_64) { 2909 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 2910 } else { 2911 insn = shift_imm_insn[vece]; 2912 } 2913 sub = 4; 2914 goto gen_shift; 2915 case INDEX_op_rotli_vec: 2916 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 2917 if (vece == MO_64) { 2918 insn |= P_VEXW; 2919 } 2920 sub = 1; 2921 goto gen_shift; 2922 gen_shift: 2923 tcg_debug_assert(vece != MO_8); 2924 if (type == TCG_TYPE_V256) { 2925 insn |= P_VEXL; 2926 } 2927 tcg_out_vex_modrm(s, insn, sub, a0, a1); 2928 tcg_out8(s, a2); 2929 break; 2930 2931 case INDEX_op_ld_vec: 2932 tcg_out_ld(s, type, a0, a1, a2); 2933 break; 2934 case INDEX_op_st_vec: 2935 tcg_out_st(s, type, a0, a1, a2); 2936 break; 2937 case INDEX_op_dupm_vec: 2938 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 2939 break; 2940 2941 case INDEX_op_x86_shufps_vec: 2942 insn = OPC_SHUFPS; 2943 sub = args[3]; 2944 goto gen_simd_imm8; 2945 case INDEX_op_x86_blend_vec: 2946 if (vece == MO_16) { 2947 insn = OPC_PBLENDW; 2948 } else if (vece == MO_32) { 2949 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 2950 } else { 2951 g_assert_not_reached(); 2952 } 2953 sub = args[3]; 2954 goto gen_simd_imm8; 2955 case INDEX_op_x86_vperm2i128_vec: 2956 insn = OPC_VPERM2I128; 2957 sub = args[3]; 2958 goto gen_simd_imm8; 2959 case INDEX_op_x86_vpshldi_vec: 2960 insn = vpshldi_insn[vece]; 2961 sub = args[3]; 2962 goto gen_simd_imm8; 2963 2964 case INDEX_op_not_vec: 2965 insn = OPC_VPTERNLOGQ; 2966 a2 = a1; 2967 sub = 0x33; /* !B */ 2968 goto gen_simd_imm8; 2969 case INDEX_op_nor_vec: 2970 insn = OPC_VPTERNLOGQ; 2971 sub = 0x11; /* norCB */ 2972 goto gen_simd_imm8; 2973 case INDEX_op_nand_vec: 2974 insn = OPC_VPTERNLOGQ; 2975 sub = 0x77; /* nandCB */ 2976 goto gen_simd_imm8; 2977 case INDEX_op_eqv_vec: 2978 insn = OPC_VPTERNLOGQ; 2979 sub = 0x99; /* xnorCB */ 2980 goto gen_simd_imm8; 2981 case INDEX_op_orc_vec: 2982 insn = OPC_VPTERNLOGQ; 2983 sub = 0xdd; /* orB!C */ 2984 goto gen_simd_imm8; 2985 2986 case INDEX_op_bitsel_vec: 2987 insn = OPC_VPTERNLOGQ; 2988 a3 = args[3]; 2989 if (a0 == a1) { 2990 a1 = a2; 2991 a2 = a3; 2992 sub = 0xca; /* A?B:C */ 2993 } else if (a0 == a2) { 2994 a2 = a3; 2995 sub = 0xe2; /* B?A:C */ 2996 } else { 2997 tcg_out_mov(s, type, a0, a3); 2998 sub = 0xb8; /* B?C:A */ 2999 } 3000 goto gen_simd_imm8; 3001 3002 gen_simd_imm8: 3003 tcg_debug_assert(insn != OPC_UD2); 3004 if (type == TCG_TYPE_V256) { 3005 insn |= P_VEXL; 3006 } 3007 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3008 tcg_out8(s, sub); 3009 break; 3010 3011 case INDEX_op_x86_vpblendvb_vec: 3012 insn = OPC_VPBLENDVB; 3013 if (type == TCG_TYPE_V256) { 3014 insn |= P_VEXL; 3015 } 3016 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3017 tcg_out8(s, args[3] << 4); 3018 break; 3019 3020 case INDEX_op_x86_psrldq_vec: 3021 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3022 tcg_out8(s, a2); 3023 break; 3024 3025 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3026 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 3027 default: 3028 g_assert_not_reached(); 3029 } 3030} 3031 3032static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3033{ 3034 switch (op) { 3035 case INDEX_op_goto_ptr: 3036 return C_O0_I1(r); 3037 3038 case INDEX_op_ld8u_i32: 3039 case INDEX_op_ld8u_i64: 3040 case INDEX_op_ld8s_i32: 3041 case INDEX_op_ld8s_i64: 3042 case INDEX_op_ld16u_i32: 3043 case INDEX_op_ld16u_i64: 3044 case INDEX_op_ld16s_i32: 3045 case INDEX_op_ld16s_i64: 3046 case INDEX_op_ld_i32: 3047 case INDEX_op_ld32u_i64: 3048 case INDEX_op_ld32s_i64: 3049 case INDEX_op_ld_i64: 3050 return C_O1_I1(r, r); 3051 3052 case INDEX_op_st8_i32: 3053 case INDEX_op_st8_i64: 3054 return C_O0_I2(qi, r); 3055 3056 case INDEX_op_st16_i32: 3057 case INDEX_op_st16_i64: 3058 case INDEX_op_st_i32: 3059 case INDEX_op_st32_i64: 3060 return C_O0_I2(ri, r); 3061 3062 case INDEX_op_st_i64: 3063 return C_O0_I2(re, r); 3064 3065 case INDEX_op_add_i32: 3066 case INDEX_op_add_i64: 3067 return C_O1_I2(r, r, re); 3068 3069 case INDEX_op_sub_i32: 3070 case INDEX_op_sub_i64: 3071 case INDEX_op_mul_i32: 3072 case INDEX_op_mul_i64: 3073 case INDEX_op_or_i32: 3074 case INDEX_op_or_i64: 3075 case INDEX_op_xor_i32: 3076 case INDEX_op_xor_i64: 3077 return C_O1_I2(r, 0, re); 3078 3079 case INDEX_op_and_i32: 3080 case INDEX_op_and_i64: 3081 return C_O1_I2(r, 0, reZ); 3082 3083 case INDEX_op_andc_i32: 3084 case INDEX_op_andc_i64: 3085 return C_O1_I2(r, r, rI); 3086 3087 case INDEX_op_shl_i32: 3088 case INDEX_op_shl_i64: 3089 case INDEX_op_shr_i32: 3090 case INDEX_op_shr_i64: 3091 case INDEX_op_sar_i32: 3092 case INDEX_op_sar_i64: 3093 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3094 3095 case INDEX_op_rotl_i32: 3096 case INDEX_op_rotl_i64: 3097 case INDEX_op_rotr_i32: 3098 case INDEX_op_rotr_i64: 3099 return C_O1_I2(r, 0, ci); 3100 3101 case INDEX_op_brcond_i32: 3102 case INDEX_op_brcond_i64: 3103 return C_O0_I2(r, re); 3104 3105 case INDEX_op_bswap16_i32: 3106 case INDEX_op_bswap16_i64: 3107 case INDEX_op_bswap32_i32: 3108 case INDEX_op_bswap32_i64: 3109 case INDEX_op_bswap64_i64: 3110 case INDEX_op_neg_i32: 3111 case INDEX_op_neg_i64: 3112 case INDEX_op_not_i32: 3113 case INDEX_op_not_i64: 3114 case INDEX_op_extrh_i64_i32: 3115 return C_O1_I1(r, 0); 3116 3117 case INDEX_op_ext8s_i32: 3118 case INDEX_op_ext8s_i64: 3119 case INDEX_op_ext8u_i32: 3120 case INDEX_op_ext8u_i64: 3121 return C_O1_I1(r, q); 3122 3123 case INDEX_op_ext16s_i32: 3124 case INDEX_op_ext16s_i64: 3125 case INDEX_op_ext16u_i32: 3126 case INDEX_op_ext16u_i64: 3127 case INDEX_op_ext32s_i64: 3128 case INDEX_op_ext32u_i64: 3129 case INDEX_op_ext_i32_i64: 3130 case INDEX_op_extu_i32_i64: 3131 case INDEX_op_extrl_i64_i32: 3132 case INDEX_op_extract_i32: 3133 case INDEX_op_extract_i64: 3134 case INDEX_op_sextract_i32: 3135 case INDEX_op_ctpop_i32: 3136 case INDEX_op_ctpop_i64: 3137 return C_O1_I1(r, r); 3138 3139 case INDEX_op_extract2_i32: 3140 case INDEX_op_extract2_i64: 3141 return C_O1_I2(r, 0, r); 3142 3143 case INDEX_op_deposit_i32: 3144 case INDEX_op_deposit_i64: 3145 return C_O1_I2(Q, 0, Q); 3146 3147 case INDEX_op_setcond_i32: 3148 case INDEX_op_setcond_i64: 3149 return C_O1_I2(q, r, re); 3150 3151 case INDEX_op_movcond_i32: 3152 case INDEX_op_movcond_i64: 3153 return C_O1_I4(r, r, re, r, 0); 3154 3155 case INDEX_op_div2_i32: 3156 case INDEX_op_div2_i64: 3157 case INDEX_op_divu2_i32: 3158 case INDEX_op_divu2_i64: 3159 return C_O2_I3(a, d, 0, 1, r); 3160 3161 case INDEX_op_mulu2_i32: 3162 case INDEX_op_mulu2_i64: 3163 case INDEX_op_muls2_i32: 3164 case 
INDEX_op_muls2_i64: 3165 return C_O2_I2(a, d, a, r); 3166 3167 case INDEX_op_add2_i32: 3168 case INDEX_op_add2_i64: 3169 case INDEX_op_sub2_i32: 3170 case INDEX_op_sub2_i64: 3171 return C_O2_I4(r, r, 0, 1, re, re); 3172 3173 case INDEX_op_ctz_i32: 3174 case INDEX_op_ctz_i64: 3175 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3176 3177 case INDEX_op_clz_i32: 3178 case INDEX_op_clz_i64: 3179 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3180 3181 case INDEX_op_qemu_ld_i32: 3182 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS 3183 ? C_O1_I1(r, L) : C_O1_I2(r, L, L)); 3184 3185 case INDEX_op_qemu_st_i32: 3186 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS 3187 ? C_O0_I2(L, L) : C_O0_I3(L, L, L)); 3188 case INDEX_op_qemu_st8_i32: 3189 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS 3190 ? C_O0_I2(s, L) : C_O0_I3(s, L, L)); 3191 3192 case INDEX_op_qemu_ld_i64: 3193 return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) 3194 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L) 3195 : C_O2_I2(r, r, L, L)); 3196 3197 case INDEX_op_qemu_st_i64: 3198 return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) 3199 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L) 3200 : C_O0_I4(L, L, L, L)); 3201 3202 case INDEX_op_brcond2_i32: 3203 return C_O0_I4(r, r, ri, ri); 3204 3205 case INDEX_op_setcond2_i32: 3206 return C_O1_I4(r, r, r, ri, ri); 3207 3208 case INDEX_op_ld_vec: 3209 case INDEX_op_dupm_vec: 3210 return C_O1_I1(x, r); 3211 3212 case INDEX_op_st_vec: 3213 return C_O0_I2(x, r); 3214 3215 case INDEX_op_add_vec: 3216 case INDEX_op_sub_vec: 3217 case INDEX_op_mul_vec: 3218 case INDEX_op_and_vec: 3219 case INDEX_op_or_vec: 3220 case INDEX_op_xor_vec: 3221 case INDEX_op_andc_vec: 3222 case INDEX_op_orc_vec: 3223 case INDEX_op_nand_vec: 3224 case INDEX_op_nor_vec: 3225 case INDEX_op_eqv_vec: 3226 case INDEX_op_ssadd_vec: 3227 case INDEX_op_usadd_vec: 3228 case INDEX_op_sssub_vec: 3229 case INDEX_op_ussub_vec: 3230 case INDEX_op_smin_vec: 3231 case INDEX_op_umin_vec: 3232 case INDEX_op_smax_vec: 3233 case INDEX_op_umax_vec: 3234 case INDEX_op_shlv_vec: 3235 case INDEX_op_shrv_vec: 3236 case INDEX_op_sarv_vec: 3237 case INDEX_op_rotlv_vec: 3238 case INDEX_op_rotrv_vec: 3239 case INDEX_op_shls_vec: 3240 case INDEX_op_shrs_vec: 3241 case INDEX_op_sars_vec: 3242 case INDEX_op_cmp_vec: 3243 case INDEX_op_x86_shufps_vec: 3244 case INDEX_op_x86_blend_vec: 3245 case INDEX_op_x86_packss_vec: 3246 case INDEX_op_x86_packus_vec: 3247 case INDEX_op_x86_vperm2i128_vec: 3248 case INDEX_op_x86_punpckl_vec: 3249 case INDEX_op_x86_punpckh_vec: 3250 case INDEX_op_x86_vpshldi_vec: 3251#if TCG_TARGET_REG_BITS == 32 3252 case INDEX_op_dup2_vec: 3253#endif 3254 return C_O1_I2(x, x, x); 3255 3256 case INDEX_op_abs_vec: 3257 case INDEX_op_dup_vec: 3258 case INDEX_op_not_vec: 3259 case INDEX_op_shli_vec: 3260 case INDEX_op_shri_vec: 3261 case INDEX_op_sari_vec: 3262 case INDEX_op_rotli_vec: 3263 case INDEX_op_x86_psrldq_vec: 3264 return C_O1_I1(x, x); 3265 3266 case INDEX_op_x86_vpshldv_vec: 3267 case INDEX_op_x86_vpshrdv_vec: 3268 return C_O1_I3(x, 0, x, x); 3269 3270 case INDEX_op_bitsel_vec: 3271 case INDEX_op_x86_vpblendvb_vec: 3272 return C_O1_I3(x, x, x, x); 3273 3274 default: 3275 g_assert_not_reached(); 3276 } 3277} 3278 3279int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3280{ 3281 switch (opc) { 3282 case INDEX_op_add_vec: 3283 case INDEX_op_sub_vec: 3284 case INDEX_op_and_vec: 3285 case INDEX_op_or_vec: 3286 case INDEX_op_xor_vec: 3287 case INDEX_op_andc_vec: 3288 case 
INDEX_op_orc_vec: 3289 case INDEX_op_nand_vec: 3290 case INDEX_op_nor_vec: 3291 case INDEX_op_eqv_vec: 3292 case INDEX_op_not_vec: 3293 case INDEX_op_bitsel_vec: 3294 return 1; 3295 case INDEX_op_cmp_vec: 3296 case INDEX_op_cmpsel_vec: 3297 return -1; 3298 3299 case INDEX_op_rotli_vec: 3300 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3301 3302 case INDEX_op_shli_vec: 3303 case INDEX_op_shri_vec: 3304 /* We must expand the operation for MO_8. */ 3305 return vece == MO_8 ? -1 : 1; 3306 3307 case INDEX_op_sari_vec: 3308 switch (vece) { 3309 case MO_8: 3310 return -1; 3311 case MO_16: 3312 case MO_32: 3313 return 1; 3314 case MO_64: 3315 if (have_avx512vl) { 3316 return 1; 3317 } 3318 /* 3319 * We can emulate this for MO_64, but it does not pay off 3320 * unless we're producing at least 4 values. 3321 */ 3322 return type >= TCG_TYPE_V256 ? -1 : 0; 3323 } 3324 return 0; 3325 3326 case INDEX_op_shls_vec: 3327 case INDEX_op_shrs_vec: 3328 return vece >= MO_16; 3329 case INDEX_op_sars_vec: 3330 switch (vece) { 3331 case MO_16: 3332 case MO_32: 3333 return 1; 3334 case MO_64: 3335 return have_avx512vl; 3336 } 3337 return 0; 3338 case INDEX_op_rotls_vec: 3339 return vece >= MO_16 ? -1 : 0; 3340 3341 case INDEX_op_shlv_vec: 3342 case INDEX_op_shrv_vec: 3343 switch (vece) { 3344 case MO_16: 3345 return have_avx512bw; 3346 case MO_32: 3347 case MO_64: 3348 return have_avx2; 3349 } 3350 return 0; 3351 case INDEX_op_sarv_vec: 3352 switch (vece) { 3353 case MO_16: 3354 return have_avx512bw; 3355 case MO_32: 3356 return have_avx2; 3357 case MO_64: 3358 return have_avx512vl; 3359 } 3360 return 0; 3361 case INDEX_op_rotlv_vec: 3362 case INDEX_op_rotrv_vec: 3363 switch (vece) { 3364 case MO_16: 3365 return have_avx512vbmi2 ? -1 : 0; 3366 case MO_32: 3367 case MO_64: 3368 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 3369 } 3370 return 0; 3371 3372 case INDEX_op_mul_vec: 3373 switch (vece) { 3374 case MO_8: 3375 return -1; 3376 case MO_64: 3377 return have_avx512dq; 3378 } 3379 return 1; 3380 3381 case INDEX_op_ssadd_vec: 3382 case INDEX_op_usadd_vec: 3383 case INDEX_op_sssub_vec: 3384 case INDEX_op_ussub_vec: 3385 return vece <= MO_16; 3386 case INDEX_op_smin_vec: 3387 case INDEX_op_smax_vec: 3388 case INDEX_op_umin_vec: 3389 case INDEX_op_umax_vec: 3390 case INDEX_op_abs_vec: 3391 return vece <= MO_32 || have_avx512vl; 3392 3393 default: 3394 return 0; 3395 } 3396} 3397 3398static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc, 3399 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3400{ 3401 TCGv_vec t1, t2; 3402 3403 tcg_debug_assert(vece == MO_8); 3404 3405 t1 = tcg_temp_new_vec(type); 3406 t2 = tcg_temp_new_vec(type); 3407 3408 /* 3409 * Unpack to W, shift, and repack. Tricky bits: 3410 * (1) Use punpck*bw x,x to produce DDCCBBAA, 3411 * i.e. duplicate in other half of the 16-bit lane. 3412 * (2) For right-shift, add 8 so that the high half of the lane 3413 * becomes zero. For left-shift, and left-rotate, we must 3414 * shift up and down again. 3415 * (3) Step 2 leaves high half zero such that PACKUSWB 3416 * (pack with unsigned saturation) does not modify 3417 * the quantity. 
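 * Worked example, shri by 1 on the byte 0x80: the unpack gives the 16-bit lane 0x8080, shifting right by 1 + 8 = 9 leaves 0x0040, and PACKUSWB then produces the expected byte 0x40.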
3418 */ 3419 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3420 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3421 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3422 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3423 3424 if (opc != INDEX_op_rotli_vec) { 3425 imm += 8; 3426 } 3427 if (opc == INDEX_op_shri_vec) { 3428 tcg_gen_shri_vec(MO_16, t1, t1, imm); 3429 tcg_gen_shri_vec(MO_16, t2, t2, imm); 3430 } else { 3431 tcg_gen_shli_vec(MO_16, t1, t1, imm); 3432 tcg_gen_shli_vec(MO_16, t2, t2, imm); 3433 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3434 tcg_gen_shri_vec(MO_16, t2, t2, 8); 3435 } 3436 3437 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3438 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3439 tcg_temp_free_vec(t1); 3440 tcg_temp_free_vec(t2); 3441} 3442 3443static void expand_vec_sari(TCGType type, unsigned vece, 3444 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3445{ 3446 TCGv_vec t1, t2; 3447 3448 switch (vece) { 3449 case MO_8: 3450 /* Unpack to W, shift, and repack, as in expand_vec_shi. */ 3451 t1 = tcg_temp_new_vec(type); 3452 t2 = tcg_temp_new_vec(type); 3453 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3454 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3455 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3456 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3457 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3458 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3459 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3460 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3461 tcg_temp_free_vec(t1); 3462 tcg_temp_free_vec(t2); 3463 break; 3464 3465 case MO_64: 3466 t1 = tcg_temp_new_vec(type); 3467 if (imm <= 32) { 3468 /* 3469 * We can emulate a small sign extend by performing an arithmetic 3470 * 32-bit shift and overwriting the high half of a 64-bit logical 3471 * shift. Note that the ISA says shift of 32 is valid, but TCG 3472 * does not, so we have to bound the smaller shift -- we get the 3473 * same result in the high half either way. 3474 */ 3475 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3476 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3477 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3478 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3479 tcgv_vec_arg(t1), 0xaa); 3480 } else { 3481 /* Otherwise we will need to use a compare vs 0 to produce 3482 * the sign-extend, shift and merge. 3483 */ 3484 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 3485 tcg_constant_vec(type, MO_64, 0), v1); 3486 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3487 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3488 tcg_gen_or_vec(MO_64, v0, v0, t1); 3489 } 3490 tcg_temp_free_vec(t1); 3491 break; 3492 3493 default: 3494 g_assert_not_reached(); 3495 } 3496} 3497 3498static void expand_vec_rotli(TCGType type, unsigned vece, 3499 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3500{ 3501 TCGv_vec t; 3502 3503 if (vece == MO_8) { 3504 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm); 3505 return; 3506 } 3507 3508 if (have_avx512vbmi2) { 3509 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 3510 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 3511 return; 3512 } 3513 3514 t = tcg_temp_new_vec(type); 3515 tcg_gen_shli_vec(vece, t, v1, imm); 3516 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3517 tcg_gen_or_vec(vece, v0, v0, t); 3518 tcg_temp_free_vec(t); 3519} 3520 3521static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3522 TCGv_vec v1, TCGv_vec sh, bool right) 3523{ 3524 TCGv_vec t; 3525 3526 if (have_avx512vbmi2) { 3527 vec_gen_4(right ? 
INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 3528 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 3529 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 3530 return; 3531 } 3532 3533 t = tcg_temp_new_vec(type); 3534 tcg_gen_dupi_vec(vece, t, 8 << vece); 3535 tcg_gen_sub_vec(vece, t, t, sh); 3536 if (right) { 3537 tcg_gen_shlv_vec(vece, t, v1, t); 3538 tcg_gen_shrv_vec(vece, v0, v1, sh); 3539 } else { 3540 tcg_gen_shrv_vec(vece, t, v1, t); 3541 tcg_gen_shlv_vec(vece, v0, v1, sh); 3542 } 3543 tcg_gen_or_vec(vece, v0, v0, t); 3544 tcg_temp_free_vec(t); 3545} 3546 3547static void expand_vec_rotls(TCGType type, unsigned vece, 3548 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3549{ 3550 TCGv_vec t = tcg_temp_new_vec(type); 3551 3552 tcg_debug_assert(vece != MO_8); 3553 3554 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 3555 tcg_gen_dup_i32_vec(vece, t, lsh); 3556 if (vece >= MO_32) { 3557 tcg_gen_rotlv_vec(vece, v0, v1, t); 3558 } else { 3559 expand_vec_rotv(type, vece, v0, v1, t, false); 3560 } 3561 } else { 3562 TCGv_i32 rsh = tcg_temp_new_i32(); 3563 3564 tcg_gen_neg_i32(rsh, lsh); 3565 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3566 tcg_gen_shls_vec(vece, t, v1, lsh); 3567 tcg_gen_shrs_vec(vece, v0, v1, rsh); 3568 tcg_gen_or_vec(vece, v0, v0, t); 3569 3570 tcg_temp_free_i32(rsh); 3571 } 3572 3573 tcg_temp_free_vec(t); 3574} 3575 3576static void expand_vec_mul(TCGType type, unsigned vece, 3577 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3578{ 3579 TCGv_vec t1, t2, t3, t4, zero; 3580 3581 tcg_debug_assert(vece == MO_8); 3582 3583 /* 3584 * Unpack v1 bytes to words, 0 | x. 3585 * Unpack v2 bytes to words, y | 0. 3586 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3587 * Shift logical right by 8 bits to clear the high 8 bits before 3588 * using an unsigned saturated pack. 3589 * 3590 * The difference between the V64, V128 and V256 cases is merely how 3591 * we distribute the expansion between temporaries.
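 * Worked example for one lane, x = 3 and y = 5: the unpacked words are 0x0003 and 0x0500, the word multiply gives 0x0f00, the shift right by 8 gives 0x000f, and the unsigned pack stores 0x0f (= 15).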
3592 */ 3593 switch (type) { 3594 case TCG_TYPE_V64: 3595 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 3596 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 3597 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3598 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3599 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3600 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3601 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3602 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3603 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3604 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 3605 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 3606 tcg_temp_free_vec(t1); 3607 tcg_temp_free_vec(t2); 3608 break; 3609 3610 case TCG_TYPE_V128: 3611 case TCG_TYPE_V256: 3612 t1 = tcg_temp_new_vec(type); 3613 t2 = tcg_temp_new_vec(type); 3614 t3 = tcg_temp_new_vec(type); 3615 t4 = tcg_temp_new_vec(type); 3616 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3617 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3618 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3619 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3620 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3621 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3622 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3623 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3624 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3625 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3626 tcg_gen_mul_vec(MO_16, t3, t3, t4); 3627 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3628 tcg_gen_shri_vec(MO_16, t3, t3, 8); 3629 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3630 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 3631 tcg_temp_free_vec(t1); 3632 tcg_temp_free_vec(t2); 3633 tcg_temp_free_vec(t3); 3634 tcg_temp_free_vec(t4); 3635 break; 3636 3637 default: 3638 g_assert_not_reached(); 3639 } 3640} 3641 3642static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, 3643 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3644{ 3645 enum { 3646 NEED_INV = 1, 3647 NEED_SWAP = 2, 3648 NEED_BIAS = 4, 3649 NEED_UMIN = 8, 3650 NEED_UMAX = 16, 3651 }; 3652 TCGv_vec t1, t2, t3; 3653 uint8_t fixup; 3654 3655 switch (cond) { 3656 case TCG_COND_EQ: 3657 case TCG_COND_GT: 3658 fixup = 0; 3659 break; 3660 case TCG_COND_NE: 3661 case TCG_COND_LE: 3662 fixup = NEED_INV; 3663 break; 3664 case TCG_COND_LT: 3665 fixup = NEED_SWAP; 3666 break; 3667 case TCG_COND_GE: 3668 fixup = NEED_SWAP | NEED_INV; 3669 break; 3670 case TCG_COND_LEU: 3671 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3672 fixup = NEED_UMIN; 3673 } else { 3674 fixup = NEED_BIAS | NEED_INV; 3675 } 3676 break; 3677 case TCG_COND_GTU: 3678 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3679 fixup = NEED_UMIN | NEED_INV; 3680 } else { 3681 fixup = NEED_BIAS; 3682 } 3683 break; 3684 case TCG_COND_GEU: 3685 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3686 fixup = NEED_UMAX; 3687 } else { 3688 fixup = NEED_BIAS | NEED_SWAP | NEED_INV; 3689 } 3690 break; 3691 case TCG_COND_LTU: 3692 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3693 fixup = NEED_UMAX | NEED_INV; 3694 } else { 3695 fixup = NEED_BIAS | NEED_SWAP; 3696 } 3697 break; 3698 default: 3699 g_assert_not_reached(); 3700 } 3701 3702 if (fixup & NEED_INV) { 3703 cond = tcg_invert_cond(cond); 3704 } 3705 if (fixup & NEED_SWAP) { 3706 t1 = v1, v1 = v2, v2 = t1; 3707 cond = tcg_swap_cond(cond); 3708 } 3709 3710 t1 = t2 = NULL; 3711 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3712 t1 = 
tcg_temp_new_vec(type); 3713 if (fixup & NEED_UMIN) { 3714 tcg_gen_umin_vec(vece, t1, v1, v2); 3715 } else { 3716 tcg_gen_umax_vec(vece, t1, v1, v2); 3717 } 3718 v2 = t1; 3719 cond = TCG_COND_EQ; 3720 } else if (fixup & NEED_BIAS) { 3721 t1 = tcg_temp_new_vec(type); 3722 t2 = tcg_temp_new_vec(type); 3723 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 3724 tcg_gen_sub_vec(vece, t1, v1, t3); 3725 tcg_gen_sub_vec(vece, t2, v2, t3); 3726 v1 = t1; 3727 v2 = t2; 3728 cond = tcg_signed_cond(cond); 3729 } 3730 3731 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); 3732 /* Expand directly; do not recurse. */ 3733 vec_gen_4(INDEX_op_cmp_vec, type, vece, 3734 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); 3735 3736 if (t1) { 3737 tcg_temp_free_vec(t1); 3738 if (t2) { 3739 tcg_temp_free_vec(t2); 3740 } 3741 } 3742 return fixup & NEED_INV; 3743} 3744 3745static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, 3746 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3747{ 3748 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) { 3749 tcg_gen_not_vec(vece, v0, v0); 3750 } 3751} 3752 3753static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0, 3754 TCGv_vec c1, TCGv_vec c2, 3755 TCGv_vec v3, TCGv_vec v4, TCGCond cond) 3756{ 3757 TCGv_vec t = tcg_temp_new_vec(type); 3758 3759 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) { 3760 /* Invert the sense of the compare by swapping arguments. */ 3761 TCGv_vec x; 3762 x = v3, v3 = v4, v4 = x; 3763 } 3764 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece, 3765 tcgv_vec_arg(v0), tcgv_vec_arg(v4), 3766 tcgv_vec_arg(v3), tcgv_vec_arg(t)); 3767 tcg_temp_free_vec(t); 3768} 3769 3770void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 3771 TCGArg a0, ...) 
3772{ 3773 va_list va; 3774 TCGArg a2; 3775 TCGv_vec v0, v1, v2, v3, v4; 3776 3777 va_start(va, a0); 3778 v0 = temp_tcgv_vec(arg_temp(a0)); 3779 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3780 a2 = va_arg(va, TCGArg); 3781 3782 switch (opc) { 3783 case INDEX_op_shli_vec: 3784 case INDEX_op_shri_vec: 3785 expand_vec_shi(type, vece, opc, v0, v1, a2); 3786 break; 3787 3788 case INDEX_op_sari_vec: 3789 expand_vec_sari(type, vece, v0, v1, a2); 3790 break; 3791 3792 case INDEX_op_rotli_vec: 3793 expand_vec_rotli(type, vece, v0, v1, a2); 3794 break; 3795 3796 case INDEX_op_rotls_vec: 3797 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 3798 break; 3799 3800 case INDEX_op_rotlv_vec: 3801 v2 = temp_tcgv_vec(arg_temp(a2)); 3802 expand_vec_rotv(type, vece, v0, v1, v2, false); 3803 break; 3804 case INDEX_op_rotrv_vec: 3805 v2 = temp_tcgv_vec(arg_temp(a2)); 3806 expand_vec_rotv(type, vece, v0, v1, v2, true); 3807 break; 3808 3809 case INDEX_op_mul_vec: 3810 v2 = temp_tcgv_vec(arg_temp(a2)); 3811 expand_vec_mul(type, vece, v0, v1, v2); 3812 break; 3813 3814 case INDEX_op_cmp_vec: 3815 v2 = temp_tcgv_vec(arg_temp(a2)); 3816 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); 3817 break; 3818 3819 case INDEX_op_cmpsel_vec: 3820 v2 = temp_tcgv_vec(arg_temp(a2)); 3821 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3822 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3823 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg)); 3824 break; 3825 3826 default: 3827 break; 3828 } 3829 3830 va_end(va); 3831} 3832 3833static const int tcg_target_callee_save_regs[] = { 3834#if TCG_TARGET_REG_BITS == 64 3835 TCG_REG_RBP, 3836 TCG_REG_RBX, 3837#if defined(_WIN64) 3838 TCG_REG_RDI, 3839 TCG_REG_RSI, 3840#endif 3841 TCG_REG_R12, 3842 TCG_REG_R13, 3843 TCG_REG_R14, /* Currently used for the global env. */ 3844 TCG_REG_R15, 3845#else 3846 TCG_REG_EBP, /* Currently used for the global env. */ 3847 TCG_REG_EBX, 3848 TCG_REG_ESI, 3849 TCG_REG_EDI, 3850#endif 3851}; 3852 3853/* Compute frame size via macros, to share between tcg_target_qemu_prologue 3854 and tcg_register_jit. */ 3855 3856#define PUSH_SIZE \ 3857 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 3858 * (TCG_TARGET_REG_BITS / 8)) 3859 3860#define FRAME_SIZE \ 3861 ((PUSH_SIZE \ 3862 + TCG_STATIC_CALL_ARGS_SIZE \ 3863 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 3864 + TCG_TARGET_STACK_ALIGN - 1) \ 3865 & ~(TCG_TARGET_STACK_ALIGN - 1)) 3866 3867/* Generate global QEMU prologue and epilogue code */ 3868static void tcg_target_qemu_prologue(TCGContext *s) 3869{ 3870 int i, stack_addend; 3871 3872 /* TB prologue */ 3873 3874 /* Reserve some stack space, also for TCG temps. */ 3875 stack_addend = FRAME_SIZE - PUSH_SIZE; 3876 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 3877 CPU_TEMP_BUF_NLONGS * sizeof(long)); 3878 3879 /* Save all callee saved registers. */ 3880 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 3881 tcg_out_push(s, tcg_target_callee_save_regs[i]); 3882 } 3883 3884#if TCG_TARGET_REG_BITS == 32 3885 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 3886 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 3887 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3888 /* jmp *tb. 
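 * (For the 32-bit build the TB pointer is the second stack argument; the offset below steps over the frame just allocated, the saved registers, the return address and the env argument.)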
*/ 3889 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, 3890 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 3891 + stack_addend); 3892#else 3893# if !defined(CONFIG_SOFTMMU) 3894 if (guest_base) { 3895 int seg = setup_guest_base_seg(); 3896 if (seg != 0) { 3897 x86_guest_base.seg = seg; 3898 } else if (guest_base == (int32_t)guest_base) { 3899 x86_guest_base.ofs = guest_base; 3900 } else { 3901 /* Choose R12 because, as a base, it requires a SIB byte. */ 3902 x86_guest_base.index = TCG_REG_R12; 3903 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base); 3904 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index); 3905 } 3906 } 3907# endif 3908 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); 3909 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3910 /* jmp *tb. */ 3911 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); 3912#endif 3913 3914 /* 3915 * Return path for goto_ptr. Set return value to 0, a-la exit_tb, 3916 * and fall through to the rest of the epilogue. 3917 */ 3918 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); 3919 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 3920 3921 /* TB epilogue */ 3922 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 3923 3924 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 3925 3926 if (have_avx2) { 3927 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 3928 } 3929 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 3930 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 3931 } 3932 tcg_out_opc(s, OPC_RET, 0, 0, 0); 3933} 3934 3935static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 3936{ 3937 memset(p, 0x90, count); 3938} 3939 3940static void tcg_target_init(TCGContext *s) 3941{ 3942#ifdef CONFIG_CPUID_H 3943 unsigned a, b, c, d, b7 = 0, c7 = 0; 3944 unsigned max = __get_cpuid_max(0, 0); 3945 3946 if (max >= 7) { 3947 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ 3948 __cpuid_count(7, 0, a, b7, c7, d); 3949 have_bmi1 = (b7 & bit_BMI) != 0; 3950 have_bmi2 = (b7 & bit_BMI2) != 0; 3951 } 3952 3953 if (max >= 1) { 3954 __cpuid(1, a, b, c, d); 3955#ifndef have_cmov 3956 /* For 32-bit, 99% certainty that we're running on hardware that 3957 supports cmov, but we still need to check. In case cmov is not 3958 available, we'll use a small forward branch. */ 3959 have_cmov = (d & bit_CMOV) != 0; 3960#endif 3961 3962 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we 3963 need to probe for it. */ 3964 have_movbe = (c & bit_MOVBE) != 0; 3965 have_popcnt = (c & bit_POPCNT) != 0; 3966 3967 /* There are a number of things we must check before we can be 3968 sure of not hitting invalid opcode. */ 3969 if (c & bit_OSXSAVE) { 3970 unsigned bv = xgetbv_low(0); 3971 3972 if ((bv & 6) == 6) { 3973 have_avx1 = (c & bit_AVX) != 0; 3974 have_avx2 = (b7 & bit_AVX2) != 0; 3975 3976 /* 3977 * There are interesting instructions in AVX512, so long 3978 * as we have AVX512VL, which indicates support for EVEX 3979 * on sizes smaller than 512 bits. We are required to 3980 * check that OPMASK and all extended ZMM state are enabled 3981 * even if we're not using them -- the insns will fault. 
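 * (In XCR0, bit 1 is SSE and bit 2 is AVX state, hence the (bv & 6) == 6 test above; bits 5-7 cover the opmask, ZMM_Hi256 and Hi16_ZMM state tested as 0xe0 below.)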
3982 */ 3983 if ((bv & 0xe0) == 0xe0 3984 && (b7 & bit_AVX512F) 3985 && (b7 & bit_AVX512VL)) { 3986 have_avx512vl = true; 3987 have_avx512bw = (b7 & bit_AVX512BW) != 0; 3988 have_avx512dq = (b7 & bit_AVX512DQ) != 0; 3989 have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0; 3990 } 3991 3992 /* 3993 * The Intel SDM has added: 3994 * Processors that enumerate support for Intel® AVX 3995 * (by setting the feature flag CPUID.01H:ECX.AVX[bit 28]) 3996 * guarantee that the 16-byte memory operations performed 3997 * by the following instructions will always be carried 3998 * out atomically: 3999 * - MOVAPD, MOVAPS, and MOVDQA. 4000 * - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128. 4001 * - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded 4002 * with EVEX.128 and k0 (masking disabled). 4003 * Note that these instructions require the linear addresses 4004 * of their memory operands to be 16-byte aligned. 4005 * 4006 * AMD has provided an even stronger guarantee that processors 4007 * with AVX provide 16-byte atomicity for all cachable, 4008 * naturally aligned single loads and stores, e.g. MOVDQU. 4009 * 4010 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 4011 */ 4012 if (have_avx1) { 4013 __cpuid(0, a, b, c, d); 4014 have_atomic16 = (c == signature_INTEL_ecx || 4015 c == signature_AMD_ecx); 4016 } 4017 } 4018 } 4019 } 4020 4021 max = __get_cpuid_max(0x8000000, 0); 4022 if (max >= 1) { 4023 __cpuid(0x80000001, a, b, c, d); 4024 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */ 4025 have_lzcnt = (c & bit_LZCNT) != 0; 4026 } 4027#endif /* CONFIG_CPUID_H */ 4028 4029 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; 4030 if (TCG_TARGET_REG_BITS == 64) { 4031 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; 4032 } 4033 if (have_avx1) { 4034 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; 4035 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; 4036 } 4037 if (have_avx2) { 4038 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; 4039 } 4040 4041 tcg_target_call_clobber_regs = ALL_VECTOR_REGS; 4042 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); 4043 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX); 4044 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX); 4045 if (TCG_TARGET_REG_BITS == 64) { 4046#if !defined(_WIN64) 4047 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI); 4048 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI); 4049#endif 4050 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8); 4051 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9); 4052 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10); 4053 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11); 4054 } 4055 4056 s->reserved_regs = 0; 4057 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); 4058#ifdef _WIN64 4059 /* These are call saved, and we don't save them, so don't use them. 
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF.  */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
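/*
 * A worked example of the hand-assembled DWARF CFI above, taking the
 * 64-bit variant (illustrative only; the concrete FRAME_SIZE value used
 * below is assumed, not computed):
 *
 *   - data_align 0x78 is the one-byte sleb128 encoding of -8: bit 7
 *     (continuation) is clear, bit 6 (sign) is set, so the value is
 *     0x78 - 0x80 = -8.
 *   - DW_CFA_offset is encoded as 0x80 | regno followed by a uleb128
 *     factored offset, so "0x86, 2" means "%rbp saved at CFA + 2 * -8",
 *     i.e. CFA - 16, matching the comment beside it.
 *   - FRAME_SIZE is emitted as a two-byte uleb128; if FRAME_SIZE were
 *     0x1a8, the bytes would be (0x1a8 & 0x7f) | 0x80 = 0xa8 and
 *     0x1a8 >> 7 = 0x03.  The QEMU_BUILD_BUG_ON above guarantees that
 *     two bytes always suffice (FRAME_SIZE < 1 << 14).
 */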