/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them in the prologue.  Therefore only allow xmm0-xmm5 to
       be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept.  */
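/*
 * Roughly, as tested by tcg_target_const_match() below: S32 accepts a
 * value representable as a sign-extended 32-bit immediate, U32 a value
 * representable zero-extended, I32 a value whose bitwise complement fits
 * in 32 bits (so the inverted constant can be encoded instead), and WSZ
 * only the operation width itself (32 or 64).
 */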
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS          0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS       0x0000ffffu
# define ALL_VECTOR_REGS        0xffff0000u
# define ALL_BYTEL_REGS         ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS       0x000000ffu
# define ALL_VECTOR_REGS        0x00ff0000u
# define ALL_BYTEL_REGS         ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS   ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS   0
#endif

/* The host compiler should supply <cpuid.h> to enable runtime features
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed. */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available. */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable. */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_avx512bw;
bool have_avx512dq;
bool have_avx512vbmi2;
bool have_avx512vl;
bool have_movbe;
bool have_atomic16;

#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch (type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}
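
/*
 * Each OPC_* value below packs the instruction's final opcode byte in the
 * low 8 bits and a set of P_* prefix flags above it.  tcg_out_opc() turns
 * the flags back into the legacy 0x66/0xf2/0xf3 prefixes, a REX byte, and
 * the 0x0f/0x0f38/0x0f3a escapes; the VEX/EVEX emitters reuse the same
 * flags for their m-mmmm and pp fields.  For example, OPC_MOVZWL is
 * 0xb7 | P_EXT and is emitted as "0f b7 /r".
 */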

# define LOWREGMASK(x) ((x) & 7)

#define P_EXT      0x100    /* 0x0f opcode prefix */
#define P_EXT38    0x200    /* 0x0f 0x38 opcode prefix */
#define P_DATA16   0x400    /* 0x66 opcode prefix */
#define P_VEXW     0x1000   /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW    P_VEXW   /* Set REX.W = 1; match VEXW */
# define P_REXB_R  0x2000   /* REG field as byte register */
# define P_REXB_RM 0x4000   /* R/M field as byte register */
# define P_GS      0x8000   /* gs segment override */
#else
# define P_REXW    0
# define P_REXB_R  0
# define P_REXB_RM 0
# define P_GS      0
#endif
#define P_EXT3A    0x10000  /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3   0x20000  /* 0xf3 opcode prefix */
#define P_SIMDF2   0x40000  /* 0xf2 opcode prefix */
#define P_VEXL     0x80000  /* Set VEX.L = 1 */
#define P_EVEX     0x100000 /* Requires EVEX encoding */

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7
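
/*
 * As used by tcg_out_shifti() below: the SHIFT_* value becomes the /reg
 * field of a Group 2 ModRM byte.  For example, a shift-left of %ecx by 5
 * is OPC_SHIFT_Ib with /4, i.e. the bytes "c1 e1 05".
 */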

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev   0
#define EXT5_DEC_Ev   1
#define EXT5_CALLN_Ev 2
#define EXT5_JMPN_Ev  4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif
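
/*
 * A register-direct operand is ModRM with mod = 3.  As a worked example,
 * tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, TCG_REG_RAX, TCG_REG_RBX)
 * emits "48 8b c3", i.e. movq %rbx, %rax.
 */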

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index)
{
    /* The entire 4-byte evex prefix; with R' and V' set.  */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                                /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                                /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                                /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);    /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);     /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}
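
/*
 * Both _pool helpers above emit ModRM with mod = 0, rm = 5 and a zero
 * displacement: absolute disp32 on i386, rip-relative on x86-64.  The
 * caller is expected to register the placeholder with new_pool_label()
 * or new_pool_l2(), so that patch_reloc() fills in the real displacement
 * (R_386_32 or R_386_PC32) once the constant pool has been laid out.
 */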

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}
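
/*
 * Note the pre-AVX2 path above: 8- and 16-bit elements are first widened
 * with punpcklbw/punpcklwd, after which pshufd with an immediate of 0
 * replicates the low 32-bit lane; 64-bit elements use punpcklqdq directly.
 */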

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
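
/*
 * The resulting size ladder for tcg_out_movi_int is roughly: 2-3 bytes for
 * the xor (zero), 5 bytes for "mov $imm32, %r32" (b8+r), 7 bytes for the
 * sign-extended "c7 /0" form or the rip-relative lea, and only then the
 * full 10-byte movabs.
 */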

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference.  */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}
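
/*
 * The src < 4 assertions here reflect the i386 encoding: without a REX
 * prefix only %al, %cl, %dl and %bl are byte-addressable (encodings 4-7
 * select %ah..%bh), while P_REXB_RM forces a REX prefix on x86-64 so the
 * low byte of any register can be used.
 */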

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    if (dest != src) {
        tcg_out_ext32u(s, dest, src);
    }
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}
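
/*
 * Branch encodings used below: the short forms are 2 bytes (opcode plus
 * rel8), jmp rel32 is 5 bytes and jcc rel32 is 6 bytes (0f 8x).  The
 * displacement is relative to the end of the instruction, which is where
 * the "val - 2", "val - 5" and "val - 6" adjustments in tcg_out_jxx()
 * come from.
 */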

/* Set SMALL to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross basic blocks temporaries */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch (args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        g_assert_not_reached();
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument of the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
#endif
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
static void __attribute__((unused))
tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
{
    /*
     * This is used for testing alignment, so we can usually use testb.
     * For i686, we have to use testl for %esi/%edi.
     */
    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
        tcg_out8(s, i);
    } else {
        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
        tcg_out32(s, i);
    }
}

typedef struct {
    TCGReg base;
    int index;
    int ofs;
    int seg;
    TCGAtomAlign aa;
} HostAddress;

bool tcg_target_has_memory_bswap(MemOp memop)
{
    return have_movbe;
}

/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
 */
#if TCG_TARGET_REG_BITS == 64
static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
{
    if (arg < 0) {
        arg = TCG_REG_RAX;
    }
    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
    return arg;
}
static const TCGLdstHelperParam ldst_helper_param = {
    .ra_gen = ldst_ra_gen
};
#else
static const TCGLdstHelperParam ldst_helper_param = { };
#endif

/*
 * Generate code for the slow path for a load at the end of block
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);

    tcg_out_jmp(s, l->raddr);
    return true;
}

/*
 * Generate code for the slow path for a store at the end of block
 */
static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_st_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);

    tcg_out_jmp(s, l->raddr);
    return true;
}

#ifndef CONFIG_SOFTMMU
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
static inline int setup_guest_base_seg(void)
{
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>
static inline int setup_guest_base_seg(void)
{
    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#else
static inline int setup_guest_base_seg(void)
{
    return 0;
}
#endif /* setup_guest_base_seg */
#endif /* !SOFTMMU */

/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    unsigned a_mask;

#ifdef CONFIG_SOFTMMU
    h->index = TCG_REG_L0;
    h->ofs = 0;
    h->seg = 0;
#else
    *h = x86_guest_base;
#endif
    h->base = addrlo;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
    a_mask = (1 << h->aa.align) - 1;

#ifdef CONFIG_SOFTMMU
    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                        : offsetof(CPUTLBEntry, addr_write);
offsetof(CPUTLBEntry, addr_read) 1916 : offsetof(CPUTLBEntry, addr_write); 1917 TCGType ttype = TCG_TYPE_I32; 1918 TCGType tlbtype = TCG_TYPE_I32; 1919 int trexw = 0, hrexw = 0, tlbrexw = 0; 1920 unsigned mem_index = get_mmuidx(oi); 1921 unsigned s_bits = opc & MO_SIZE; 1922 unsigned s_mask = (1 << s_bits) - 1; 1923 int tlb_mask; 1924 1925 ldst = new_ldst_label(s); 1926 ldst->is_ld = is_ld; 1927 ldst->oi = oi; 1928 ldst->addrlo_reg = addrlo; 1929 ldst->addrhi_reg = addrhi; 1930 1931 if (TCG_TARGET_REG_BITS == 64) { 1932 ttype = s->addr_type; 1933 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 1934 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 1935 hrexw = P_REXW; 1936 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 1937 tlbtype = TCG_TYPE_I64; 1938 tlbrexw = P_REXW; 1939 } 1940 } 1941 } 1942 1943 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 1944 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 1945 s->page_bits - CPU_TLB_ENTRY_BITS); 1946 1947 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 1948 TLB_MASK_TABLE_OFS(mem_index) + 1949 offsetof(CPUTLBDescFast, mask)); 1950 1951 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 1952 TLB_MASK_TABLE_OFS(mem_index) + 1953 offsetof(CPUTLBDescFast, table)); 1954 1955 /* 1956 * If the required alignment is at least as large as the access, simply 1957 * copy the address and mask. For lesser alignments, check that we don't 1958 * cross pages for the complete access. 1959 */ 1960 if (a_mask >= s_mask) { 1961 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 1962 } else { 1963 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 1964 addrlo, s_mask - a_mask); 1965 } 1966 tlb_mask = s->page_mask | a_mask; 1967 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 1968 1969 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 1970 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 1971 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 1972 1973 /* jne slow_path */ 1974 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1975 ldst->label_ptr[0] = s->code_ptr; 1976 s->code_ptr += 4; 1977 1978 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { 1979 /* cmp 4(TCG_REG_L0), addrhi */ 1980 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4); 1981 1982 /* jne slow_path */ 1983 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1984 ldst->label_ptr[1] = s->code_ptr; 1985 s->code_ptr += 4; 1986 } 1987 1988 /* TLB Hit. */ 1989 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 1990 offsetof(CPUTLBEntry, addend)); 1991#else 1992 if (a_mask) { 1993 ldst = new_ldst_label(s); 1994 1995 ldst->is_ld = is_ld; 1996 ldst->oi = oi; 1997 ldst->addrlo_reg = addrlo; 1998 ldst->addrhi_reg = addrhi; 1999 2000 tcg_out_testi(s, addrlo, a_mask); 2001 /* jne slow_path */ 2002 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2003 ldst->label_ptr[0] = s->code_ptr; 2004 s->code_ptr += 4; 2005 } 2006#endif 2007 2008 return ldst; 2009} 2010 2011static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2012 HostAddress h, TCGType type, MemOp memop) 2013{ 2014 bool use_movbe = false; 2015 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2016 int movop = OPC_MOVL_GvEv; 2017 2018 /* Do big-endian loads with movbe. 
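 * tcg_target_has_memory_bswap() above reports have_movbe, so a MO_BSWAP
 * load only reaches this point when MOVBE is available; otherwise the
 * byte swapping is done generically outside the backend.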
*/ 2019 if (memop & MO_BSWAP) { 2020 tcg_debug_assert(have_movbe); 2021 use_movbe = true; 2022 movop = OPC_MOVBE_GyMy; 2023 } 2024 2025 switch (memop & MO_SSIZE) { 2026 case MO_UB: 2027 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2028 h.base, h.index, 0, h.ofs); 2029 break; 2030 case MO_SB: 2031 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2032 h.base, h.index, 0, h.ofs); 2033 break; 2034 case MO_UW: 2035 if (use_movbe) { 2036 /* There is no extending movbe; only low 16-bits are modified. */ 2037 if (datalo != h.base && datalo != h.index) { 2038 /* XOR breaks dependency chains. */ 2039 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2040 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2041 datalo, h.base, h.index, 0, h.ofs); 2042 } else { 2043 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2044 datalo, h.base, h.index, 0, h.ofs); 2045 tcg_out_ext16u(s, datalo, datalo); 2046 } 2047 } else { 2048 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2049 h.base, h.index, 0, h.ofs); 2050 } 2051 break; 2052 case MO_SW: 2053 if (use_movbe) { 2054 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2055 datalo, h.base, h.index, 0, h.ofs); 2056 tcg_out_ext16s(s, type, datalo, datalo); 2057 } else { 2058 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2059 datalo, h.base, h.index, 0, h.ofs); 2060 } 2061 break; 2062 case MO_UL: 2063 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2064 h.base, h.index, 0, h.ofs); 2065 break; 2066#if TCG_TARGET_REG_BITS == 64 2067 case MO_SL: 2068 if (use_movbe) { 2069 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2070 h.base, h.index, 0, h.ofs); 2071 tcg_out_ext32s(s, datalo, datalo); 2072 } else { 2073 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2074 h.base, h.index, 0, h.ofs); 2075 } 2076 break; 2077#endif 2078 case MO_UQ: 2079 if (TCG_TARGET_REG_BITS == 64) { 2080 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2081 h.base, h.index, 0, h.ofs); 2082 break; 2083 } 2084 if (use_movbe) { 2085 TCGReg t = datalo; 2086 datalo = datahi; 2087 datahi = t; 2088 } 2089 if (h.base == datalo || h.index == datalo) { 2090 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2091 h.base, h.index, 0, h.ofs); 2092 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2093 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2094 } else { 2095 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2096 h.base, h.index, 0, h.ofs); 2097 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2098 h.base, h.index, 0, h.ofs + 4); 2099 } 2100 break; 2101 default: 2102 g_assert_not_reached(); 2103 } 2104} 2105 2106static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2107 TCGReg addrlo, TCGReg addrhi, 2108 MemOpIdx oi, TCGType data_type) 2109{ 2110 TCGLabelQemuLdst *ldst; 2111 HostAddress h; 2112 2113 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); 2114 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2115 2116 if (ldst) { 2117 ldst->type = data_type; 2118 ldst->datalo_reg = datalo; 2119 ldst->datahi_reg = datahi; 2120 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2121 } 2122} 2123 2124static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2125 HostAddress h, MemOp memop) 2126{ 2127 bool use_movbe = false; 2128 int movop = OPC_MOVL_EvGv; 2129 2130 /* 2131 * Do big-endian stores with movbe or softmmu. 2132 * User-only without movbe will have its swapping done generically. 
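 * With MOVBE only the opcode changes: movop becomes OPC_MOVBE_MyGy,
 * which stores the general register with its bytes reversed, while the
 * MO_SIZE dispatch below stays the same.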
2133 */ 2134 if (memop & MO_BSWAP) { 2135 tcg_debug_assert(have_movbe); 2136 use_movbe = true; 2137 movop = OPC_MOVBE_MyGy; 2138 } 2139 2140 switch (memop & MO_SIZE) { 2141 case MO_8: 2142 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2143 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2144 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2145 datalo, h.base, h.index, 0, h.ofs); 2146 break; 2147 case MO_16: 2148 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2149 h.base, h.index, 0, h.ofs); 2150 break; 2151 case MO_32: 2152 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2153 h.base, h.index, 0, h.ofs); 2154 break; 2155 case MO_64: 2156 if (TCG_TARGET_REG_BITS == 64) { 2157 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2158 h.base, h.index, 0, h.ofs); 2159 } else { 2160 if (use_movbe) { 2161 TCGReg t = datalo; 2162 datalo = datahi; 2163 datahi = t; 2164 } 2165 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2166 h.base, h.index, 0, h.ofs); 2167 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2168 h.base, h.index, 0, h.ofs + 4); 2169 } 2170 break; 2171 default: 2172 g_assert_not_reached(); 2173 } 2174} 2175 2176static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2177 TCGReg addrlo, TCGReg addrhi, 2178 MemOpIdx oi, TCGType data_type) 2179{ 2180 TCGLabelQemuLdst *ldst; 2181 HostAddress h; 2182 2183 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); 2184 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2185 2186 if (ldst) { 2187 ldst->type = data_type; 2188 ldst->datalo_reg = datalo; 2189 ldst->datahi_reg = datahi; 2190 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2191 } 2192} 2193 2194static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2195{ 2196 /* Reuse the zeroing that exists for goto_ptr. */ 2197 if (a0 == 0) { 2198 tcg_out_jmp(s, tcg_code_gen_epilogue); 2199 } else { 2200 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2201 tcg_out_jmp(s, tb_ret_addr); 2202 } 2203} 2204 2205static void tcg_out_goto_tb(TCGContext *s, int which) 2206{ 2207 /* 2208 * Jump displacement must be aligned for atomic patching; 2209 * see if we need to add extra nops before jump 2210 */ 2211 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2212 if (gap != 1) { 2213 tcg_out_nopn(s, gap - 1); 2214 } 2215 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2216 set_jmp_insn_offset(s, which); 2217 tcg_out32(s, 0); 2218 set_jmp_reset_offset(s, which); 2219} 2220 2221void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2222 uintptr_t jmp_rx, uintptr_t jmp_rw) 2223{ 2224 /* patch the branch destination */ 2225 uintptr_t addr = tb->jmp_target_addr[n]; 2226 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2227 /* no need to flush icache explicitly */ 2228} 2229 2230static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2231 const TCGArg args[TCG_MAX_OP_ARGS], 2232 const int const_args[TCG_MAX_OP_ARGS]) 2233{ 2234 TCGArg a0, a1, a2; 2235 int c, const_a2, vexop, rexw = 0; 2236 2237#if TCG_TARGET_REG_BITS == 64 2238# define OP_32_64(x) \ 2239 case glue(glue(INDEX_op_, x), _i64): \ 2240 rexw = P_REXW; /* FALLTHRU */ \ 2241 case glue(glue(INDEX_op_, x), _i32) 2242#else 2243# define OP_32_64(x) \ 2244 case glue(glue(INDEX_op_, x), _i32) 2245#endif 2246 2247 /* Hoist the loads of the most common arguments. 
*/ 2248 a0 = args[0]; 2249 a1 = args[1]; 2250 a2 = args[2]; 2251 const_a2 = const_args[2]; 2252 2253 switch (opc) { 2254 case INDEX_op_goto_ptr: 2255 /* jmp to the given host address (could be epilogue) */ 2256 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2257 break; 2258 case INDEX_op_br: 2259 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2260 break; 2261 OP_32_64(ld8u): 2262 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2263 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2264 break; 2265 OP_32_64(ld8s): 2266 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2267 break; 2268 OP_32_64(ld16u): 2269 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2270 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2271 break; 2272 OP_32_64(ld16s): 2273 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2274 break; 2275#if TCG_TARGET_REG_BITS == 64 2276 case INDEX_op_ld32u_i64: 2277#endif 2278 case INDEX_op_ld_i32: 2279 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2280 break; 2281 2282 OP_32_64(st8): 2283 if (const_args[0]) { 2284 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2285 tcg_out8(s, a0); 2286 } else { 2287 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2288 } 2289 break; 2290 OP_32_64(st16): 2291 if (const_args[0]) { 2292 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2293 tcg_out16(s, a0); 2294 } else { 2295 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2296 } 2297 break; 2298#if TCG_TARGET_REG_BITS == 64 2299 case INDEX_op_st32_i64: 2300#endif 2301 case INDEX_op_st_i32: 2302 if (const_args[0]) { 2303 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2304 tcg_out32(s, a0); 2305 } else { 2306 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2307 } 2308 break; 2309 2310 OP_32_64(add): 2311 /* For 3-operand addition, use LEA. */ 2312 if (a0 != a1) { 2313 TCGArg c3 = 0; 2314 if (const_a2) { 2315 c3 = a2, a2 = -1; 2316 } else if (a0 == a2) { 2317 /* Watch out for dest = src + dest, since we've removed 2318 the matching constraint on the add. */ 2319 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2320 break; 2321 } 2322 2323 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2324 break; 2325 } 2326 c = ARITH_ADD; 2327 goto gen_arith; 2328 OP_32_64(sub): 2329 c = ARITH_SUB; 2330 goto gen_arith; 2331 OP_32_64(and): 2332 c = ARITH_AND; 2333 goto gen_arith; 2334 OP_32_64(or): 2335 c = ARITH_OR; 2336 goto gen_arith; 2337 OP_32_64(xor): 2338 c = ARITH_XOR; 2339 goto gen_arith; 2340 gen_arith: 2341 if (const_a2) { 2342 tgen_arithi(s, c + rexw, a0, a2, 0); 2343 } else { 2344 tgen_arithr(s, c + rexw, a0, a2); 2345 } 2346 break; 2347 2348 OP_32_64(andc): 2349 if (const_a2) { 2350 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2351 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2352 } else { 2353 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2354 } 2355 break; 2356 2357 OP_32_64(mul): 2358 if (const_a2) { 2359 int32_t val; 2360 val = a2; 2361 if (val == (int8_t)val) { 2362 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2363 tcg_out8(s, val); 2364 } else { 2365 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2366 tcg_out32(s, val); 2367 } 2368 } else { 2369 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2370 } 2371 break; 2372 2373 OP_32_64(div2): 2374 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2375 break; 2376 OP_32_64(divu2): 2377 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2378 break; 2379 2380 OP_32_64(shl): 2381 /* For small constant 3-operand shift, use LEA. 
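 * The shift count maps onto the SIB scale field: a2 of 1, 2 or 3 selects
 * an index scaled by 2, 4 or 8, e.g. shl $2,a1,a0 -> lea 0(,a1,4),a0.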
*/ 2382 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2383 if (a2 - 1 == 0) { 2384 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2385 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2386 } else { 2387 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2388 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2389 } 2390 break; 2391 } 2392 c = SHIFT_SHL; 2393 vexop = OPC_SHLX; 2394 goto gen_shift_maybe_vex; 2395 OP_32_64(shr): 2396 c = SHIFT_SHR; 2397 vexop = OPC_SHRX; 2398 goto gen_shift_maybe_vex; 2399 OP_32_64(sar): 2400 c = SHIFT_SAR; 2401 vexop = OPC_SARX; 2402 goto gen_shift_maybe_vex; 2403 OP_32_64(rotl): 2404 c = SHIFT_ROL; 2405 goto gen_shift; 2406 OP_32_64(rotr): 2407 c = SHIFT_ROR; 2408 goto gen_shift; 2409 gen_shift_maybe_vex: 2410 if (have_bmi2) { 2411 if (!const_a2) { 2412 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2413 break; 2414 } 2415 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2416 } 2417 /* FALLTHRU */ 2418 gen_shift: 2419 if (const_a2) { 2420 tcg_out_shifti(s, c + rexw, a0, a2); 2421 } else { 2422 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2423 } 2424 break; 2425 2426 OP_32_64(ctz): 2427 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2428 break; 2429 OP_32_64(clz): 2430 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2431 break; 2432 OP_32_64(ctpop): 2433 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2434 break; 2435 2436 case INDEX_op_brcond_i32: 2437 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2438 break; 2439 case INDEX_op_setcond_i32: 2440 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2); 2441 break; 2442 case INDEX_op_movcond_i32: 2443 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]); 2444 break; 2445 2446 OP_32_64(bswap16): 2447 if (a2 & TCG_BSWAP_OS) { 2448 /* Output must be sign-extended. */ 2449 if (rexw) { 2450 tcg_out_bswap64(s, a0); 2451 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2452 } else { 2453 tcg_out_bswap32(s, a0); 2454 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2455 } 2456 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2457 /* Output must be zero-extended, but input isn't. 
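 * bswap32 moves the (byte-swapped) low 16 bits up into bits 16..31, and
 * the logical right shift by 16 brings them back down zero-extended.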
*/ 2458 tcg_out_bswap32(s, a0); 2459 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2460 } else { 2461 tcg_out_rolw_8(s, a0); 2462 } 2463 break; 2464 OP_32_64(bswap32): 2465 tcg_out_bswap32(s, a0); 2466 if (rexw && (a2 & TCG_BSWAP_OS)) { 2467 tcg_out_ext32s(s, a0, a0); 2468 } 2469 break; 2470 2471 OP_32_64(neg): 2472 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2473 break; 2474 OP_32_64(not): 2475 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2476 break; 2477 2478 case INDEX_op_qemu_ld_a64_i32: 2479 if (TCG_TARGET_REG_BITS == 32) { 2480 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2481 break; 2482 } 2483 /* fall through */ 2484 case INDEX_op_qemu_ld_a32_i32: 2485 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2486 break; 2487 case INDEX_op_qemu_ld_a32_i64: 2488 if (TCG_TARGET_REG_BITS == 64) { 2489 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2490 } else { 2491 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2492 } 2493 break; 2494 case INDEX_op_qemu_ld_a64_i64: 2495 if (TCG_TARGET_REG_BITS == 64) { 2496 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2497 } else { 2498 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2499 } 2500 break; 2501 2502 case INDEX_op_qemu_st_a64_i32: 2503 case INDEX_op_qemu_st8_a64_i32: 2504 if (TCG_TARGET_REG_BITS == 32) { 2505 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2506 break; 2507 } 2508 /* fall through */ 2509 case INDEX_op_qemu_st_a32_i32: 2510 case INDEX_op_qemu_st8_a32_i32: 2511 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2512 break; 2513 case INDEX_op_qemu_st_a32_i64: 2514 if (TCG_TARGET_REG_BITS == 64) { 2515 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2516 } else { 2517 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2518 } 2519 break; 2520 case INDEX_op_qemu_st_a64_i64: 2521 if (TCG_TARGET_REG_BITS == 64) { 2522 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2523 } else { 2524 tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2525 } 2526 break; 2527 2528 OP_32_64(mulu2): 2529 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2530 break; 2531 OP_32_64(muls2): 2532 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2533 break; 2534 OP_32_64(add2): 2535 if (const_args[4]) { 2536 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2537 } else { 2538 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2539 } 2540 if (const_args[5]) { 2541 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2542 } else { 2543 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2544 } 2545 break; 2546 OP_32_64(sub2): 2547 if (const_args[4]) { 2548 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2549 } else { 2550 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2551 } 2552 if (const_args[5]) { 2553 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2554 } else { 2555 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2556 } 2557 break; 2558 2559#if TCG_TARGET_REG_BITS == 32 2560 case INDEX_op_brcond2_i32: 2561 tcg_out_brcond2(s, args, const_args, 0); 2562 break; 2563 case INDEX_op_setcond2_i32: 2564 tcg_out_setcond2(s, args, const_args); 2565 break; 2566#else /* TCG_TARGET_REG_BITS == 64 */ 2567 case INDEX_op_ld32s_i64: 2568 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2569 break; 2570 case INDEX_op_ld_i64: 2571 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2572 break; 2573 case INDEX_op_st_i64: 2574 if (const_args[0]) { 2575 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2576 tcg_out32(s, a0); 2577 } else { 2578 tcg_out_st(s, 
TCG_TYPE_I64, a0, a1, a2); 2579 } 2580 break; 2581 2582 case INDEX_op_brcond_i64: 2583 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2584 break; 2585 case INDEX_op_setcond_i64: 2586 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2); 2587 break; 2588 case INDEX_op_movcond_i64: 2589 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]); 2590 break; 2591 2592 case INDEX_op_bswap64_i64: 2593 tcg_out_bswap64(s, a0); 2594 break; 2595 case INDEX_op_extrh_i64_i32: 2596 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2597 break; 2598#endif 2599 2600 OP_32_64(deposit): 2601 if (args[3] == 0 && args[4] == 8) { 2602 /* load bits 0..7 */ 2603 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2604 } else if (args[3] == 8 && args[4] == 8) { 2605 /* load bits 8..15 */ 2606 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2607 } else if (args[3] == 0 && args[4] == 16) { 2608 /* load bits 0..15 */ 2609 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2610 } else { 2611 g_assert_not_reached(); 2612 } 2613 break; 2614 2615 case INDEX_op_extract_i64: 2616 if (a2 + args[3] == 32) { 2617 /* This is a 32-bit zero-extending right shift. */ 2618 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2619 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2620 break; 2621 } 2622 /* FALLTHRU */ 2623 case INDEX_op_extract_i32: 2624 /* On the off-chance that we can use the high-byte registers. 2625 Otherwise we emit the same ext16 + shift pattern that we 2626 would have gotten from the normal tcg-op.c expansion. */ 2627 tcg_debug_assert(a2 == 8 && args[3] == 8); 2628 if (a1 < 4 && a0 < 8) { 2629 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2630 } else { 2631 tcg_out_ext16u(s, a0, a1); 2632 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2633 } 2634 break; 2635 2636 case INDEX_op_sextract_i32: 2637 /* We don't implement sextract_i64, as we cannot sign-extend to 2638 64-bits without using the REX prefix that explicitly excludes 2639 access to the high-byte registers. */ 2640 tcg_debug_assert(a2 == 8 && args[3] == 8); 2641 if (a1 < 4 && a0 < 8) { 2642 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2643 } else { 2644 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 2645 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2646 } 2647 break; 2648 2649 OP_32_64(extract2): 2650 /* Note that SHRD outputs to the r/m operand. */ 2651 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 2652 tcg_out8(s, args[3]); 2653 break; 2654 2655 case INDEX_op_mb: 2656 tcg_out_mb(s, a0); 2657 break; 2658 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 2659 case INDEX_op_mov_i64: 2660 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 2661 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 2662 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 2663 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. 
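 * (The extension ops should be expanded through the tcg_out_ext* hooks
 *  before reaching this switch, hence the assert in the default case.)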
*/ 2664 case INDEX_op_ext8s_i64: 2665 case INDEX_op_ext8u_i32: 2666 case INDEX_op_ext8u_i64: 2667 case INDEX_op_ext16s_i32: 2668 case INDEX_op_ext16s_i64: 2669 case INDEX_op_ext16u_i32: 2670 case INDEX_op_ext16u_i64: 2671 case INDEX_op_ext32s_i64: 2672 case INDEX_op_ext32u_i64: 2673 case INDEX_op_ext_i32_i64: 2674 case INDEX_op_extu_i32_i64: 2675 case INDEX_op_extrl_i64_i32: 2676 default: 2677 g_assert_not_reached(); 2678 } 2679 2680#undef OP_32_64 2681} 2682 2683static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 2684 unsigned vecl, unsigned vece, 2685 const TCGArg args[TCG_MAX_OP_ARGS], 2686 const int const_args[TCG_MAX_OP_ARGS]) 2687{ 2688 static int const add_insn[4] = { 2689 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 2690 }; 2691 static int const ssadd_insn[4] = { 2692 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 2693 }; 2694 static int const usadd_insn[4] = { 2695 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 2696 }; 2697 static int const sub_insn[4] = { 2698 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 2699 }; 2700 static int const sssub_insn[4] = { 2701 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 2702 }; 2703 static int const ussub_insn[4] = { 2704 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 2705 }; 2706 static int const mul_insn[4] = { 2707 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 2708 }; 2709 static int const shift_imm_insn[4] = { 2710 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 2711 }; 2712 static int const cmpeq_insn[4] = { 2713 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 2714 }; 2715 static int const cmpgt_insn[4] = { 2716 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 2717 }; 2718 static int const punpckl_insn[4] = { 2719 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 2720 }; 2721 static int const punpckh_insn[4] = { 2722 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 2723 }; 2724 static int const packss_insn[4] = { 2725 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 2726 }; 2727 static int const packus_insn[4] = { 2728 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 2729 }; 2730 static int const smin_insn[4] = { 2731 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 2732 }; 2733 static int const smax_insn[4] = { 2734 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 2735 }; 2736 static int const umin_insn[4] = { 2737 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 2738 }; 2739 static int const umax_insn[4] = { 2740 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 2741 }; 2742 static int const rotlv_insn[4] = { 2743 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 2744 }; 2745 static int const rotrv_insn[4] = { 2746 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 2747 }; 2748 static int const shlv_insn[4] = { 2749 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 2750 }; 2751 static int const shrv_insn[4] = { 2752 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 2753 }; 2754 static int const sarv_insn[4] = { 2755 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 2756 }; 2757 static int const shls_insn[4] = { 2758 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 2759 }; 2760 static int const shrs_insn[4] = { 2761 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 2762 }; 2763 static int const sars_insn[4] = { 2764 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 2765 }; 2766 static int const vpshldi_insn[4] = { 2767 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 2768 }; 2769 static int const vpshldv_insn[4] = { 2770 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 2771 }; 2772 static int const vpshrdv_insn[4] = { 2773 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 2774 }; 2775 static 
int const abs_insn[4] = { 2776 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 2777 }; 2778 2779 TCGType type = vecl + TCG_TYPE_V64; 2780 int insn, sub; 2781 TCGArg a0, a1, a2, a3; 2782 2783 a0 = args[0]; 2784 a1 = args[1]; 2785 a2 = args[2]; 2786 2787 switch (opc) { 2788 case INDEX_op_add_vec: 2789 insn = add_insn[vece]; 2790 goto gen_simd; 2791 case INDEX_op_ssadd_vec: 2792 insn = ssadd_insn[vece]; 2793 goto gen_simd; 2794 case INDEX_op_usadd_vec: 2795 insn = usadd_insn[vece]; 2796 goto gen_simd; 2797 case INDEX_op_sub_vec: 2798 insn = sub_insn[vece]; 2799 goto gen_simd; 2800 case INDEX_op_sssub_vec: 2801 insn = sssub_insn[vece]; 2802 goto gen_simd; 2803 case INDEX_op_ussub_vec: 2804 insn = ussub_insn[vece]; 2805 goto gen_simd; 2806 case INDEX_op_mul_vec: 2807 insn = mul_insn[vece]; 2808 goto gen_simd; 2809 case INDEX_op_and_vec: 2810 insn = OPC_PAND; 2811 goto gen_simd; 2812 case INDEX_op_or_vec: 2813 insn = OPC_POR; 2814 goto gen_simd; 2815 case INDEX_op_xor_vec: 2816 insn = OPC_PXOR; 2817 goto gen_simd; 2818 case INDEX_op_smin_vec: 2819 insn = smin_insn[vece]; 2820 goto gen_simd; 2821 case INDEX_op_umin_vec: 2822 insn = umin_insn[vece]; 2823 goto gen_simd; 2824 case INDEX_op_smax_vec: 2825 insn = smax_insn[vece]; 2826 goto gen_simd; 2827 case INDEX_op_umax_vec: 2828 insn = umax_insn[vece]; 2829 goto gen_simd; 2830 case INDEX_op_shlv_vec: 2831 insn = shlv_insn[vece]; 2832 goto gen_simd; 2833 case INDEX_op_shrv_vec: 2834 insn = shrv_insn[vece]; 2835 goto gen_simd; 2836 case INDEX_op_sarv_vec: 2837 insn = sarv_insn[vece]; 2838 goto gen_simd; 2839 case INDEX_op_rotlv_vec: 2840 insn = rotlv_insn[vece]; 2841 goto gen_simd; 2842 case INDEX_op_rotrv_vec: 2843 insn = rotrv_insn[vece]; 2844 goto gen_simd; 2845 case INDEX_op_shls_vec: 2846 insn = shls_insn[vece]; 2847 goto gen_simd; 2848 case INDEX_op_shrs_vec: 2849 insn = shrs_insn[vece]; 2850 goto gen_simd; 2851 case INDEX_op_sars_vec: 2852 insn = sars_insn[vece]; 2853 goto gen_simd; 2854 case INDEX_op_x86_punpckl_vec: 2855 insn = punpckl_insn[vece]; 2856 goto gen_simd; 2857 case INDEX_op_x86_punpckh_vec: 2858 insn = punpckh_insn[vece]; 2859 goto gen_simd; 2860 case INDEX_op_x86_packss_vec: 2861 insn = packss_insn[vece]; 2862 goto gen_simd; 2863 case INDEX_op_x86_packus_vec: 2864 insn = packus_insn[vece]; 2865 goto gen_simd; 2866 case INDEX_op_x86_vpshldv_vec: 2867 insn = vpshldv_insn[vece]; 2868 a1 = a2; 2869 a2 = args[3]; 2870 goto gen_simd; 2871 case INDEX_op_x86_vpshrdv_vec: 2872 insn = vpshrdv_insn[vece]; 2873 a1 = a2; 2874 a2 = args[3]; 2875 goto gen_simd; 2876#if TCG_TARGET_REG_BITS == 32 2877 case INDEX_op_dup2_vec: 2878 /* First merge the two 32-bit inputs to a single 64-bit element. */ 2879 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 2880 /* Then replicate the 64-bit elements across the rest of the vector. 
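 * A V64 destination already holds exactly one 64-bit element, so only
 * the larger vector types need the dup below.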
*/ 2881 if (type != TCG_TYPE_V64) { 2882 tcg_out_dup_vec(s, type, MO_64, a0, a0); 2883 } 2884 break; 2885#endif 2886 case INDEX_op_abs_vec: 2887 insn = abs_insn[vece]; 2888 a2 = a1; 2889 a1 = 0; 2890 goto gen_simd; 2891 gen_simd: 2892 tcg_debug_assert(insn != OPC_UD2); 2893 if (type == TCG_TYPE_V256) { 2894 insn |= P_VEXL; 2895 } 2896 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2897 break; 2898 2899 case INDEX_op_cmp_vec: 2900 sub = args[3]; 2901 if (sub == TCG_COND_EQ) { 2902 insn = cmpeq_insn[vece]; 2903 } else if (sub == TCG_COND_GT) { 2904 insn = cmpgt_insn[vece]; 2905 } else { 2906 g_assert_not_reached(); 2907 } 2908 goto gen_simd; 2909 2910 case INDEX_op_andc_vec: 2911 insn = OPC_PANDN; 2912 if (type == TCG_TYPE_V256) { 2913 insn |= P_VEXL; 2914 } 2915 tcg_out_vex_modrm(s, insn, a0, a2, a1); 2916 break; 2917 2918 case INDEX_op_shli_vec: 2919 insn = shift_imm_insn[vece]; 2920 sub = 6; 2921 goto gen_shift; 2922 case INDEX_op_shri_vec: 2923 insn = shift_imm_insn[vece]; 2924 sub = 2; 2925 goto gen_shift; 2926 case INDEX_op_sari_vec: 2927 if (vece == MO_64) { 2928 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 2929 } else { 2930 insn = shift_imm_insn[vece]; 2931 } 2932 sub = 4; 2933 goto gen_shift; 2934 case INDEX_op_rotli_vec: 2935 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 2936 if (vece == MO_64) { 2937 insn |= P_VEXW; 2938 } 2939 sub = 1; 2940 goto gen_shift; 2941 gen_shift: 2942 tcg_debug_assert(vece != MO_8); 2943 if (type == TCG_TYPE_V256) { 2944 insn |= P_VEXL; 2945 } 2946 tcg_out_vex_modrm(s, insn, sub, a0, a1); 2947 tcg_out8(s, a2); 2948 break; 2949 2950 case INDEX_op_ld_vec: 2951 tcg_out_ld(s, type, a0, a1, a2); 2952 break; 2953 case INDEX_op_st_vec: 2954 tcg_out_st(s, type, a0, a1, a2); 2955 break; 2956 case INDEX_op_dupm_vec: 2957 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 2958 break; 2959 2960 case INDEX_op_x86_shufps_vec: 2961 insn = OPC_SHUFPS; 2962 sub = args[3]; 2963 goto gen_simd_imm8; 2964 case INDEX_op_x86_blend_vec: 2965 if (vece == MO_16) { 2966 insn = OPC_PBLENDW; 2967 } else if (vece == MO_32) { 2968 insn = (have_avx2 ? 
OPC_VPBLENDD : OPC_BLENDPS); 2969 } else { 2970 g_assert_not_reached(); 2971 } 2972 sub = args[3]; 2973 goto gen_simd_imm8; 2974 case INDEX_op_x86_vperm2i128_vec: 2975 insn = OPC_VPERM2I128; 2976 sub = args[3]; 2977 goto gen_simd_imm8; 2978 case INDEX_op_x86_vpshldi_vec: 2979 insn = vpshldi_insn[vece]; 2980 sub = args[3]; 2981 goto gen_simd_imm8; 2982 2983 case INDEX_op_not_vec: 2984 insn = OPC_VPTERNLOGQ; 2985 a2 = a1; 2986 sub = 0x33; /* !B */ 2987 goto gen_simd_imm8; 2988 case INDEX_op_nor_vec: 2989 insn = OPC_VPTERNLOGQ; 2990 sub = 0x11; /* norCB */ 2991 goto gen_simd_imm8; 2992 case INDEX_op_nand_vec: 2993 insn = OPC_VPTERNLOGQ; 2994 sub = 0x77; /* nandCB */ 2995 goto gen_simd_imm8; 2996 case INDEX_op_eqv_vec: 2997 insn = OPC_VPTERNLOGQ; 2998 sub = 0x99; /* xnorCB */ 2999 goto gen_simd_imm8; 3000 case INDEX_op_orc_vec: 3001 insn = OPC_VPTERNLOGQ; 3002 sub = 0xdd; /* orB!C */ 3003 goto gen_simd_imm8; 3004 3005 case INDEX_op_bitsel_vec: 3006 insn = OPC_VPTERNLOGQ; 3007 a3 = args[3]; 3008 if (a0 == a1) { 3009 a1 = a2; 3010 a2 = a3; 3011 sub = 0xca; /* A?B:C */ 3012 } else if (a0 == a2) { 3013 a2 = a3; 3014 sub = 0xe2; /* B?A:C */ 3015 } else { 3016 tcg_out_mov(s, type, a0, a3); 3017 sub = 0xb8; /* B?C:A */ 3018 } 3019 goto gen_simd_imm8; 3020 3021 gen_simd_imm8: 3022 tcg_debug_assert(insn != OPC_UD2); 3023 if (type == TCG_TYPE_V256) { 3024 insn |= P_VEXL; 3025 } 3026 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3027 tcg_out8(s, sub); 3028 break; 3029 3030 case INDEX_op_x86_vpblendvb_vec: 3031 insn = OPC_VPBLENDVB; 3032 if (type == TCG_TYPE_V256) { 3033 insn |= P_VEXL; 3034 } 3035 tcg_out_vex_modrm(s, insn, a0, a1, a2); 3036 tcg_out8(s, args[3] << 4); 3037 break; 3038 3039 case INDEX_op_x86_psrldq_vec: 3040 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3041 tcg_out8(s, a2); 3042 break; 3043 3044 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 3045 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 3046 default: 3047 g_assert_not_reached(); 3048 } 3049} 3050 3051static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3052{ 3053 switch (op) { 3054 case INDEX_op_goto_ptr: 3055 return C_O0_I1(r); 3056 3057 case INDEX_op_ld8u_i32: 3058 case INDEX_op_ld8u_i64: 3059 case INDEX_op_ld8s_i32: 3060 case INDEX_op_ld8s_i64: 3061 case INDEX_op_ld16u_i32: 3062 case INDEX_op_ld16u_i64: 3063 case INDEX_op_ld16s_i32: 3064 case INDEX_op_ld16s_i64: 3065 case INDEX_op_ld_i32: 3066 case INDEX_op_ld32u_i64: 3067 case INDEX_op_ld32s_i64: 3068 case INDEX_op_ld_i64: 3069 return C_O1_I1(r, r); 3070 3071 case INDEX_op_st8_i32: 3072 case INDEX_op_st8_i64: 3073 return C_O0_I2(qi, r); 3074 3075 case INDEX_op_st16_i32: 3076 case INDEX_op_st16_i64: 3077 case INDEX_op_st_i32: 3078 case INDEX_op_st32_i64: 3079 return C_O0_I2(ri, r); 3080 3081 case INDEX_op_st_i64: 3082 return C_O0_I2(re, r); 3083 3084 case INDEX_op_add_i32: 3085 case INDEX_op_add_i64: 3086 return C_O1_I2(r, r, re); 3087 3088 case INDEX_op_sub_i32: 3089 case INDEX_op_sub_i64: 3090 case INDEX_op_mul_i32: 3091 case INDEX_op_mul_i64: 3092 case INDEX_op_or_i32: 3093 case INDEX_op_or_i64: 3094 case INDEX_op_xor_i32: 3095 case INDEX_op_xor_i64: 3096 return C_O1_I2(r, 0, re); 3097 3098 case INDEX_op_and_i32: 3099 case INDEX_op_and_i64: 3100 return C_O1_I2(r, 0, reZ); 3101 3102 case INDEX_op_andc_i32: 3103 case INDEX_op_andc_i64: 3104 return C_O1_I2(r, r, rI); 3105 3106 case INDEX_op_shl_i32: 3107 case INDEX_op_shl_i64: 3108 case INDEX_op_shr_i32: 3109 case INDEX_op_shr_i64: 3110 case INDEX_op_sar_i32: 3111 case INDEX_op_sar_i64: 3112 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3113 3114 case INDEX_op_rotl_i32: 3115 case INDEX_op_rotl_i64: 3116 case INDEX_op_rotr_i32: 3117 case INDEX_op_rotr_i64: 3118 return C_O1_I2(r, 0, ci); 3119 3120 case INDEX_op_brcond_i32: 3121 case INDEX_op_brcond_i64: 3122 return C_O0_I2(r, re); 3123 3124 case INDEX_op_bswap16_i32: 3125 case INDEX_op_bswap16_i64: 3126 case INDEX_op_bswap32_i32: 3127 case INDEX_op_bswap32_i64: 3128 case INDEX_op_bswap64_i64: 3129 case INDEX_op_neg_i32: 3130 case INDEX_op_neg_i64: 3131 case INDEX_op_not_i32: 3132 case INDEX_op_not_i64: 3133 case INDEX_op_extrh_i64_i32: 3134 return C_O1_I1(r, 0); 3135 3136 case INDEX_op_ext8s_i32: 3137 case INDEX_op_ext8s_i64: 3138 case INDEX_op_ext8u_i32: 3139 case INDEX_op_ext8u_i64: 3140 return C_O1_I1(r, q); 3141 3142 case INDEX_op_ext16s_i32: 3143 case INDEX_op_ext16s_i64: 3144 case INDEX_op_ext16u_i32: 3145 case INDEX_op_ext16u_i64: 3146 case INDEX_op_ext32s_i64: 3147 case INDEX_op_ext32u_i64: 3148 case INDEX_op_ext_i32_i64: 3149 case INDEX_op_extu_i32_i64: 3150 case INDEX_op_extrl_i64_i32: 3151 case INDEX_op_extract_i32: 3152 case INDEX_op_extract_i64: 3153 case INDEX_op_sextract_i32: 3154 case INDEX_op_ctpop_i32: 3155 case INDEX_op_ctpop_i64: 3156 return C_O1_I1(r, r); 3157 3158 case INDEX_op_extract2_i32: 3159 case INDEX_op_extract2_i64: 3160 return C_O1_I2(r, 0, r); 3161 3162 case INDEX_op_deposit_i32: 3163 case INDEX_op_deposit_i64: 3164 return C_O1_I2(Q, 0, Q); 3165 3166 case INDEX_op_setcond_i32: 3167 case INDEX_op_setcond_i64: 3168 return C_O1_I2(q, r, re); 3169 3170 case INDEX_op_movcond_i32: 3171 case INDEX_op_movcond_i64: 3172 return C_O1_I4(r, r, re, r, 0); 3173 3174 case INDEX_op_div2_i32: 3175 case INDEX_op_div2_i64: 3176 case INDEX_op_divu2_i32: 3177 case INDEX_op_divu2_i64: 3178 return C_O2_I3(a, d, 0, 1, r); 3179 3180 case INDEX_op_mulu2_i32: 3181 case INDEX_op_mulu2_i64: 3182 case INDEX_op_muls2_i32: 3183 case 
INDEX_op_muls2_i64: 3184 return C_O2_I2(a, d, a, r); 3185 3186 case INDEX_op_add2_i32: 3187 case INDEX_op_add2_i64: 3188 case INDEX_op_sub2_i32: 3189 case INDEX_op_sub2_i64: 3190 return C_O2_I4(r, r, 0, 1, re, re); 3191 3192 case INDEX_op_ctz_i32: 3193 case INDEX_op_ctz_i64: 3194 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3195 3196 case INDEX_op_clz_i32: 3197 case INDEX_op_clz_i64: 3198 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3199 3200 case INDEX_op_qemu_ld_a32_i32: 3201 return C_O1_I1(r, L); 3202 case INDEX_op_qemu_ld_a64_i32: 3203 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L); 3204 3205 case INDEX_op_qemu_st_a32_i32: 3206 return C_O0_I2(L, L); 3207 case INDEX_op_qemu_st_a64_i32: 3208 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3209 case INDEX_op_qemu_st8_a32_i32: 3210 return C_O0_I2(s, L); 3211 case INDEX_op_qemu_st8_a64_i32: 3212 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); 3213 3214 case INDEX_op_qemu_ld_a32_i64: 3215 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3216 case INDEX_op_qemu_ld_a64_i64: 3217 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); 3218 3219 case INDEX_op_qemu_st_a32_i64: 3220 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3221 case INDEX_op_qemu_st_a64_i64: 3222 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); 3223 3224 case INDEX_op_brcond2_i32: 3225 return C_O0_I4(r, r, ri, ri); 3226 3227 case INDEX_op_setcond2_i32: 3228 return C_O1_I4(r, r, r, ri, ri); 3229 3230 case INDEX_op_ld_vec: 3231 case INDEX_op_dupm_vec: 3232 return C_O1_I1(x, r); 3233 3234 case INDEX_op_st_vec: 3235 return C_O0_I2(x, r); 3236 3237 case INDEX_op_add_vec: 3238 case INDEX_op_sub_vec: 3239 case INDEX_op_mul_vec: 3240 case INDEX_op_and_vec: 3241 case INDEX_op_or_vec: 3242 case INDEX_op_xor_vec: 3243 case INDEX_op_andc_vec: 3244 case INDEX_op_orc_vec: 3245 case INDEX_op_nand_vec: 3246 case INDEX_op_nor_vec: 3247 case INDEX_op_eqv_vec: 3248 case INDEX_op_ssadd_vec: 3249 case INDEX_op_usadd_vec: 3250 case INDEX_op_sssub_vec: 3251 case INDEX_op_ussub_vec: 3252 case INDEX_op_smin_vec: 3253 case INDEX_op_umin_vec: 3254 case INDEX_op_smax_vec: 3255 case INDEX_op_umax_vec: 3256 case INDEX_op_shlv_vec: 3257 case INDEX_op_shrv_vec: 3258 case INDEX_op_sarv_vec: 3259 case INDEX_op_rotlv_vec: 3260 case INDEX_op_rotrv_vec: 3261 case INDEX_op_shls_vec: 3262 case INDEX_op_shrs_vec: 3263 case INDEX_op_sars_vec: 3264 case INDEX_op_cmp_vec: 3265 case INDEX_op_x86_shufps_vec: 3266 case INDEX_op_x86_blend_vec: 3267 case INDEX_op_x86_packss_vec: 3268 case INDEX_op_x86_packus_vec: 3269 case INDEX_op_x86_vperm2i128_vec: 3270 case INDEX_op_x86_punpckl_vec: 3271 case INDEX_op_x86_punpckh_vec: 3272 case INDEX_op_x86_vpshldi_vec: 3273#if TCG_TARGET_REG_BITS == 32 3274 case INDEX_op_dup2_vec: 3275#endif 3276 return C_O1_I2(x, x, x); 3277 3278 case INDEX_op_abs_vec: 3279 case INDEX_op_dup_vec: 3280 case INDEX_op_not_vec: 3281 case INDEX_op_shli_vec: 3282 case INDEX_op_shri_vec: 3283 case INDEX_op_sari_vec: 3284 case INDEX_op_rotli_vec: 3285 case INDEX_op_x86_psrldq_vec: 3286 return C_O1_I1(x, x); 3287 3288 case INDEX_op_x86_vpshldv_vec: 3289 case INDEX_op_x86_vpshrdv_vec: 3290 return C_O1_I3(x, 0, x, x); 3291 3292 case INDEX_op_bitsel_vec: 3293 case INDEX_op_x86_vpblendvb_vec: 3294 return C_O1_I3(x, x, x, x); 3295 3296 default: 3297 g_assert_not_reached(); 3298 } 3299} 3300 3301int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, 
unsigned vece) 3302{ 3303 switch (opc) { 3304 case INDEX_op_add_vec: 3305 case INDEX_op_sub_vec: 3306 case INDEX_op_and_vec: 3307 case INDEX_op_or_vec: 3308 case INDEX_op_xor_vec: 3309 case INDEX_op_andc_vec: 3310 case INDEX_op_orc_vec: 3311 case INDEX_op_nand_vec: 3312 case INDEX_op_nor_vec: 3313 case INDEX_op_eqv_vec: 3314 case INDEX_op_not_vec: 3315 case INDEX_op_bitsel_vec: 3316 return 1; 3317 case INDEX_op_cmp_vec: 3318 case INDEX_op_cmpsel_vec: 3319 return -1; 3320 3321 case INDEX_op_rotli_vec: 3322 return have_avx512vl && vece >= MO_32 ? 1 : -1; 3323 3324 case INDEX_op_shli_vec: 3325 case INDEX_op_shri_vec: 3326 /* We must expand the operation for MO_8. */ 3327 return vece == MO_8 ? -1 : 1; 3328 3329 case INDEX_op_sari_vec: 3330 switch (vece) { 3331 case MO_8: 3332 return -1; 3333 case MO_16: 3334 case MO_32: 3335 return 1; 3336 case MO_64: 3337 if (have_avx512vl) { 3338 return 1; 3339 } 3340 /* 3341 * We can emulate this for MO_64, but it does not pay off 3342 * unless we're producing at least 4 values. 3343 */ 3344 return type >= TCG_TYPE_V256 ? -1 : 0; 3345 } 3346 return 0; 3347 3348 case INDEX_op_shls_vec: 3349 case INDEX_op_shrs_vec: 3350 return vece >= MO_16; 3351 case INDEX_op_sars_vec: 3352 switch (vece) { 3353 case MO_16: 3354 case MO_32: 3355 return 1; 3356 case MO_64: 3357 return have_avx512vl; 3358 } 3359 return 0; 3360 case INDEX_op_rotls_vec: 3361 return vece >= MO_16 ? -1 : 0; 3362 3363 case INDEX_op_shlv_vec: 3364 case INDEX_op_shrv_vec: 3365 switch (vece) { 3366 case MO_16: 3367 return have_avx512bw; 3368 case MO_32: 3369 case MO_64: 3370 return have_avx2; 3371 } 3372 return 0; 3373 case INDEX_op_sarv_vec: 3374 switch (vece) { 3375 case MO_16: 3376 return have_avx512bw; 3377 case MO_32: 3378 return have_avx2; 3379 case MO_64: 3380 return have_avx512vl; 3381 } 3382 return 0; 3383 case INDEX_op_rotlv_vec: 3384 case INDEX_op_rotrv_vec: 3385 switch (vece) { 3386 case MO_16: 3387 return have_avx512vbmi2 ? -1 : 0; 3388 case MO_32: 3389 case MO_64: 3390 return have_avx512vl ? 1 : have_avx2 ? -1 : 0; 3391 } 3392 return 0; 3393 3394 case INDEX_op_mul_vec: 3395 switch (vece) { 3396 case MO_8: 3397 return -1; 3398 case MO_64: 3399 return have_avx512dq; 3400 } 3401 return 1; 3402 3403 case INDEX_op_ssadd_vec: 3404 case INDEX_op_usadd_vec: 3405 case INDEX_op_sssub_vec: 3406 case INDEX_op_ussub_vec: 3407 return vece <= MO_16; 3408 case INDEX_op_smin_vec: 3409 case INDEX_op_smax_vec: 3410 case INDEX_op_umin_vec: 3411 case INDEX_op_umax_vec: 3412 case INDEX_op_abs_vec: 3413 return vece <= MO_32 || have_avx512vl; 3414 3415 default: 3416 return 0; 3417 } 3418} 3419 3420static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc, 3421 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3422{ 3423 TCGv_vec t1, t2; 3424 3425 tcg_debug_assert(vece == MO_8); 3426 3427 t1 = tcg_temp_new_vec(type); 3428 t2 = tcg_temp_new_vec(type); 3429 3430 /* 3431 * Unpack to W, shift, and repack. Tricky bits: 3432 * (1) Use punpck*bw x,x to produce DDCCBBAA, 3433 * i.e. duplicate in other half of the 16-bit lane. 3434 * (2) For right-shift, add 8 so that the high half of the lane 3435 * becomes zero. For left-shift, and left-rotate, we must 3436 * shift up and down again. 3437 * (3) Step 2 leaves high half zero such that PACKUSWB 3438 * (pack with unsigned saturation) does not modify 3439 * the quantity. 
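 * Worked example for a right shift of byte value b by imm: after the
 * unpack each 16-bit lane holds (b << 8) | b, and shifting that right
 * by imm + 8 yields exactly b >> imm with a zero high byte.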
3440 */ 3441 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3442 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3443 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3444 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3445 3446 if (opc != INDEX_op_rotli_vec) { 3447 imm += 8; 3448 } 3449 if (opc == INDEX_op_shri_vec) { 3450 tcg_gen_shri_vec(MO_16, t1, t1, imm); 3451 tcg_gen_shri_vec(MO_16, t2, t2, imm); 3452 } else { 3453 tcg_gen_shli_vec(MO_16, t1, t1, imm); 3454 tcg_gen_shli_vec(MO_16, t2, t2, imm); 3455 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3456 tcg_gen_shri_vec(MO_16, t2, t2, 8); 3457 } 3458 3459 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3460 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3461 tcg_temp_free_vec(t1); 3462 tcg_temp_free_vec(t2); 3463} 3464 3465static void expand_vec_sari(TCGType type, unsigned vece, 3466 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3467{ 3468 TCGv_vec t1, t2; 3469 3470 switch (vece) { 3471 case MO_8: 3472 /* Unpack to W, shift, and repack, as in expand_vec_shi. */ 3473 t1 = tcg_temp_new_vec(type); 3474 t2 = tcg_temp_new_vec(type); 3475 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3476 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3477 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3478 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3479 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3480 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3481 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3482 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3483 tcg_temp_free_vec(t1); 3484 tcg_temp_free_vec(t2); 3485 break; 3486 3487 case MO_64: 3488 t1 = tcg_temp_new_vec(type); 3489 if (imm <= 32) { 3490 /* 3491 * We can emulate a small sign extend by performing an arithmetic 3492 * 32-bit shift and overwriting the high half of a 64-bit logical 3493 * shift. Note that the ISA says shift of 32 is valid, but TCG 3494 * does not, so we have to bound the smaller shift -- we get the 3495 * same result in the high half either way. 3496 */ 3497 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3498 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3499 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3500 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3501 tcgv_vec_arg(t1), 0xaa); 3502 } else { 3503 /* Otherwise we will need to use a compare vs 0 to produce 3504 * the sign-extend, shift and merge. 3505 */ 3506 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, 3507 tcg_constant_vec(type, MO_64, 0), v1); 3508 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3509 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3510 tcg_gen_or_vec(MO_64, v0, v0, t1); 3511 } 3512 tcg_temp_free_vec(t1); 3513 break; 3514 3515 default: 3516 g_assert_not_reached(); 3517 } 3518} 3519 3520static void expand_vec_rotli(TCGType type, unsigned vece, 3521 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3522{ 3523 TCGv_vec t; 3524 3525 if (vece == MO_8) { 3526 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm); 3527 return; 3528 } 3529 3530 if (have_avx512vbmi2) { 3531 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, 3532 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); 3533 return; 3534 } 3535 3536 t = tcg_temp_new_vec(type); 3537 tcg_gen_shli_vec(vece, t, v1, imm); 3538 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3539 tcg_gen_or_vec(vece, v0, v0, t); 3540 tcg_temp_free_vec(t); 3541} 3542 3543static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3544 TCGv_vec v1, TCGv_vec sh, bool right) 3545{ 3546 TCGv_vec t; 3547 3548 if (have_avx512vbmi2) { 3549 vec_gen_4(right ? 
INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, 3550 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), 3551 tcgv_vec_arg(v1), tcgv_vec_arg(sh)); 3552 return; 3553 } 3554 3555 t = tcg_temp_new_vec(type); 3556 tcg_gen_dupi_vec(vece, t, 8 << vece); 3557 tcg_gen_sub_vec(vece, t, t, sh); 3558 if (right) { 3559 tcg_gen_shlv_vec(vece, t, v1, t); 3560 tcg_gen_shrv_vec(vece, v0, v1, sh); 3561 } else { 3562 tcg_gen_shrv_vec(vece, t, v1, t); 3563 tcg_gen_shlv_vec(vece, v0, v1, sh); 3564 } 3565 tcg_gen_or_vec(vece, v0, v0, t); 3566 tcg_temp_free_vec(t); 3567} 3568 3569static void expand_vec_rotls(TCGType type, unsigned vece, 3570 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3571{ 3572 TCGv_vec t = tcg_temp_new_vec(type); 3573 3574 tcg_debug_assert(vece != MO_8); 3575 3576 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { 3577 tcg_gen_dup_i32_vec(vece, t, lsh); 3578 if (vece >= MO_32) { 3579 tcg_gen_rotlv_vec(vece, v0, v1, t); 3580 } else { 3581 expand_vec_rotv(type, vece, v0, v1, t, false); 3582 } 3583 } else { 3584 TCGv_i32 rsh = tcg_temp_new_i32(); 3585 3586 tcg_gen_neg_i32(rsh, lsh); 3587 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3588 tcg_gen_shls_vec(vece, t, v1, lsh); 3589 tcg_gen_shrs_vec(vece, v0, v1, rsh); 3590 tcg_gen_or_vec(vece, v0, v0, t); 3591 3592 tcg_temp_free_i32(rsh); 3593 } 3594 3595 tcg_temp_free_vec(t); 3596} 3597 3598static void expand_vec_mul(TCGType type, unsigned vece, 3599 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3600{ 3601 TCGv_vec t1, t2, t3, t4, zero; 3602 3603 tcg_debug_assert(vece == MO_8); 3604 3605 /* 3606 * Unpack v1 bytes to words, 0 | x. 3607 * Unpack v2 bytes to words, y | 0. 3608 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3609 * Shift logical right by 8 bits to clear the high 8 bytes before 3610 * using an unsigned saturated pack. 3611 * 3612 * The difference between the V64, V128 and V256 cases is merely how 3613 * we distribute the expansion between temporaries. 
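 * In each 16-bit lane the product is x * (y << 8), i.e. (x * y) << 8
 * truncated to 16 bits, so the low 8 bits of x * y end up in the high
 * byte; the shift right by 8 leaves them zero-extended for the pack.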
3614 */ 3615 switch (type) { 3616 case TCG_TYPE_V64: 3617 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 3618 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 3619 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3620 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3621 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3622 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3623 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3624 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3625 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3626 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 3627 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 3628 tcg_temp_free_vec(t1); 3629 tcg_temp_free_vec(t2); 3630 break; 3631 3632 case TCG_TYPE_V128: 3633 case TCG_TYPE_V256: 3634 t1 = tcg_temp_new_vec(type); 3635 t2 = tcg_temp_new_vec(type); 3636 t3 = tcg_temp_new_vec(type); 3637 t4 = tcg_temp_new_vec(type); 3638 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3639 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3640 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3641 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3642 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3643 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3644 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3645 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3646 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3647 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3648 tcg_gen_mul_vec(MO_16, t3, t3, t4); 3649 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3650 tcg_gen_shri_vec(MO_16, t3, t3, 8); 3651 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3652 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 3653 tcg_temp_free_vec(t1); 3654 tcg_temp_free_vec(t2); 3655 tcg_temp_free_vec(t3); 3656 tcg_temp_free_vec(t4); 3657 break; 3658 3659 default: 3660 g_assert_not_reached(); 3661 } 3662} 3663 3664static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, 3665 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3666{ 3667 enum { 3668 NEED_INV = 1, 3669 NEED_SWAP = 2, 3670 NEED_BIAS = 4, 3671 NEED_UMIN = 8, 3672 NEED_UMAX = 16, 3673 }; 3674 TCGv_vec t1, t2, t3; 3675 uint8_t fixup; 3676 3677 switch (cond) { 3678 case TCG_COND_EQ: 3679 case TCG_COND_GT: 3680 fixup = 0; 3681 break; 3682 case TCG_COND_NE: 3683 case TCG_COND_LE: 3684 fixup = NEED_INV; 3685 break; 3686 case TCG_COND_LT: 3687 fixup = NEED_SWAP; 3688 break; 3689 case TCG_COND_GE: 3690 fixup = NEED_SWAP | NEED_INV; 3691 break; 3692 case TCG_COND_LEU: 3693 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3694 fixup = NEED_UMIN; 3695 } else { 3696 fixup = NEED_BIAS | NEED_INV; 3697 } 3698 break; 3699 case TCG_COND_GTU: 3700 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { 3701 fixup = NEED_UMIN | NEED_INV; 3702 } else { 3703 fixup = NEED_BIAS; 3704 } 3705 break; 3706 case TCG_COND_GEU: 3707 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3708 fixup = NEED_UMAX; 3709 } else { 3710 fixup = NEED_BIAS | NEED_SWAP | NEED_INV; 3711 } 3712 break; 3713 case TCG_COND_LTU: 3714 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { 3715 fixup = NEED_UMAX | NEED_INV; 3716 } else { 3717 fixup = NEED_BIAS | NEED_SWAP; 3718 } 3719 break; 3720 default: 3721 g_assert_not_reached(); 3722 } 3723 3724 if (fixup & NEED_INV) { 3725 cond = tcg_invert_cond(cond); 3726 } 3727 if (fixup & NEED_SWAP) { 3728 t1 = v1, v1 = v2, v2 = t1; 3729 cond = tcg_swap_cond(cond); 3730 } 3731 3732 t1 = t2 = NULL; 3733 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3734 t1 = 
tcg_temp_new_vec(type); 3735 if (fixup & NEED_UMIN) { 3736 tcg_gen_umin_vec(vece, t1, v1, v2); 3737 } else { 3738 tcg_gen_umax_vec(vece, t1, v1, v2); 3739 } 3740 v2 = t1; 3741 cond = TCG_COND_EQ; 3742 } else if (fixup & NEED_BIAS) { 3743 t1 = tcg_temp_new_vec(type); 3744 t2 = tcg_temp_new_vec(type); 3745 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 3746 tcg_gen_sub_vec(vece, t1, v1, t3); 3747 tcg_gen_sub_vec(vece, t2, v2, t3); 3748 v1 = t1; 3749 v2 = t2; 3750 cond = tcg_signed_cond(cond); 3751 } 3752 3753 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); 3754 /* Expand directly; do not recurse. */ 3755 vec_gen_4(INDEX_op_cmp_vec, type, vece, 3756 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); 3757 3758 if (t1) { 3759 tcg_temp_free_vec(t1); 3760 if (t2) { 3761 tcg_temp_free_vec(t2); 3762 } 3763 } 3764 return fixup & NEED_INV; 3765} 3766 3767static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, 3768 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3769{ 3770 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) { 3771 tcg_gen_not_vec(vece, v0, v0); 3772 } 3773} 3774 3775static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0, 3776 TCGv_vec c1, TCGv_vec c2, 3777 TCGv_vec v3, TCGv_vec v4, TCGCond cond) 3778{ 3779 TCGv_vec t = tcg_temp_new_vec(type); 3780 3781 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) { 3782 /* Invert the sense of the compare by swapping arguments. */ 3783 TCGv_vec x; 3784 x = v3, v3 = v4, v4 = x; 3785 } 3786 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece, 3787 tcgv_vec_arg(v0), tcgv_vec_arg(v4), 3788 tcgv_vec_arg(v3), tcgv_vec_arg(t)); 3789 tcg_temp_free_vec(t); 3790} 3791 3792void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 3793 TCGArg a0, ...) 
3794{ 3795 va_list va; 3796 TCGArg a2; 3797 TCGv_vec v0, v1, v2, v3, v4; 3798 3799 va_start(va, a0); 3800 v0 = temp_tcgv_vec(arg_temp(a0)); 3801 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3802 a2 = va_arg(va, TCGArg); 3803 3804 switch (opc) { 3805 case INDEX_op_shli_vec: 3806 case INDEX_op_shri_vec: 3807 expand_vec_shi(type, vece, opc, v0, v1, a2); 3808 break; 3809 3810 case INDEX_op_sari_vec: 3811 expand_vec_sari(type, vece, v0, v1, a2); 3812 break; 3813 3814 case INDEX_op_rotli_vec: 3815 expand_vec_rotli(type, vece, v0, v1, a2); 3816 break; 3817 3818 case INDEX_op_rotls_vec: 3819 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 3820 break; 3821 3822 case INDEX_op_rotlv_vec: 3823 v2 = temp_tcgv_vec(arg_temp(a2)); 3824 expand_vec_rotv(type, vece, v0, v1, v2, false); 3825 break; 3826 case INDEX_op_rotrv_vec: 3827 v2 = temp_tcgv_vec(arg_temp(a2)); 3828 expand_vec_rotv(type, vece, v0, v1, v2, true); 3829 break; 3830 3831 case INDEX_op_mul_vec: 3832 v2 = temp_tcgv_vec(arg_temp(a2)); 3833 expand_vec_mul(type, vece, v0, v1, v2); 3834 break; 3835 3836 case INDEX_op_cmp_vec: 3837 v2 = temp_tcgv_vec(arg_temp(a2)); 3838 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); 3839 break; 3840 3841 case INDEX_op_cmpsel_vec: 3842 v2 = temp_tcgv_vec(arg_temp(a2)); 3843 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3844 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3845 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg)); 3846 break; 3847 3848 default: 3849 break; 3850 } 3851 3852 va_end(va); 3853} 3854 3855static const int tcg_target_callee_save_regs[] = { 3856#if TCG_TARGET_REG_BITS == 64 3857 TCG_REG_RBP, 3858 TCG_REG_RBX, 3859#if defined(_WIN64) 3860 TCG_REG_RDI, 3861 TCG_REG_RSI, 3862#endif 3863 TCG_REG_R12, 3864 TCG_REG_R13, 3865 TCG_REG_R14, /* Currently used for the global env. */ 3866 TCG_REG_R15, 3867#else 3868 TCG_REG_EBP, /* Currently used for the global env. */ 3869 TCG_REG_EBX, 3870 TCG_REG_ESI, 3871 TCG_REG_EDI, 3872#endif 3873}; 3874 3875/* Compute frame size via macros, to share between tcg_target_qemu_prologue 3876 and tcg_register_jit. */ 3877 3878#define PUSH_SIZE \ 3879 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 3880 * (TCG_TARGET_REG_BITS / 8)) 3881 3882#define FRAME_SIZE \ 3883 ((PUSH_SIZE \ 3884 + TCG_STATIC_CALL_ARGS_SIZE \ 3885 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 3886 + TCG_TARGET_STACK_ALIGN - 1) \ 3887 & ~(TCG_TARGET_STACK_ALIGN - 1)) 3888 3889/* Generate global QEMU prologue and epilogue code */ 3890static void tcg_target_qemu_prologue(TCGContext *s) 3891{ 3892 int i, stack_addend; 3893 3894 /* TB prologue */ 3895 3896 /* Reserve some stack space, also for TCG temps. */ 3897 stack_addend = FRAME_SIZE - PUSH_SIZE; 3898 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 3899 CPU_TEMP_BUF_NLONGS * sizeof(long)); 3900 3901 /* Save all callee saved registers. */ 3902 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 3903 tcg_out_push(s, tcg_target_callee_save_regs[i]); 3904 } 3905 3906#if TCG_TARGET_REG_BITS == 32 3907 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 3908 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 3909 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3910 /* jmp *tb. 
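 * The TB pointer is the second stack argument to the prologue
 * (tcg_qemu_tb_exec); its offset accounts for the callee-saved pushes,
 * the return address, and the stack_addend just subtracted from %esp.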

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers. */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb. */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU)
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }
# endif
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb. */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
    unsigned a, b, c, d, b7 = 0, c7 = 0;
    unsigned max = __get_cpuid_max(0, 0);

    if (max >= 7) {
        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
        __cpuid_count(7, 0, a, b7, c7, d);
        have_bmi1 = (b7 & bit_BMI) != 0;
        have_bmi2 = (b7 & bit_BMI2) != 0;
    }

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch. */
        have_cmov = (d & bit_CMOV) != 0;
#endif

        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it. */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;

        /* There are a number of things we must check before we can be
           sure of not hitting invalid opcode. */
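        /*
         * Illustrative detail on the checks that follow, assuming the
         * architectural XCR0 layout: bit 1 covers SSE (XMM) state and
         * bit 2 covers AVX (YMM) state, so (bv & 6) == 6 means the OS
         * saves and restores both.  Bits 5-7 cover the AVX-512 opmask,
         * ZMM_Hi256 and Hi16_ZMM state, hence the (bv & 0xe0) == 0xe0
         * test before any EVEX-encoded instruction is considered.
         */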
        if (c & bit_OSXSAVE) {
            unsigned bv = xgetbv_low(0);

            if ((bv & 6) == 6) {
                have_avx1 = (c & bit_AVX) != 0;
                have_avx2 = (b7 & bit_AVX2) != 0;

                /*
                 * There are interesting instructions in AVX512, so long
                 * as we have AVX512VL, which indicates support for EVEX
                 * on sizes smaller than 512 bits.  We are required to
                 * check that OPMASK and all extended ZMM state are enabled
                 * even if we're not using them -- the insns will fault.
                 */
                if ((bv & 0xe0) == 0xe0
                    && (b7 & bit_AVX512F)
                    && (b7 & bit_AVX512VL)) {
                    have_avx512vl = true;
                    have_avx512bw = (b7 & bit_AVX512BW) != 0;
                    have_avx512dq = (b7 & bit_AVX512DQ) != 0;
                    have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
                }

                /*
                 * The Intel SDM has added:
                 *   Processors that enumerate support for Intel® AVX
                 *   (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
                 *   guarantee that the 16-byte memory operations performed
                 *   by the following instructions will always be carried
                 *   out atomically:
                 *   - MOVAPD, MOVAPS, and MOVDQA.
                 *   - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
                 *   - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
                 *     with EVEX.128 and k0 (masking disabled).
                 *   Note that these instructions require the linear addresses
                 *   of their memory operands to be 16-byte aligned.
                 *
                 * AMD has provided an even stronger guarantee that processors
                 * with AVX provide 16-byte atomicity for all cacheable,
                 * naturally aligned single loads and stores, e.g. MOVDQU.
                 *
                 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
                 */
                if (have_avx1) {
                    __cpuid(0, a, b, c, d);
                    have_atomic16 = (c == signature_INTEL_ecx ||
                                     c == signature_AMD_ecx);
                }
            }
        }
    }

    max = __get_cpuid_max(0x80000000, 0);
    if (max >= 0x80000001) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
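
/*
 * Illustrative example (the value depends on the build): .fde_def_cfa
 * below splits FRAME_SIZE into a two-byte uleb128, low seven bits first
 * with the continuation bit set.  A FRAME_SIZE of 1216 (0x4c0), for
 * instance, is emitted as the byte pair 0xc0, 0x09; the build assertion
 * above guarantees that two bytes are always enough.
 */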

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member. */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif