/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386.
*/ 127#if TCG_TARGET_REG_BITS == 64 128# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 129# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 130#else 131# define TCG_REG_L0 TCG_REG_EAX 132# define TCG_REG_L1 TCG_REG_EDX 133#endif 134 135/* The host compiler should supply <cpuid.h> to enable runtime features 136 detection, as we're not going to go so far as our own inline assembly. 137 If not available, default values will be assumed. */ 138#if defined(CONFIG_CPUID_H) 139#include "qemu/cpuid.h" 140#endif 141 142/* For 64-bit, we always know that CMOV is available. */ 143#if TCG_TARGET_REG_BITS == 64 144# define have_cmov 1 145#elif defined(CONFIG_CPUID_H) 146static bool have_cmov; 147#else 148# define have_cmov 0 149#endif 150 151/* We need these symbols in tcg-target.h, and we can't properly conditionalize 152 it there. Therefore we always define the variable. */ 153bool have_bmi1; 154bool have_popcnt; 155bool have_avx1; 156bool have_avx2; 157bool have_movbe; 158 159#ifdef CONFIG_CPUID_H 160static bool have_bmi2; 161static bool have_lzcnt; 162#else 163# define have_bmi2 0 164# define have_lzcnt 0 165#endif 166 167static const tcg_insn_unit *tb_ret_addr; 168 169static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 170 intptr_t value, intptr_t addend) 171{ 172 value += addend; 173 switch(type) { 174 case R_386_PC32: 175 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 176 if (value != (int32_t)value) { 177 return false; 178 } 179 /* FALLTHRU */ 180 case R_386_32: 181 tcg_patch32(code_ptr, value); 182 break; 183 case R_386_PC8: 184 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 185 if (value != (int8_t)value) { 186 return false; 187 } 188 tcg_patch8(code_ptr, value); 189 break; 190 default: 191 tcg_abort(); 192 } 193 return true; 194} 195 196#if TCG_TARGET_REG_BITS == 64 197#define ALL_GENERAL_REGS 0x0000ffffu 198#define ALL_VECTOR_REGS 0xffff0000u 199#else 200#define ALL_GENERAL_REGS 0x000000ffu 201#define ALL_VECTOR_REGS 0x00ff0000u 202#endif 203 204/* parse target specific constraints */ 205static const char *target_parse_constraint(TCGArgConstraint *ct, 206 const char *ct_str, TCGType type) 207{ 208 switch(*ct_str++) { 209 case 'a': 210 tcg_regset_set_reg(ct->regs, TCG_REG_EAX); 211 break; 212 case 'b': 213 tcg_regset_set_reg(ct->regs, TCG_REG_EBX); 214 break; 215 case 'c': 216 tcg_regset_set_reg(ct->regs, TCG_REG_ECX); 217 break; 218 case 'd': 219 tcg_regset_set_reg(ct->regs, TCG_REG_EDX); 220 break; 221 case 'S': 222 tcg_regset_set_reg(ct->regs, TCG_REG_ESI); 223 break; 224 case 'D': 225 tcg_regset_set_reg(ct->regs, TCG_REG_EDI); 226 break; 227 case 'q': 228 /* A register that can be used as a byte operand. */ 229 ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf; 230 break; 231 case 'Q': 232 /* A register with an addressable second byte (e.g. %ah). */ 233 ct->regs = 0xf; 234 break; 235 case 'r': 236 /* A general register. */ 237 ct->regs |= ALL_GENERAL_REGS; 238 break; 239 case 'W': 240 /* With TZCNT/LZCNT, we can have operand-size as an input. */ 241 ct->ct |= TCG_CT_CONST_WSZ; 242 break; 243 case 'x': 244 /* A vector register. */ 245 ct->regs |= ALL_VECTOR_REGS; 246 break; 247 248 case 'L': 249 /* qemu_ld/st data+address constraint */ 250 ct->regs = TCG_TARGET_REG_BITS == 64 ? 
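/*
 * Illustrative aside, not part of the original source: the constraint
 * letters above all reduce to plain bitmaps indexed by register number.
 * Assuming the usual register encoding order for this backend
 * (EAX=0 ... EDI=7, R8=8 ... R15=15, XMM0=16 ...), the masks mean:
 *
 *   'q'  byte-addressable GP reg : 0x000f on i386 (only %al..%bl exist),
 *                                  0xffff on x86_64 (REX reaches %sil etc.)
 *   'r'  any GP reg              : ALL_GENERAL_REGS
 *   'x'  any vector reg          : ALL_VECTOR_REGS
 *
 * For example, checking whether %esi satisfies 'q' on i386 is just
 *   ((0xf >> TCG_REG_ESI) & 1) == 0   -- %esi has no low-byte form there.
 */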
0xffff : 0xff; 251#ifdef CONFIG_SOFTMMU 252 tcg_regset_reset_reg(ct->regs, TCG_REG_L0); 253 tcg_regset_reset_reg(ct->regs, TCG_REG_L1); 254#endif 255 break; 256 case 's': 257 /* qemu_st8_i32 data constraint */ 258 ct->regs = 0xf; 259#ifdef CONFIG_SOFTMMU 260 tcg_regset_reset_reg(ct->regs, TCG_REG_L0); 261 tcg_regset_reset_reg(ct->regs, TCG_REG_L1); 262#endif 263 break; 264 265 case 'e': 266 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32); 267 break; 268 case 'Z': 269 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32); 270 break; 271 case 'I': 272 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32); 273 break; 274 275 default: 276 return NULL; 277 } 278 return ct_str; 279} 280 281/* test if a constant matches the constraint */ 282static inline int tcg_target_const_match(tcg_target_long val, TCGType type, 283 const TCGArgConstraint *arg_ct) 284{ 285 int ct = arg_ct->ct; 286 if (ct & TCG_CT_CONST) { 287 return 1; 288 } 289 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 290 return 1; 291 } 292 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 293 return 1; 294 } 295 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 296 return 1; 297 } 298 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) { 299 return 1; 300 } 301 return 0; 302} 303 304# define LOWREGMASK(x) ((x) & 7) 305 306#define P_EXT 0x100 /* 0x0f opcode prefix */ 307#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 308#define P_DATA16 0x400 /* 0x66 opcode prefix */ 309#if TCG_TARGET_REG_BITS == 64 310# define P_REXW 0x1000 /* Set REX.W = 1 */ 311# define P_REXB_R 0x2000 /* REG field as byte register */ 312# define P_REXB_RM 0x4000 /* R/M field as byte register */ 313# define P_GS 0x8000 /* gs segment override */ 314#else 315# define P_REXW 0 316# define P_REXB_R 0 317# define P_REXB_RM 0 318# define P_GS 0 319#endif 320#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 321#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 322#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 323#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 324 325#define OPC_ARITH_EvIz (0x81) 326#define OPC_ARITH_EvIb (0x83) 327#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 328#define OPC_ANDN (0xf2 | P_EXT38) 329#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 330#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 331#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 332#define OPC_BSF (0xbc | P_EXT) 333#define OPC_BSR (0xbd | P_EXT) 334#define OPC_BSWAP (0xc8 | P_EXT) 335#define OPC_CALL_Jz (0xe8) 336#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 337#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 338#define OPC_DEC_r32 (0x48) 339#define OPC_IMUL_GvEv (0xaf | P_EXT) 340#define OPC_IMUL_GvEvIb (0x6b) 341#define OPC_IMUL_GvEvIz (0x69) 342#define OPC_INC_r32 (0x40) 343#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 344#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 345#define OPC_JMP_long (0xe9) 346#define OPC_JMP_short (0xeb) 347#define OPC_LEA (0x8d) 348#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 349#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 350#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 351#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 352#define OPC_MOVB_EvIz (0xc6) 353#define OPC_MOVL_EvIz (0xc7) 354#define OPC_MOVL_Iv (0xb8) 355#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 356#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 357#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 358#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 359#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 360#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 361#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 362#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 363#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 364#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 365#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 366#define OPC_MOVSBL (0xbe | P_EXT) 367#define OPC_MOVSWL (0xbf | P_EXT) 368#define OPC_MOVSLQ (0x63 | P_REXW) 369#define OPC_MOVZBL (0xb6 | P_EXT) 370#define OPC_MOVZWL (0xb7 | P_EXT) 371#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 372#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 373#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 374#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 375#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 376#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 377#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 378#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 379#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 380#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 381#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 382#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 383#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 384#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 385#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 386#define OPC_PAND (0xdb | P_EXT | P_DATA16) 387#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 388#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 389#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 390#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 391#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 392#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 393#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 394#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 395#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 396#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 397#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 398#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 399#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 400#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 401#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 402#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 403#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 404#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 405#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 406#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 407#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 408#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 409#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 410#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 411#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 412#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 413#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 414#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 415#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 416#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 417#define OPC_POR (0xeb | P_EXT | P_DATA16) 418#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 419#define OPC_PSHUFD 
(0x70 | P_EXT | P_DATA16) 420#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 421#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 422#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 423#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */ 424#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 425#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 426#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 427#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 428#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 429#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 430#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 431#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 432#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 433#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 434#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 435#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 436#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 437#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 438#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 439#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 440#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 441#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 442#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 443#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 444#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 445#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 446#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 447#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 448#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 449#define OPC_PXOR (0xef | P_EXT | P_DATA16) 450#define OPC_POP_r32 (0x58) 451#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 452#define OPC_PUSH_r32 (0x50) 453#define OPC_PUSH_Iv (0x68) 454#define OPC_PUSH_Ib (0x6a) 455#define OPC_RET (0xc3) 456#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */ 457#define OPC_SHIFT_1 (0xd1) 458#define OPC_SHIFT_Ib (0xc1) 459#define OPC_SHIFT_cl (0xd3) 460#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 461#define OPC_SHUFPS (0xc6 | P_EXT) 462#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 463#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 464#define OPC_SHRD_Ib (0xac | P_EXT) 465#define OPC_TESTL (0x85) 466#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 467#define OPC_UD2 (0x0b | P_EXT) 468#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 469#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 470#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 471#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 472#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 473#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 474#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 475#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 476#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 477#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 478#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW) 479#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 480#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 481#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_REXW) 482#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 483#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 484#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_REXW) 485#define OPC_VZEROUPPER (0x77 | P_EXT) 486#define OPC_XCHG_ax_r32 (0x90) 487 488#define OPC_GRP3_Ev (0xf7) 489#define OPC_GRP5 (0xff) 490#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 491 492/* Group 1 opcode extensions for 0x80-0x83. 493 These are also used as modifiers for OPC_ARITH. 
*/ 494#define ARITH_ADD 0 495#define ARITH_OR 1 496#define ARITH_ADC 2 497#define ARITH_SBB 3 498#define ARITH_AND 4 499#define ARITH_SUB 5 500#define ARITH_XOR 6 501#define ARITH_CMP 7 502 503/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 504#define SHIFT_ROL 0 505#define SHIFT_ROR 1 506#define SHIFT_SHL 4 507#define SHIFT_SHR 5 508#define SHIFT_SAR 7 509 510/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ 511#define EXT3_NOT 2 512#define EXT3_NEG 3 513#define EXT3_MUL 4 514#define EXT3_IMUL 5 515#define EXT3_DIV 6 516#define EXT3_IDIV 7 517 518/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 519#define EXT5_INC_Ev 0 520#define EXT5_DEC_Ev 1 521#define EXT5_CALLN_Ev 2 522#define EXT5_JMPN_Ev 4 523 524/* Condition codes to be added to OPC_JCC_{long,short}. */ 525#define JCC_JMP (-1) 526#define JCC_JO 0x0 527#define JCC_JNO 0x1 528#define JCC_JB 0x2 529#define JCC_JAE 0x3 530#define JCC_JE 0x4 531#define JCC_JNE 0x5 532#define JCC_JBE 0x6 533#define JCC_JA 0x7 534#define JCC_JS 0x8 535#define JCC_JNS 0x9 536#define JCC_JP 0xa 537#define JCC_JNP 0xb 538#define JCC_JL 0xc 539#define JCC_JGE 0xd 540#define JCC_JLE 0xe 541#define JCC_JG 0xf 542 543static const uint8_t tcg_cond_to_jcc[] = { 544 [TCG_COND_EQ] = JCC_JE, 545 [TCG_COND_NE] = JCC_JNE, 546 [TCG_COND_LT] = JCC_JL, 547 [TCG_COND_GE] = JCC_JGE, 548 [TCG_COND_LE] = JCC_JLE, 549 [TCG_COND_GT] = JCC_JG, 550 [TCG_COND_LTU] = JCC_JB, 551 [TCG_COND_GEU] = JCC_JAE, 552 [TCG_COND_LEU] = JCC_JBE, 553 [TCG_COND_GTU] = JCC_JA, 554}; 555 556#if TCG_TARGET_REG_BITS == 64 557static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 558{ 559 int rex; 560 561 if (opc & P_GS) { 562 tcg_out8(s, 0x65); 563 } 564 if (opc & P_DATA16) { 565 /* We should never be asking for both 16 and 64-bit operation. */ 566 tcg_debug_assert((opc & P_REXW) == 0); 567 tcg_out8(s, 0x66); 568 } 569 if (opc & P_SIMDF3) { 570 tcg_out8(s, 0xf3); 571 } else if (opc & P_SIMDF2) { 572 tcg_out8(s, 0xf2); 573 } 574 575 rex = 0; 576 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 577 rex |= (r & 8) >> 1; /* REX.R */ 578 rex |= (x & 8) >> 2; /* REX.X */ 579 rex |= (rm & 8) >> 3; /* REX.B */ 580 581 /* P_REXB_{R,RM} indicates that the given register is the low byte. 582 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 583 as otherwise the encoding indicates %[abcd]h. Note that the values 584 that are ORed in merely indicate that the REX byte must be present; 585 those bits get discarded in output. */ 586 rex |= opc & (r >= 4 ? P_REXB_R : 0); 587 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 588 589 if (rex) { 590 tcg_out8(s, (uint8_t)(rex | 0x40)); 591 } 592 593 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 594 tcg_out8(s, 0x0f); 595 if (opc & P_EXT38) { 596 tcg_out8(s, 0x38); 597 } else if (opc & P_EXT3A) { 598 tcg_out8(s, 0x3a); 599 } 600 } 601 602 tcg_out8(s, opc); 603} 604#else 605static void tcg_out_opc(TCGContext *s, int opc) 606{ 607 if (opc & P_DATA16) { 608 tcg_out8(s, 0x66); 609 } 610 if (opc & P_SIMDF3) { 611 tcg_out8(s, 0xf3); 612 } else if (opc & P_SIMDF2) { 613 tcg_out8(s, 0xf2); 614 } 615 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 616 tcg_out8(s, 0x0f); 617 if (opc & P_EXT38) { 618 tcg_out8(s, 0x38); 619 } else if (opc & P_EXT3A) { 620 tcg_out8(s, 0x3a); 621 } 622 } 623 tcg_out8(s, opc); 624} 625/* Discard the register arguments to tcg_out_opc early, so as not to penalize 626 the 32-bit compilation paths. 
This method works with all versions of gcc, 627 whereas relying on optimization may not be able to exclude them. */ 628#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 629#endif 630 631static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 632{ 633 tcg_out_opc(s, opc, r, rm, 0); 634 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 635} 636 637static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 638 int rm, int index) 639{ 640 int tmp; 641 642 /* Use the two byte form if possible, which cannot encode 643 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 644 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT 645 && ((rm | index) & 8) == 0) { 646 /* Two byte VEX prefix. */ 647 tcg_out8(s, 0xc5); 648 649 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 650 } else { 651 /* Three byte VEX prefix. */ 652 tcg_out8(s, 0xc4); 653 654 /* VEX.m-mmmm */ 655 if (opc & P_EXT3A) { 656 tmp = 3; 657 } else if (opc & P_EXT38) { 658 tmp = 2; 659 } else if (opc & P_EXT) { 660 tmp = 1; 661 } else { 662 g_assert_not_reached(); 663 } 664 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 665 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 666 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 667 tcg_out8(s, tmp); 668 669 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ 670 } 671 672 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 673 /* VEX.pp */ 674 if (opc & P_DATA16) { 675 tmp |= 1; /* 0x66 */ 676 } else if (opc & P_SIMDF3) { 677 tmp |= 2; /* 0xf3 */ 678 } else if (opc & P_SIMDF2) { 679 tmp |= 3; /* 0xf2 */ 680 } 681 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 682 tcg_out8(s, tmp); 683 tcg_out8(s, opc); 684} 685 686static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 687{ 688 tcg_out_vex_opc(s, opc, r, v, rm, 0); 689 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 690} 691 692/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 693 We handle either RM and INDEX missing with a negative value. In 64-bit 694 mode for absolute addresses, ~RM is the size of the immediate operand 695 that will follow the instruction. */ 696 697static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 698 int shift, intptr_t offset) 699{ 700 int mod, len; 701 702 if (index < 0 && rm < 0) { 703 if (TCG_TARGET_REG_BITS == 64) { 704 /* Try for a rip-relative addressing mode. This has replaced 705 the 32-bit-mode absolute addressing encoding. */ 706 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 707 intptr_t disp = offset - pc; 708 if (disp == (int32_t)disp) { 709 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 710 tcg_out32(s, disp); 711 return; 712 } 713 714 /* Try for an absolute address encoding. This requires the 715 use of the MODRM+SIB encoding and is therefore larger than 716 rip-relative addressing. */ 717 if (offset == (int32_t)offset) { 718 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 719 tcg_out8(s, (4 << 3) | 5); 720 tcg_out32(s, offset); 721 return; 722 } 723 724 /* ??? The memory isn't directly addressable. */ 725 g_assert_not_reached(); 726 } else { 727 /* Absolute address. */ 728 tcg_out8(s, (r << 3) | 5); 729 tcg_out32(s, offset); 730 return; 731 } 732 } 733 734 /* Find the length of the immediate addend. Note that the encoding 735 that would be used for (%ebp) indicates absolute addressing. 
*/ 736 if (rm < 0) { 737 mod = 0, len = 4, rm = 5; 738 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 739 mod = 0, len = 0; 740 } else if (offset == (int8_t)offset) { 741 mod = 0x40, len = 1; 742 } else { 743 mod = 0x80, len = 4; 744 } 745 746 /* Use a single byte MODRM format if possible. Note that the encoding 747 that would be used for %esp is the escape to the two byte form. */ 748 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 749 /* Single byte MODRM format. */ 750 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 751 } else { 752 /* Two byte MODRM+SIB format. */ 753 754 /* Note that the encoding that would place %esp into the index 755 field indicates no index register. In 64-bit mode, the REX.X 756 bit counts, so %r12 can be used as the index. */ 757 if (index < 0) { 758 index = 4; 759 } else { 760 tcg_debug_assert(index != TCG_REG_ESP); 761 } 762 763 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 764 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 765 } 766 767 if (len == 1) { 768 tcg_out8(s, offset); 769 } else if (len == 4) { 770 tcg_out32(s, offset); 771 } 772} 773 774static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 775 int index, int shift, intptr_t offset) 776{ 777 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 778 tcg_out_sib_offset(s, r, rm, index, shift, offset); 779} 780 781static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 782 int rm, int index, int shift, 783 intptr_t offset) 784{ 785 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 786 tcg_out_sib_offset(s, r, rm, index, shift, offset); 787} 788 789/* A simplification of the above with no index or shift. */ 790static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 791 int rm, intptr_t offset) 792{ 793 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 794} 795 796static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 797 int v, int rm, intptr_t offset) 798{ 799 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 800} 801 802/* Output an opcode with an expected reference to the constant pool. */ 803static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 804{ 805 tcg_out_opc(s, opc, r, 0, 0); 806 /* Absolute for 32-bit, pc-relative for 64-bit. */ 807 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 808 tcg_out32(s, 0); 809} 810 811/* Output an opcode with an expected reference to the constant pool. */ 812static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 813{ 814 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 815 /* Absolute for 32-bit, pc-relative for 64-bit. */ 816 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 817 tcg_out32(s, 0); 818} 819 820/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 821static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 822{ 823 /* Propagate an opcode prefix, such as P_REXW. 
*/ 824 int ext = subop & ~0x7; 825 subop &= 0x7; 826 827 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 828} 829 830static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 831{ 832 int rexw = 0; 833 834 if (arg == ret) { 835 return true; 836 } 837 switch (type) { 838 case TCG_TYPE_I64: 839 rexw = P_REXW; 840 /* fallthru */ 841 case TCG_TYPE_I32: 842 if (ret < 16) { 843 if (arg < 16) { 844 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 845 } else { 846 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 847 } 848 } else { 849 if (arg < 16) { 850 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 851 } else { 852 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 853 } 854 } 855 break; 856 857 case TCG_TYPE_V64: 858 tcg_debug_assert(ret >= 16 && arg >= 16); 859 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 860 break; 861 case TCG_TYPE_V128: 862 tcg_debug_assert(ret >= 16 && arg >= 16); 863 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 864 break; 865 case TCG_TYPE_V256: 866 tcg_debug_assert(ret >= 16 && arg >= 16); 867 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 868 break; 869 870 default: 871 g_assert_not_reached(); 872 } 873 return true; 874} 875 876static const int avx2_dup_insn[4] = { 877 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 878 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 879}; 880 881static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 882 TCGReg r, TCGReg a) 883{ 884 if (have_avx2) { 885 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 886 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a); 887 } else { 888 switch (vece) { 889 case MO_8: 890 /* ??? With zero in a register, use PSHUFB. */ 891 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 892 a = r; 893 /* FALLTHRU */ 894 case MO_16: 895 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 896 a = r; 897 /* FALLTHRU */ 898 case MO_32: 899 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 900 /* imm8 operand: all output lanes selected from input lane 0. */ 901 tcg_out8(s, 0); 902 break; 903 case MO_64: 904 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 905 break; 906 default: 907 g_assert_not_reached(); 908 } 909 } 910 return true; 911} 912 913static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 914 TCGReg r, TCGReg base, intptr_t offset) 915{ 916 if (have_avx2) { 917 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 918 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 919 r, 0, base, offset); 920 } else { 921 switch (vece) { 922 case MO_64: 923 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 924 break; 925 case MO_32: 926 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 927 break; 928 case MO_16: 929 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 930 tcg_out8(s, 0); /* imm8 */ 931 tcg_out_dup_vec(s, type, vece, r, r); 932 break; 933 case MO_8: 934 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 935 tcg_out8(s, 0); /* imm8 */ 936 tcg_out_dup_vec(s, type, vece, r, r); 937 break; 938 default: 939 g_assert_not_reached(); 940 } 941 } 942 return true; 943} 944 945static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 946 TCGReg ret, int64_t arg) 947{ 948 int vex_l = (type == TCG_TYPE_V256 ? 
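/*
 * Illustrative aside, not part of the original source: without AVX2 the
 * MO_8 case of tcg_out_dup_vec above deliberately falls through three
 * steps to splat byte 0 of the source register:
 *
 *   punpcklbw x,x   ->  a3 a3 a2 a2 a1 a1 a0 a0   (low bytes doubled)
 *   punpcklwd x,x   ->  a1 a1 a1 a1 a0 a0 a0 a0   (low words doubled)
 *   pshufd $0, x    ->  a0 in every byte           (dword 0 splatted)
 *
 * MO_16 skips the first step and MO_32 needs only the final pshufd.
 */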
P_VEXL : 0); 949 950 if (arg == 0) { 951 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 952 return; 953 } 954 if (arg == -1) { 955 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 956 return; 957 } 958 959 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 960 if (have_avx2) { 961 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 962 } else { 963 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 964 } 965 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 966 } else { 967 if (type == TCG_TYPE_V64) { 968 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 969 } else if (have_avx2) { 970 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 971 } else { 972 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 973 } 974 if (TCG_TARGET_REG_BITS == 64) { 975 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 976 } else { 977 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 978 } 979 } 980} 981 982static void tcg_out_movi_vec(TCGContext *s, TCGType type, 983 TCGReg ret, tcg_target_long arg) 984{ 985 if (arg == 0) { 986 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 987 return; 988 } 989 if (arg == -1) { 990 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 991 return; 992 } 993 994 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 995 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 996 if (TCG_TARGET_REG_BITS == 64) { 997 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 998 } else { 999 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1000 } 1001} 1002 1003static void tcg_out_movi_int(TCGContext *s, TCGType type, 1004 TCGReg ret, tcg_target_long arg) 1005{ 1006 tcg_target_long diff; 1007 1008 if (arg == 0) { 1009 tgen_arithr(s, ARITH_XOR, ret, ret); 1010 return; 1011 } 1012 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1013 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1014 tcg_out32(s, arg); 1015 return; 1016 } 1017 if (arg == (int32_t)arg) { 1018 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1019 tcg_out32(s, arg); 1020 return; 1021 } 1022 1023 /* Try a 7 byte pc-relative lea before the 10 byte movq. */ 1024 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1025 if (diff == (int32_t)diff) { 1026 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1027 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1028 tcg_out32(s, diff); 1029 return; 1030 } 1031 1032 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1033 tcg_out64(s, arg); 1034} 1035 1036static void tcg_out_movi(TCGContext *s, TCGType type, 1037 TCGReg ret, tcg_target_long arg) 1038{ 1039 switch (type) { 1040 case TCG_TYPE_I32: 1041#if TCG_TARGET_REG_BITS == 64 1042 case TCG_TYPE_I64: 1043#endif 1044 if (ret < 16) { 1045 tcg_out_movi_int(s, type, ret, arg); 1046 } else { 1047 tcg_out_movi_vec(s, type, ret, arg); 1048 } 1049 break; 1050 default: 1051 g_assert_not_reached(); 1052 } 1053} 1054 1055static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val) 1056{ 1057 if (val == (int8_t)val) { 1058 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0); 1059 tcg_out8(s, val); 1060 } else if (val == (int32_t)val) { 1061 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0); 1062 tcg_out32(s, val); 1063 } else { 1064 tcg_abort(); 1065 } 1066} 1067 1068static inline void tcg_out_mb(TCGContext *s, TCGArg a0) 1069{ 1070 /* Given the strength of x86 memory ordering, we only need care for 1071 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is 1072 faster than "mfence", so don't bother with the sse insn. 
 */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
1170 */ 1171 tcg_debug_assert(arg >= 16); 1172 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1173 arg, 0, arg1, arg2); 1174 break; 1175 default: 1176 g_assert_not_reached(); 1177 } 1178} 1179 1180static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1181 TCGReg base, intptr_t ofs) 1182{ 1183 int rexw = 0; 1184 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1185 if (val != (int32_t)val) { 1186 return false; 1187 } 1188 rexw = P_REXW; 1189 } else if (type != TCG_TYPE_I32) { 1190 return false; 1191 } 1192 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1193 tcg_out32(s, val); 1194 return true; 1195} 1196 1197static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1198{ 1199 /* Propagate an opcode prefix, such as P_DATA16. */ 1200 int ext = subopc & ~0x7; 1201 subopc &= 0x7; 1202 1203 if (count == 1) { 1204 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1205 } else { 1206 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1207 tcg_out8(s, count); 1208 } 1209} 1210 1211static inline void tcg_out_bswap32(TCGContext *s, int reg) 1212{ 1213 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1214} 1215 1216static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1217{ 1218 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1219} 1220 1221static inline void tcg_out_ext8u(TCGContext *s, int dest, int src) 1222{ 1223 /* movzbl */ 1224 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1225 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1226} 1227 1228static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw) 1229{ 1230 /* movsbl */ 1231 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1232 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1233} 1234 1235static inline void tcg_out_ext16u(TCGContext *s, int dest, int src) 1236{ 1237 /* movzwl */ 1238 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1239} 1240 1241static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw) 1242{ 1243 /* movsw[lq] */ 1244 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1245} 1246 1247static inline void tcg_out_ext32u(TCGContext *s, int dest, int src) 1248{ 1249 /* 32-bit mov zero extends. */ 1250 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1251} 1252 1253static inline void tcg_out_ext32s(TCGContext *s, int dest, int src) 1254{ 1255 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1256} 1257 1258static inline void tcg_out_bswap64(TCGContext *s, int reg) 1259{ 1260 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1261} 1262 1263static void tgen_arithi(TCGContext *s, int c, int r0, 1264 tcg_target_long val, int cf) 1265{ 1266 int rexw = 0; 1267 1268 if (TCG_TARGET_REG_BITS == 64) { 1269 rexw = c & -8; 1270 c &= 7; 1271 } 1272 1273 /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1274 partial flags update stalls on Pentium4 and are not recommended 1275 by current Intel optimization manuals. */ 1276 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) { 1277 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1278 if (TCG_TARGET_REG_BITS == 64) { 1279 /* The single-byte increment encodings are re-tasked as the 1280 REX prefixes. Use the MODRM encoding. */ 1281 tcg_out_modrm(s, OPC_GRP5 + rexw, 1282 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1283 } else { 1284 tcg_out8(s, (is_inc ? 
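/*
 * Illustrative aside, not part of the original source: tgen_arithi picks
 * the shortest usable encoding.  A few worked examples on x86_64:
 *
 *   tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RBX, 8, 0)
 *       -> 48 83 c3 08            addq  $8, %rbx        (sign-extended imm8)
 *   tgen_arithi(s, ARITH_AND, TCG_REG_EAX, 0xffff, 0)
 *       -> 0f b7 c0               movzwl %ax, %eax      (special-cased above)
 *   tgen_arithi(s, ARITH_CMP, TCG_REG_ECX, 0x12345, 0)
 *       -> 81 f9 45 23 01 00      cmpl  $0x12345, %ecx  (imm32 form)
 */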
OPC_INC_r32 : OPC_DEC_r32) + r0); 1285 } 1286 return; 1287 } 1288 1289 if (c == ARITH_AND) { 1290 if (TCG_TARGET_REG_BITS == 64) { 1291 if (val == 0xffffffffu) { 1292 tcg_out_ext32u(s, r0, r0); 1293 return; 1294 } 1295 if (val == (uint32_t)val) { 1296 /* AND with no high bits set can use a 32-bit operation. */ 1297 rexw = 0; 1298 } 1299 } 1300 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1301 tcg_out_ext8u(s, r0, r0); 1302 return; 1303 } 1304 if (val == 0xffffu) { 1305 tcg_out_ext16u(s, r0, r0); 1306 return; 1307 } 1308 } 1309 1310 if (val == (int8_t)val) { 1311 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1312 tcg_out8(s, val); 1313 return; 1314 } 1315 if (rexw == 0 || val == (int32_t)val) { 1316 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1317 tcg_out32(s, val); 1318 return; 1319 } 1320 1321 tcg_abort(); 1322} 1323 1324static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1325{ 1326 if (val != 0) { 1327 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1328 } 1329} 1330 1331/* Use SMALL != 0 to force a short forward branch. */ 1332static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small) 1333{ 1334 int32_t val, val1; 1335 1336 if (l->has_value) { 1337 val = tcg_pcrel_diff(s, l->u.value_ptr); 1338 val1 = val - 2; 1339 if ((int8_t)val1 == val1) { 1340 if (opc == -1) { 1341 tcg_out8(s, OPC_JMP_short); 1342 } else { 1343 tcg_out8(s, OPC_JCC_short + opc); 1344 } 1345 tcg_out8(s, val1); 1346 } else { 1347 if (small) { 1348 tcg_abort(); 1349 } 1350 if (opc == -1) { 1351 tcg_out8(s, OPC_JMP_long); 1352 tcg_out32(s, val - 5); 1353 } else { 1354 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1355 tcg_out32(s, val - 6); 1356 } 1357 } 1358 } else if (small) { 1359 if (opc == -1) { 1360 tcg_out8(s, OPC_JMP_short); 1361 } else { 1362 tcg_out8(s, OPC_JCC_short + opc); 1363 } 1364 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1365 s->code_ptr += 1; 1366 } else { 1367 if (opc == -1) { 1368 tcg_out8(s, OPC_JMP_long); 1369 } else { 1370 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1371 } 1372 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1373 s->code_ptr += 4; 1374 } 1375} 1376 1377static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2, 1378 int const_arg2, int rexw) 1379{ 1380 if (const_arg2) { 1381 if (arg2 == 0) { 1382 /* test r, r */ 1383 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1384 } else { 1385 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1386 } 1387 } else { 1388 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1389 } 1390} 1391 1392static void tcg_out_brcond32(TCGContext *s, TCGCond cond, 1393 TCGArg arg1, TCGArg arg2, int const_arg2, 1394 TCGLabel *label, int small) 1395{ 1396 tcg_out_cmp(s, arg1, arg2, const_arg2, 0); 1397 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small); 1398} 1399 1400#if TCG_TARGET_REG_BITS == 64 1401static void tcg_out_brcond64(TCGContext *s, TCGCond cond, 1402 TCGArg arg1, TCGArg arg2, int const_arg2, 1403 TCGLabel *label, int small) 1404{ 1405 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW); 1406 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small); 1407} 1408#else 1409/* XXX: we implement it at the target level to avoid having to 1410 handle cross basic blocks temporaries */ 1411static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1412 const int *const_args, int small) 1413{ 1414 TCGLabel *label_next = gen_new_label(); 1415 TCGLabel *label_this = arg_label(args[5]); 1416 1417 switch(args[4]) { 1418 case TCG_COND_EQ: 1419 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2], 
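/*
 * Illustrative aside, not part of the original source: the double-word
 * branches below decompose a 64-bit comparison into two 32-bit ones.
 * In plain C terms the TCG_COND_LT case reads roughly as
 *
 *     if ((int32_t)ah < (int32_t)bh)   goto label_this;   // high words decide
 *     if (ah != bh)                    goto label_next;   // only equal highs fall through
 *     if ((uint32_t)al < (uint32_t)bl) goto label_this;   // low words, unsigned
 *     label_next: ;
 *
 * where ah/al and bh/bl stand for the high/low halves held in
 * args[1]/args[0] and args[3]/args[2] (hypothetical names, for clarity).
 */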
1420 label_next, 1); 1421 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3], 1422 label_this, small); 1423 break; 1424 case TCG_COND_NE: 1425 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2], 1426 label_this, small); 1427 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3], 1428 label_this, small); 1429 break; 1430 case TCG_COND_LT: 1431 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3], 1432 label_this, small); 1433 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1434 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2], 1435 label_this, small); 1436 break; 1437 case TCG_COND_LE: 1438 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3], 1439 label_this, small); 1440 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1441 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2], 1442 label_this, small); 1443 break; 1444 case TCG_COND_GT: 1445 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3], 1446 label_this, small); 1447 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1448 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2], 1449 label_this, small); 1450 break; 1451 case TCG_COND_GE: 1452 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3], 1453 label_this, small); 1454 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1455 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2], 1456 label_this, small); 1457 break; 1458 case TCG_COND_LTU: 1459 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3], 1460 label_this, small); 1461 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1462 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2], 1463 label_this, small); 1464 break; 1465 case TCG_COND_LEU: 1466 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3], 1467 label_this, small); 1468 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1469 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2], 1470 label_this, small); 1471 break; 1472 case TCG_COND_GTU: 1473 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3], 1474 label_this, small); 1475 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1476 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2], 1477 label_this, small); 1478 break; 1479 case TCG_COND_GEU: 1480 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3], 1481 label_this, small); 1482 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1483 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2], 1484 label_this, small); 1485 break; 1486 default: 1487 tcg_abort(); 1488 } 1489 tcg_out_label(s, label_next); 1490} 1491#endif 1492 1493static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest, 1494 TCGArg arg1, TCGArg arg2, int const_arg2) 1495{ 1496 tcg_out_cmp(s, arg1, arg2, const_arg2, 0); 1497 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest); 1498 tcg_out_ext8u(s, dest, dest); 1499} 1500 1501#if TCG_TARGET_REG_BITS == 64 1502static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest, 1503 TCGArg arg1, TCGArg arg2, int const_arg2) 1504{ 1505 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW); 1506 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest); 1507 tcg_out_ext8u(s, dest, dest); 1508} 1509#else 1510static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1511 const int *const_args) 1512{ 1513 TCGArg new_args[6]; 1514 TCGLabel *label_true, *label_over; 1515 1516 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1517 1518 if (args[0] == args[1] || args[0] == args[2] 1519 || 
(!const_args[3] && args[0] == args[3]) 1520 || (!const_args[4] && args[0] == args[4])) { 1521 /* When the destination overlaps with one of the argument 1522 registers, don't do anything tricky. */ 1523 label_true = gen_new_label(); 1524 label_over = gen_new_label(); 1525 1526 new_args[5] = label_arg(label_true); 1527 tcg_out_brcond2(s, new_args, const_args+1, 1); 1528 1529 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1530 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1531 tcg_out_label(s, label_true); 1532 1533 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1534 tcg_out_label(s, label_over); 1535 } else { 1536 /* When the destination does not overlap one of the arguments, 1537 clear the destination first, jump if cond false, and emit an 1538 increment in the true case. This results in smaller code. */ 1539 1540 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1541 1542 label_over = gen_new_label(); 1543 new_args[4] = tcg_invert_cond(new_args[4]); 1544 new_args[5] = label_arg(label_over); 1545 tcg_out_brcond2(s, new_args, const_args+1, 1); 1546 1547 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1548 tcg_out_label(s, label_over); 1549 } 1550} 1551#endif 1552 1553static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw, 1554 TCGReg dest, TCGReg v1) 1555{ 1556 if (have_cmov) { 1557 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1); 1558 } else { 1559 TCGLabel *over = gen_new_label(); 1560 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1); 1561 tcg_out_mov(s, TCG_TYPE_I32, dest, v1); 1562 tcg_out_label(s, over); 1563 } 1564} 1565 1566static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest, 1567 TCGReg c1, TCGArg c2, int const_c2, 1568 TCGReg v1) 1569{ 1570 tcg_out_cmp(s, c1, c2, const_c2, 0); 1571 tcg_out_cmov(s, cond, 0, dest, v1); 1572} 1573 1574#if TCG_TARGET_REG_BITS == 64 1575static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest, 1576 TCGReg c1, TCGArg c2, int const_c2, 1577 TCGReg v1) 1578{ 1579 tcg_out_cmp(s, c1, c2, const_c2, P_REXW); 1580 tcg_out_cmov(s, cond, P_REXW, dest, v1); 1581} 1582#endif 1583 1584static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1585 TCGArg arg2, bool const_a2) 1586{ 1587 if (have_bmi1) { 1588 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1); 1589 if (const_a2) { 1590 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1591 } else { 1592 tcg_debug_assert(dest != arg2); 1593 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2); 1594 } 1595 } else { 1596 tcg_debug_assert(dest != arg2); 1597 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1); 1598 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2); 1599 } 1600} 1601 1602static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1603 TCGArg arg2, bool const_a2) 1604{ 1605 if (have_lzcnt) { 1606 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1); 1607 if (const_a2) { 1608 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1609 } else { 1610 tcg_debug_assert(dest != arg2); 1611 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2); 1612 } 1613 } else { 1614 tcg_debug_assert(!const_a2); 1615 tcg_debug_assert(dest != arg1); 1616 tcg_debug_assert(dest != arg2); 1617 1618 /* Recall that the output of BSR is the index not the count. */ 1619 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1); 1620 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0); 1621 1622 /* Since we have destroyed the flags from BSR, we have to re-test. 
 */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load of 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 1, dest);
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

#if defined(CONFIG_SOFTMMU)
#include "../tcg-ldst.c.inc"

/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};

/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};

/* Perform the TLB load and compare.

   Inputs:
   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX and S_BITS are the memory context and log2 size of the load.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

   Second argument register is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.

   First argument register is clobbered.
*/ 1721 1722static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi, 1723 int mem_index, MemOp opc, 1724 tcg_insn_unit **label_ptr, int which) 1725{ 1726 const TCGReg r0 = TCG_REG_L0; 1727 const TCGReg r1 = TCG_REG_L1; 1728 TCGType ttype = TCG_TYPE_I32; 1729 TCGType tlbtype = TCG_TYPE_I32; 1730 int trexw = 0, hrexw = 0, tlbrexw = 0; 1731 unsigned a_bits = get_alignment_bits(opc); 1732 unsigned s_bits = opc & MO_SIZE; 1733 unsigned a_mask = (1 << a_bits) - 1; 1734 unsigned s_mask = (1 << s_bits) - 1; 1735 target_ulong tlb_mask; 1736 1737 if (TCG_TARGET_REG_BITS == 64) { 1738 if (TARGET_LONG_BITS == 64) { 1739 ttype = TCG_TYPE_I64; 1740 trexw = P_REXW; 1741 } 1742 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 1743 hrexw = P_REXW; 1744 if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) { 1745 tlbtype = TCG_TYPE_I64; 1746 tlbrexw = P_REXW; 1747 } 1748 } 1749 } 1750 1751 tcg_out_mov(s, tlbtype, r0, addrlo); 1752 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0, 1753 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 1754 1755 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0, 1756 TLB_MASK_TABLE_OFS(mem_index) + 1757 offsetof(CPUTLBDescFast, mask)); 1758 1759 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0, 1760 TLB_MASK_TABLE_OFS(mem_index) + 1761 offsetof(CPUTLBDescFast, table)); 1762 1763 /* If the required alignment is at least as large as the access, simply 1764 copy the address and mask. For lesser alignments, check that we don't 1765 cross pages for the complete access. */ 1766 if (a_bits >= s_bits) { 1767 tcg_out_mov(s, ttype, r1, addrlo); 1768 } else { 1769 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask); 1770 } 1771 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask; 1772 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0); 1773 1774 /* cmp 0(r0), r1 */ 1775 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which); 1776 1777 /* Prepare for both the fast path add of the tlb addend, and the slow 1778 path function argument setup. */ 1779 tcg_out_mov(s, ttype, r1, addrlo); 1780 1781 /* jne slow_path */ 1782 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1783 label_ptr[0] = s->code_ptr; 1784 s->code_ptr += 4; 1785 1786 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1787 /* cmp 4(r0), addrhi */ 1788 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4); 1789 1790 /* jne slow_path */ 1791 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 1792 label_ptr[1] = s->code_ptr; 1793 s->code_ptr += 4; 1794 } 1795 1796 /* TLB Hit. */ 1797 1798 /* add addend(r0), r1 */ 1799 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0, 1800 offsetof(CPUTLBEntry, addend)); 1801} 1802 1803/* 1804 * Record the context of a call to the out of line helper code for the slow path 1805 * for a load or store, so that we can later generate the correct helper code 1806 */ 1807static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64, 1808 TCGMemOpIdx oi, 1809 TCGReg datalo, TCGReg datahi, 1810 TCGReg addrlo, TCGReg addrhi, 1811 tcg_insn_unit *raddr, 1812 tcg_insn_unit **label_ptr) 1813{ 1814 TCGLabelQemuLdst *label = new_ldst_label(s); 1815 1816 label->is_ld = is_ld; 1817 label->oi = oi; 1818 label->type = is_64 ? 
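/*
 * Illustrative aside, not part of the original source: in C-like terms the
 * fast path emitted by tcg_out_tlb_load above computes roughly
 *
 *     fast  = the CPUTLBDescFast for mem_index, reached from env via
 *             TLB_MASK_TABLE_OFS(mem_index);
 *     entry = fast->table
 *             + ((addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)) & fast->mask);
 *     page  = (addr + s_mask - a_mask) & (TARGET_PAGE_MASK | a_mask);
 *     if (page != entry->addr_read / addr_write)   -> jump to the slow path
 *     host  = addr + entry->addend;                   TLB hit
 *
 * (When the access is at least as aligned as it is wide, "page" is taken
 * from addr directly, with no lea.)  The exact env layout is whatever the
 * CPUTLBDescFast/CPUTLBEntry offsets used above resolve to.
 */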
TCG_TYPE_I64 : TCG_TYPE_I32; 1819 label->datalo_reg = datalo; 1820 label->datahi_reg = datahi; 1821 label->addrlo_reg = addrlo; 1822 label->addrhi_reg = addrhi; 1823 label->raddr = tcg_splitwx_to_rx(raddr); 1824 label->label_ptr[0] = label_ptr[0]; 1825 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1826 label->label_ptr[1] = label_ptr[1]; 1827 } 1828} 1829 1830/* 1831 * Generate code for the slow path for a load at the end of block 1832 */ 1833static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1834{ 1835 TCGMemOpIdx oi = l->oi; 1836 MemOp opc = get_memop(oi); 1837 TCGReg data_reg; 1838 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1839 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0); 1840 1841 /* resolve label address */ 1842 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1843 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1844 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1845 } 1846 1847 if (TCG_TARGET_REG_BITS == 32) { 1848 int ofs = 0; 1849 1850 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs); 1851 ofs += 4; 1852 1853 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs); 1854 ofs += 4; 1855 1856 if (TARGET_LONG_BITS == 64) { 1857 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs); 1858 ofs += 4; 1859 } 1860 1861 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs); 1862 ofs += 4; 1863 1864 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs); 1865 } else { 1866 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); 1867 /* The second argument is already loaded with addrlo. */ 1868 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi); 1869 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3], 1870 (uintptr_t)l->raddr); 1871 } 1872 1873 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]); 1874 1875 data_reg = l->datalo_reg; 1876 switch (opc & MO_SSIZE) { 1877 case MO_SB: 1878 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw); 1879 break; 1880 case MO_SW: 1881 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw); 1882 break; 1883#if TCG_TARGET_REG_BITS == 64 1884 case MO_SL: 1885 tcg_out_ext32s(s, data_reg, TCG_REG_EAX); 1886 break; 1887#endif 1888 case MO_UB: 1889 case MO_UW: 1890 /* Note that the helpers have zero-extended to tcg_target_long. 
*/ 1891 case MO_UL: 1892 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); 1893 break; 1894 case MO_Q: 1895 if (TCG_TARGET_REG_BITS == 64) { 1896 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX); 1897 } else if (data_reg == TCG_REG_EDX) { 1898 /* xchg %edx, %eax */ 1899 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); 1900 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX); 1901 } else { 1902 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); 1903 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX); 1904 } 1905 break; 1906 default: 1907 tcg_abort(); 1908 } 1909 1910 /* Jump to the code corresponding to next IR of qemu_st */ 1911 tcg_out_jmp(s, l->raddr); 1912 return true; 1913} 1914 1915/* 1916 * Generate code for the slow path for a store at the end of block 1917 */ 1918static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1919{ 1920 TCGMemOpIdx oi = l->oi; 1921 MemOp opc = get_memop(oi); 1922 MemOp s_bits = opc & MO_SIZE; 1923 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1924 TCGReg retaddr; 1925 1926 /* resolve label address */ 1927 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1928 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1929 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1930 } 1931 1932 if (TCG_TARGET_REG_BITS == 32) { 1933 int ofs = 0; 1934 1935 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs); 1936 ofs += 4; 1937 1938 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs); 1939 ofs += 4; 1940 1941 if (TARGET_LONG_BITS == 64) { 1942 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs); 1943 ofs += 4; 1944 } 1945 1946 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs); 1947 ofs += 4; 1948 1949 if (s_bits == MO_64) { 1950 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs); 1951 ofs += 4; 1952 } 1953 1954 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs); 1955 ofs += 4; 1956 1957 retaddr = TCG_REG_EAX; 1958 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 1959 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs); 1960 } else { 1961 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); 1962 /* The second argument is already loaded with addrlo. */ 1963 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32), 1964 tcg_target_call_iarg_regs[2], l->datalo_reg); 1965 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi); 1966 1967 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) { 1968 retaddr = tcg_target_call_iarg_regs[4]; 1969 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 1970 } else { 1971 retaddr = TCG_REG_RAX; 1972 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 1973 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, 1974 TCG_TARGET_CALL_STACK_OFFSET); 1975 } 1976 } 1977 1978 /* "Tail call" to the helper, with the return address back inline. 
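       Pushing l->raddr and then jumping to the helper means the helper's
       own RET returns straight to the fast-path code after the original
       store, without ever coming back here.  Roughly (an illustrative
       sketch of the two instructions emitted below):

           push  retaddr          # retaddr holds l->raddr
           jmp   qemu_st_helper   # the helper's ret pops retaddr

       which behaves like "call helper; jmp l->raddr" but saves a branch.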
*/ 1979 tcg_out_push(s, retaddr); 1980 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]); 1981 return true; 1982} 1983#elif TCG_TARGET_REG_BITS == 32 1984# define x86_guest_base_seg 0 1985# define x86_guest_base_index -1 1986# define x86_guest_base_offset guest_base 1987#else 1988static int x86_guest_base_seg; 1989static int x86_guest_base_index = -1; 1990static int32_t x86_guest_base_offset; 1991# if defined(__x86_64__) && defined(__linux__) 1992# include <asm/prctl.h> 1993# include <sys/prctl.h> 1994int arch_prctl(int code, unsigned long addr); 1995static inline int setup_guest_base_seg(void) 1996{ 1997 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 1998 return P_GS; 1999 } 2000 return 0; 2001} 2002# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__) 2003# include <machine/sysarch.h> 2004static inline int setup_guest_base_seg(void) 2005{ 2006 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2007 return P_GS; 2008 } 2009 return 0; 2010} 2011# else 2012static inline int setup_guest_base_seg(void) 2013{ 2014 return 0; 2015} 2016# endif 2017#endif /* SOFTMMU */ 2018 2019static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2020 TCGReg base, int index, intptr_t ofs, 2021 int seg, bool is64, MemOp memop) 2022{ 2023 bool use_movbe = false; 2024 int rexw = is64 * P_REXW; 2025 int movop = OPC_MOVL_GvEv; 2026 2027 /* Do big-endian loads with movbe. */ 2028 if (memop & MO_BSWAP) { 2029 tcg_debug_assert(have_movbe); 2030 use_movbe = true; 2031 movop = OPC_MOVBE_GyMy; 2032 } 2033 2034 switch (memop & MO_SSIZE) { 2035 case MO_UB: 2036 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo, 2037 base, index, 0, ofs); 2038 break; 2039 case MO_SB: 2040 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo, 2041 base, index, 0, ofs); 2042 break; 2043 case MO_UW: 2044 if (use_movbe) { 2045 /* There is no extending movbe; only low 16-bits are modified. */ 2046 if (datalo != base && datalo != index) { 2047 /* XOR breaks dependency chains. 
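                   xor r,r is recognized as a zeroing idiom, so it does
                   not add a false dependency on the register's previous
                   value; the 16-bit movbe that follows then writes only
                   the low half of an already-zero register, giving the
                   zero-extension without a separate movzwl.  When datalo
                   overlaps base or index we cannot pre-zero it, so the
                   other branch issues the movbe first and follows it with
                   an explicit ext16u.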
*/ 2048 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2049 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2050 datalo, base, index, 0, ofs); 2051 } else { 2052 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2053 datalo, base, index, 0, ofs); 2054 tcg_out_ext16u(s, datalo, datalo); 2055 } 2056 } else { 2057 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo, 2058 base, index, 0, ofs); 2059 } 2060 break; 2061 case MO_SW: 2062 if (use_movbe) { 2063 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2064 datalo, base, index, 0, ofs); 2065 tcg_out_ext16s(s, datalo, datalo, rexw); 2066 } else { 2067 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg, 2068 datalo, base, index, 0, ofs); 2069 } 2070 break; 2071 case MO_UL: 2072 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs); 2073 break; 2074#if TCG_TARGET_REG_BITS == 64 2075 case MO_SL: 2076 if (use_movbe) { 2077 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo, 2078 base, index, 0, ofs); 2079 tcg_out_ext32s(s, datalo, datalo); 2080 } else { 2081 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo, 2082 base, index, 0, ofs); 2083 } 2084 break; 2085#endif 2086 case MO_Q: 2087 if (TCG_TARGET_REG_BITS == 64) { 2088 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo, 2089 base, index, 0, ofs); 2090 } else { 2091 if (use_movbe) { 2092 TCGReg t = datalo; 2093 datalo = datahi; 2094 datahi = t; 2095 } 2096 if (base != datalo) { 2097 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2098 base, index, 0, ofs); 2099 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2100 base, index, 0, ofs + 4); 2101 } else { 2102 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2103 base, index, 0, ofs + 4); 2104 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2105 base, index, 0, ofs); 2106 } 2107 } 2108 break; 2109 default: 2110 g_assert_not_reached(); 2111 } 2112} 2113 2114/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and 2115 EAX. It will be useful once fixed registers globals are less 2116 common. */ 2117static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64) 2118{ 2119 TCGReg datalo, datahi, addrlo; 2120 TCGReg addrhi __attribute__((unused)); 2121 TCGMemOpIdx oi; 2122 MemOp opc; 2123#if defined(CONFIG_SOFTMMU) 2124 int mem_index; 2125 tcg_insn_unit *label_ptr[2]; 2126#endif 2127 2128 datalo = *args++; 2129 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0); 2130 addrlo = *args++; 2131 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0); 2132 oi = *args++; 2133 opc = get_memop(oi); 2134 2135#if defined(CONFIG_SOFTMMU) 2136 mem_index = get_mmuidx(oi); 2137 2138 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, 2139 label_ptr, offsetof(CPUTLBEntry, addr_read)); 2140 2141 /* TLB Hit. */ 2142 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc); 2143 2144 /* Record the current context of a load into ldst label */ 2145 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi, 2146 s->code_ptr, label_ptr); 2147#else 2148 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index, 2149 x86_guest_base_offset, x86_guest_base_seg, 2150 is64, opc); 2151#endif 2152} 2153 2154static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2155 TCGReg base, int index, intptr_t ofs, 2156 int seg, MemOp memop) 2157{ 2158 bool use_movbe = false; 2159 int movop = OPC_MOVL_EvGv; 2160 2161 /* 2162 * Do big-endian stores with movbe or softmmu. 
2163 * User-only without movbe will have its swapping done generically. 2164 */ 2165 if (memop & MO_BSWAP) { 2166 tcg_debug_assert(have_movbe); 2167 use_movbe = true; 2168 movop = OPC_MOVBE_MyGy; 2169 } 2170 2171 switch (memop & MO_SIZE) { 2172 case MO_8: 2173 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2174 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2175 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg, 2176 datalo, base, index, 0, ofs); 2177 break; 2178 case MO_16: 2179 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo, 2180 base, index, 0, ofs); 2181 break; 2182 case MO_32: 2183 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs); 2184 break; 2185 case MO_64: 2186 if (TCG_TARGET_REG_BITS == 64) { 2187 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo, 2188 base, index, 0, ofs); 2189 } else { 2190 if (use_movbe) { 2191 TCGReg t = datalo; 2192 datalo = datahi; 2193 datahi = t; 2194 } 2195 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2196 base, index, 0, ofs); 2197 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2198 base, index, 0, ofs + 4); 2199 } 2200 break; 2201 default: 2202 g_assert_not_reached(); 2203 } 2204} 2205 2206static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) 2207{ 2208 TCGReg datalo, datahi, addrlo; 2209 TCGReg addrhi __attribute__((unused)); 2210 TCGMemOpIdx oi; 2211 MemOp opc; 2212#if defined(CONFIG_SOFTMMU) 2213 int mem_index; 2214 tcg_insn_unit *label_ptr[2]; 2215#endif 2216 2217 datalo = *args++; 2218 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0); 2219 addrlo = *args++; 2220 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0); 2221 oi = *args++; 2222 opc = get_memop(oi); 2223 2224#if defined(CONFIG_SOFTMMU) 2225 mem_index = get_mmuidx(oi); 2226 2227 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, 2228 label_ptr, offsetof(CPUTLBEntry, addr_write)); 2229 2230 /* TLB Hit. */ 2231 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc); 2232 2233 /* Record the current context of a store into ldst label */ 2234 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi, 2235 s->code_ptr, label_ptr); 2236#else 2237 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index, 2238 x86_guest_base_offset, x86_guest_base_seg, opc); 2239#endif 2240} 2241 2242static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2243 const TCGArg *args, const int *const_args) 2244{ 2245 TCGArg a0, a1, a2; 2246 int c, const_a2, vexop, rexw = 0; 2247 2248#if TCG_TARGET_REG_BITS == 64 2249# define OP_32_64(x) \ 2250 case glue(glue(INDEX_op_, x), _i64): \ 2251 rexw = P_REXW; /* FALLTHRU */ \ 2252 case glue(glue(INDEX_op_, x), _i32) 2253#else 2254# define OP_32_64(x) \ 2255 case glue(glue(INDEX_op_, x), _i32) 2256#endif 2257 2258 /* Hoist the loads of the most common arguments. */ 2259 a0 = args[0]; 2260 a1 = args[1]; 2261 a2 = args[2]; 2262 const_a2 = const_args[2]; 2263 2264 switch (opc) { 2265 case INDEX_op_exit_tb: 2266 /* Reuse the zeroing that exists for goto_ptr. 
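           exit_tb returns its argument in EAX to the code that ran the
           TB.  The goto_ptr return path set up in tcg_target_qemu_prologue
           already loads 0 into EAX and falls through to the epilogue, so
           an exit_tb with a zero argument can simply jump to
           tcg_code_gen_epilogue; only non-zero arguments need the
           explicit movi before jumping to tb_ret_addr.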
*/ 2267 if (a0 == 0) { 2268 tcg_out_jmp(s, tcg_code_gen_epilogue); 2269 } else { 2270 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2271 tcg_out_jmp(s, tb_ret_addr); 2272 } 2273 break; 2274 case INDEX_op_goto_tb: 2275 if (s->tb_jmp_insn_offset) { 2276 /* direct jump method */ 2277 int gap; 2278 /* jump displacement must be aligned for atomic patching; 2279 * see if we need to add extra nops before jump 2280 */ 2281 gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2282 if (gap != 1) { 2283 tcg_out_nopn(s, gap - 1); 2284 } 2285 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2286 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); 2287 tcg_out32(s, 0); 2288 } else { 2289 /* indirect jump method */ 2290 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1, 2291 (intptr_t)(s->tb_jmp_target_addr + a0)); 2292 } 2293 set_jmp_reset_offset(s, a0); 2294 break; 2295 case INDEX_op_goto_ptr: 2296 /* jmp to the given host address (could be epilogue) */ 2297 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2298 break; 2299 case INDEX_op_br: 2300 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2301 break; 2302 OP_32_64(ld8u): 2303 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2304 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2305 break; 2306 OP_32_64(ld8s): 2307 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2308 break; 2309 OP_32_64(ld16u): 2310 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2311 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2312 break; 2313 OP_32_64(ld16s): 2314 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2315 break; 2316#if TCG_TARGET_REG_BITS == 64 2317 case INDEX_op_ld32u_i64: 2318#endif 2319 case INDEX_op_ld_i32: 2320 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2321 break; 2322 2323 OP_32_64(st8): 2324 if (const_args[0]) { 2325 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2326 tcg_out8(s, a0); 2327 } else { 2328 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2329 } 2330 break; 2331 OP_32_64(st16): 2332 if (const_args[0]) { 2333 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2334 tcg_out16(s, a0); 2335 } else { 2336 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2337 } 2338 break; 2339#if TCG_TARGET_REG_BITS == 64 2340 case INDEX_op_st32_i64: 2341#endif 2342 case INDEX_op_st_i32: 2343 if (const_args[0]) { 2344 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2345 tcg_out32(s, a0); 2346 } else { 2347 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2348 } 2349 break; 2350 2351 OP_32_64(add): 2352 /* For 3-operand addition, use LEA. */ 2353 if (a0 != a1) { 2354 TCGArg c3 = 0; 2355 if (const_a2) { 2356 c3 = a2, a2 = -1; 2357 } else if (a0 == a2) { 2358 /* Watch out for dest = src + dest, since we've removed 2359 the matching constraint on the add. */ 2360 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2361 break; 2362 } 2363 2364 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2365 break; 2366 } 2367 c = ARITH_ADD; 2368 goto gen_arith; 2369 OP_32_64(sub): 2370 c = ARITH_SUB; 2371 goto gen_arith; 2372 OP_32_64(and): 2373 c = ARITH_AND; 2374 goto gen_arith; 2375 OP_32_64(or): 2376 c = ARITH_OR; 2377 goto gen_arith; 2378 OP_32_64(xor): 2379 c = ARITH_XOR; 2380 goto gen_arith; 2381 gen_arith: 2382 if (const_a2) { 2383 tgen_arithi(s, c + rexw, a0, a2, 0); 2384 } else { 2385 tgen_arithr(s, c + rexw, a0, a2); 2386 } 2387 break; 2388 2389 OP_32_64(andc): 2390 if (const_a2) { 2391 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2392 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2393 } else { 2394 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2395 } 2396 break; 2397 2398 OP_32_64(mul): 2399 if (const_a2) { 2400 int32_t val; 2401 val = a2; 2402 if (val == (int8_t)val) { 2403 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2404 tcg_out8(s, val); 2405 } else { 2406 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2407 tcg_out32(s, val); 2408 } 2409 } else { 2410 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2411 } 2412 break; 2413 2414 OP_32_64(div2): 2415 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2416 break; 2417 OP_32_64(divu2): 2418 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2419 break; 2420 2421 OP_32_64(shl): 2422 /* For small constant 3-operand shift, use LEA. */ 2423 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2424 if (a2 - 1 == 0) { 2425 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2426 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2427 } else { 2428 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2429 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2430 } 2431 break; 2432 } 2433 c = SHIFT_SHL; 2434 vexop = OPC_SHLX; 2435 goto gen_shift_maybe_vex; 2436 OP_32_64(shr): 2437 c = SHIFT_SHR; 2438 vexop = OPC_SHRX; 2439 goto gen_shift_maybe_vex; 2440 OP_32_64(sar): 2441 c = SHIFT_SAR; 2442 vexop = OPC_SARX; 2443 goto gen_shift_maybe_vex; 2444 OP_32_64(rotl): 2445 c = SHIFT_ROL; 2446 goto gen_shift; 2447 OP_32_64(rotr): 2448 c = SHIFT_ROR; 2449 goto gen_shift; 2450 gen_shift_maybe_vex: 2451 if (have_bmi2) { 2452 if (!const_a2) { 2453 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2454 break; 2455 } 2456 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2457 } 2458 /* FALLTHRU */ 2459 gen_shift: 2460 if (const_a2) { 2461 tcg_out_shifti(s, c + rexw, a0, a2); 2462 } else { 2463 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2464 } 2465 break; 2466 2467 OP_32_64(ctz): 2468 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2469 break; 2470 OP_32_64(clz): 2471 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2472 break; 2473 OP_32_64(ctpop): 2474 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2475 break; 2476 2477 case INDEX_op_brcond_i32: 2478 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2479 break; 2480 case INDEX_op_setcond_i32: 2481 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2); 2482 break; 2483 case INDEX_op_movcond_i32: 2484 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]); 2485 break; 2486 2487 OP_32_64(bswap16): 2488 tcg_out_rolw_8(s, a0); 2489 break; 2490 OP_32_64(bswap32): 2491 tcg_out_bswap32(s, a0); 2492 break; 2493 2494 OP_32_64(neg): 2495 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2496 break; 2497 OP_32_64(not): 2498 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2499 break; 2500 2501 OP_32_64(ext8s): 2502 tcg_out_ext8s(s, a0, a1, rexw); 2503 break; 2504 OP_32_64(ext16s): 2505 tcg_out_ext16s(s, a0, a1, rexw); 2506 break; 2507 OP_32_64(ext8u): 2508 tcg_out_ext8u(s, a0, a1); 2509 break; 2510 OP_32_64(ext16u): 2511 tcg_out_ext16u(s, a0, a1); 2512 break; 2513 2514 case INDEX_op_qemu_ld_i32: 2515 tcg_out_qemu_ld(s, args, 0); 2516 break; 2517 case INDEX_op_qemu_ld_i64: 2518 tcg_out_qemu_ld(s, args, 1); 2519 break; 2520 case INDEX_op_qemu_st_i32: 2521 case INDEX_op_qemu_st8_i32: 2522 tcg_out_qemu_st(s, args, 0); 2523 break; 2524 case INDEX_op_qemu_st_i64: 2525 tcg_out_qemu_st(s, args, 1); 2526 break; 2527 2528 OP_32_64(mulu2): 
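        /*
         * mulu2/muls2 rely on the one-operand MUL/IMUL forms, which take
         * one multiplicand implicitly in EAX and write the double-width
         * product to EDX:EAX (RDX:RAX with REXW).  The "a", "d", "a", "r"
         * constraints from tcg_target_op_def pin the outputs and the
         * first input to those registers, so only the second multiplicand
         * (args[3]) needs to be encoded here.
         */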
2529 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2530 break; 2531 OP_32_64(muls2): 2532 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2533 break; 2534 OP_32_64(add2): 2535 if (const_args[4]) { 2536 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2537 } else { 2538 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2539 } 2540 if (const_args[5]) { 2541 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2542 } else { 2543 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2544 } 2545 break; 2546 OP_32_64(sub2): 2547 if (const_args[4]) { 2548 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2549 } else { 2550 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2551 } 2552 if (const_args[5]) { 2553 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2554 } else { 2555 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2556 } 2557 break; 2558 2559#if TCG_TARGET_REG_BITS == 32 2560 case INDEX_op_brcond2_i32: 2561 tcg_out_brcond2(s, args, const_args, 0); 2562 break; 2563 case INDEX_op_setcond2_i32: 2564 tcg_out_setcond2(s, args, const_args); 2565 break; 2566#else /* TCG_TARGET_REG_BITS == 64 */ 2567 case INDEX_op_ld32s_i64: 2568 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2569 break; 2570 case INDEX_op_ld_i64: 2571 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2572 break; 2573 case INDEX_op_st_i64: 2574 if (const_args[0]) { 2575 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2576 tcg_out32(s, a0); 2577 } else { 2578 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2579 } 2580 break; 2581 2582 case INDEX_op_brcond_i64: 2583 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2584 break; 2585 case INDEX_op_setcond_i64: 2586 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2); 2587 break; 2588 case INDEX_op_movcond_i64: 2589 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]); 2590 break; 2591 2592 case INDEX_op_bswap64_i64: 2593 tcg_out_bswap64(s, a0); 2594 break; 2595 case INDEX_op_extu_i32_i64: 2596 case INDEX_op_ext32u_i64: 2597 case INDEX_op_extrl_i64_i32: 2598 tcg_out_ext32u(s, a0, a1); 2599 break; 2600 case INDEX_op_ext_i32_i64: 2601 case INDEX_op_ext32s_i64: 2602 tcg_out_ext32s(s, a0, a1); 2603 break; 2604 case INDEX_op_extrh_i64_i32: 2605 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2606 break; 2607#endif 2608 2609 OP_32_64(deposit): 2610 if (args[3] == 0 && args[4] == 8) { 2611 /* load bits 0..7 */ 2612 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2613 } else if (args[3] == 8 && args[4] == 8) { 2614 /* load bits 8..15 */ 2615 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2616 } else if (args[3] == 0 && args[4] == 16) { 2617 /* load bits 0..15 */ 2618 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2619 } else { 2620 tcg_abort(); 2621 } 2622 break; 2623 2624 case INDEX_op_extract_i64: 2625 if (a2 + args[3] == 32) { 2626 /* This is a 32-bit zero-extending right shift. */ 2627 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2628 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2629 break; 2630 } 2631 /* FALLTHRU */ 2632 case INDEX_op_extract_i32: 2633 /* On the off-chance that we can use the high-byte registers. 2634 Otherwise we emit the same ext16 + shift pattern that we 2635 would have gotten from the normal tcg-op.c expansion. 
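           The only extract that reaches here is bits [8, 15].  When the
           source is EAX/ECX/EDX/EBX (a1 < 4), its second byte is directly
           addressable as %ah/%ch/%dh/%bh, which is what the "a1 + 4"
           register encoding below selects, so the operation becomes a
           single movzbl from the high-byte register.  The destination
           must also avoid a REX prefix (a0 < 8), since any REX prefix
           changes those encodings to mean %spl/%bpl/%sil/%dil instead.
           Otherwise we fall back to movzwl + shr $8.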
*/ 2636 tcg_debug_assert(a2 == 8 && args[3] == 8); 2637 if (a1 < 4 && a0 < 8) { 2638 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2639 } else { 2640 tcg_out_ext16u(s, a0, a1); 2641 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2642 } 2643 break; 2644 2645 case INDEX_op_sextract_i32: 2646 /* We don't implement sextract_i64, as we cannot sign-extend to 2647 64-bits without using the REX prefix that explicitly excludes 2648 access to the high-byte registers. */ 2649 tcg_debug_assert(a2 == 8 && args[3] == 8); 2650 if (a1 < 4 && a0 < 8) { 2651 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2652 } else { 2653 tcg_out_ext16s(s, a0, a1, 0); 2654 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2655 } 2656 break; 2657 2658 OP_32_64(extract2): 2659 /* Note that SHRD outputs to the r/m operand. */ 2660 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 2661 tcg_out8(s, args[3]); 2662 break; 2663 2664 case INDEX_op_mb: 2665 tcg_out_mb(s, a0); 2666 break; 2667 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 2668 case INDEX_op_mov_i64: 2669 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 2670 default: 2671 tcg_abort(); 2672 } 2673 2674#undef OP_32_64 2675} 2676 2677static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 2678 unsigned vecl, unsigned vece, 2679 const TCGArg *args, const int *const_args) 2680{ 2681 static int const add_insn[4] = { 2682 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 2683 }; 2684 static int const ssadd_insn[4] = { 2685 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 2686 }; 2687 static int const usadd_insn[4] = { 2688 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 2689 }; 2690 static int const sub_insn[4] = { 2691 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 2692 }; 2693 static int const sssub_insn[4] = { 2694 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 2695 }; 2696 static int const ussub_insn[4] = { 2697 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 2698 }; 2699 static int const mul_insn[4] = { 2700 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2 2701 }; 2702 static int const shift_imm_insn[4] = { 2703 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 2704 }; 2705 static int const cmpeq_insn[4] = { 2706 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 2707 }; 2708 static int const cmpgt_insn[4] = { 2709 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 2710 }; 2711 static int const punpckl_insn[4] = { 2712 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 2713 }; 2714 static int const punpckh_insn[4] = { 2715 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 2716 }; 2717 static int const packss_insn[4] = { 2718 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 2719 }; 2720 static int const packus_insn[4] = { 2721 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 2722 }; 2723 static int const smin_insn[4] = { 2724 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2 2725 }; 2726 static int const smax_insn[4] = { 2727 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2 2728 }; 2729 static int const umin_insn[4] = { 2730 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2 2731 }; 2732 static int const umax_insn[4] = { 2733 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2 2734 }; 2735 static int const shlv_insn[4] = { 2736 /* TODO: AVX512 adds support for MO_16. */ 2737 OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ 2738 }; 2739 static int const shrv_insn[4] = { 2740 /* TODO: AVX512 adds support for MO_16. */ 2741 OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ 2742 }; 2743 static int const sarv_insn[4] = { 2744 /* TODO: AVX512 adds support for MO_16, MO_64. 
*/ 2745 OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2 2746 }; 2747 static int const shls_insn[4] = { 2748 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 2749 }; 2750 static int const shrs_insn[4] = { 2751 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 2752 }; 2753 static int const sars_insn[4] = { 2754 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2 2755 }; 2756 static int const abs_insn[4] = { 2757 /* TODO: AVX512 adds support for MO_64. */ 2758 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2 2759 }; 2760 2761 TCGType type = vecl + TCG_TYPE_V64; 2762 int insn, sub; 2763 TCGArg a0, a1, a2; 2764 2765 a0 = args[0]; 2766 a1 = args[1]; 2767 a2 = args[2]; 2768 2769 switch (opc) { 2770 case INDEX_op_add_vec: 2771 insn = add_insn[vece]; 2772 goto gen_simd; 2773 case INDEX_op_ssadd_vec: 2774 insn = ssadd_insn[vece]; 2775 goto gen_simd; 2776 case INDEX_op_usadd_vec: 2777 insn = usadd_insn[vece]; 2778 goto gen_simd; 2779 case INDEX_op_sub_vec: 2780 insn = sub_insn[vece]; 2781 goto gen_simd; 2782 case INDEX_op_sssub_vec: 2783 insn = sssub_insn[vece]; 2784 goto gen_simd; 2785 case INDEX_op_ussub_vec: 2786 insn = ussub_insn[vece]; 2787 goto gen_simd; 2788 case INDEX_op_mul_vec: 2789 insn = mul_insn[vece]; 2790 goto gen_simd; 2791 case INDEX_op_and_vec: 2792 insn = OPC_PAND; 2793 goto gen_simd; 2794 case INDEX_op_or_vec: 2795 insn = OPC_POR; 2796 goto gen_simd; 2797 case INDEX_op_xor_vec: 2798 insn = OPC_PXOR; 2799 goto gen_simd; 2800 case INDEX_op_smin_vec: 2801 insn = smin_insn[vece]; 2802 goto gen_simd; 2803 case INDEX_op_umin_vec: 2804 insn = umin_insn[vece]; 2805 goto gen_simd; 2806 case INDEX_op_smax_vec: 2807 insn = smax_insn[vece]; 2808 goto gen_simd; 2809 case INDEX_op_umax_vec: 2810 insn = umax_insn[vece]; 2811 goto gen_simd; 2812 case INDEX_op_shlv_vec: 2813 insn = shlv_insn[vece]; 2814 goto gen_simd; 2815 case INDEX_op_shrv_vec: 2816 insn = shrv_insn[vece]; 2817 goto gen_simd; 2818 case INDEX_op_sarv_vec: 2819 insn = sarv_insn[vece]; 2820 goto gen_simd; 2821 case INDEX_op_shls_vec: 2822 insn = shls_insn[vece]; 2823 goto gen_simd; 2824 case INDEX_op_shrs_vec: 2825 insn = shrs_insn[vece]; 2826 goto gen_simd; 2827 case INDEX_op_sars_vec: 2828 insn = sars_insn[vece]; 2829 goto gen_simd; 2830 case INDEX_op_x86_punpckl_vec: 2831 insn = punpckl_insn[vece]; 2832 goto gen_simd; 2833 case INDEX_op_x86_punpckh_vec: 2834 insn = punpckh_insn[vece]; 2835 goto gen_simd; 2836 case INDEX_op_x86_packss_vec: 2837 insn = packss_insn[vece]; 2838 goto gen_simd; 2839 case INDEX_op_x86_packus_vec: 2840 insn = packus_insn[vece]; 2841 goto gen_simd; 2842#if TCG_TARGET_REG_BITS == 32 2843 case INDEX_op_dup2_vec: 2844 /* First merge the two 32-bit inputs to a single 64-bit element. */ 2845 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 2846 /* Then replicate the 64-bit elements across the rest of the vector. 
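           For a V64 result the punpckldq above has already produced the
           final value; for V128/V256 the MO_64 dup below broadcasts
           element 0 into every 64-bit lane of the wider vector.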
*/ 2847 if (type != TCG_TYPE_V64) { 2848 tcg_out_dup_vec(s, type, MO_64, a0, a0); 2849 } 2850 break; 2851#endif 2852 case INDEX_op_abs_vec: 2853 insn = abs_insn[vece]; 2854 a2 = a1; 2855 a1 = 0; 2856 goto gen_simd; 2857 gen_simd: 2858 tcg_debug_assert(insn != OPC_UD2); 2859 if (type == TCG_TYPE_V256) { 2860 insn |= P_VEXL; 2861 } 2862 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2863 break; 2864 2865 case INDEX_op_cmp_vec: 2866 sub = args[3]; 2867 if (sub == TCG_COND_EQ) { 2868 insn = cmpeq_insn[vece]; 2869 } else if (sub == TCG_COND_GT) { 2870 insn = cmpgt_insn[vece]; 2871 } else { 2872 g_assert_not_reached(); 2873 } 2874 goto gen_simd; 2875 2876 case INDEX_op_andc_vec: 2877 insn = OPC_PANDN; 2878 if (type == TCG_TYPE_V256) { 2879 insn |= P_VEXL; 2880 } 2881 tcg_out_vex_modrm(s, insn, a0, a2, a1); 2882 break; 2883 2884 case INDEX_op_shli_vec: 2885 sub = 6; 2886 goto gen_shift; 2887 case INDEX_op_shri_vec: 2888 sub = 2; 2889 goto gen_shift; 2890 case INDEX_op_sari_vec: 2891 tcg_debug_assert(vece != MO_64); 2892 sub = 4; 2893 gen_shift: 2894 tcg_debug_assert(vece != MO_8); 2895 insn = shift_imm_insn[vece]; 2896 if (type == TCG_TYPE_V256) { 2897 insn |= P_VEXL; 2898 } 2899 tcg_out_vex_modrm(s, insn, sub, a0, a1); 2900 tcg_out8(s, a2); 2901 break; 2902 2903 case INDEX_op_ld_vec: 2904 tcg_out_ld(s, type, a0, a1, a2); 2905 break; 2906 case INDEX_op_st_vec: 2907 tcg_out_st(s, type, a0, a1, a2); 2908 break; 2909 case INDEX_op_dupm_vec: 2910 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 2911 break; 2912 2913 case INDEX_op_x86_shufps_vec: 2914 insn = OPC_SHUFPS; 2915 sub = args[3]; 2916 goto gen_simd_imm8; 2917 case INDEX_op_x86_blend_vec: 2918 if (vece == MO_16) { 2919 insn = OPC_PBLENDW; 2920 } else if (vece == MO_32) { 2921 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 2922 } else { 2923 g_assert_not_reached(); 2924 } 2925 sub = args[3]; 2926 goto gen_simd_imm8; 2927 case INDEX_op_x86_vperm2i128_vec: 2928 insn = OPC_VPERM2I128; 2929 sub = args[3]; 2930 goto gen_simd_imm8; 2931 gen_simd_imm8: 2932 if (type == TCG_TYPE_V256) { 2933 insn |= P_VEXL; 2934 } 2935 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2936 tcg_out8(s, sub); 2937 break; 2938 2939 case INDEX_op_x86_vpblendvb_vec: 2940 insn = OPC_VPBLENDVB; 2941 if (type == TCG_TYPE_V256) { 2942 insn |= P_VEXL; 2943 } 2944 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2945 tcg_out8(s, args[3] << 4); 2946 break; 2947 2948 case INDEX_op_x86_psrldq_vec: 2949 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 2950 tcg_out8(s, a2); 2951 break; 2952 2953 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 2954 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 2955 default: 2956 g_assert_not_reached(); 2957 } 2958} 2959 2960static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) 2961{ 2962 static const TCGTargetOpDef r = { .args_ct_str = { "r" } }; 2963 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } }; 2964 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } }; 2965 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } }; 2966 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } }; 2967 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } }; 2968 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } }; 2969 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } }; 2970 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } }; 2971 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } }; 2972 static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } }; 2973 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } }; 2974 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } }; 2975 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } }; 2976 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } }; 2977 static const TCGTargetOpDef s_L = { .args_ct_str = { "s", "L" } }; 2978 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } }; 2979 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } }; 2980 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } }; 2981 static const TCGTargetOpDef s_L_L = { .args_ct_str = { "s", "L", "L" } }; 2982 static const TCGTargetOpDef r_r_L_L 2983 = { .args_ct_str = { "r", "r", "L", "L" } }; 2984 static const TCGTargetOpDef L_L_L_L 2985 = { .args_ct_str = { "L", "L", "L", "L" } }; 2986 static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } }; 2987 static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } }; 2988 static const TCGTargetOpDef x_x_x_x 2989 = { .args_ct_str = { "x", "x", "x", "x" } }; 2990 static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } }; 2991 2992 switch (op) { 2993 case INDEX_op_goto_ptr: 2994 return &r; 2995 2996 case INDEX_op_ld8u_i32: 2997 case INDEX_op_ld8u_i64: 2998 case INDEX_op_ld8s_i32: 2999 case INDEX_op_ld8s_i64: 3000 case INDEX_op_ld16u_i32: 3001 case INDEX_op_ld16u_i64: 3002 case INDEX_op_ld16s_i32: 3003 case INDEX_op_ld16s_i64: 3004 case INDEX_op_ld_i32: 3005 case INDEX_op_ld32u_i64: 3006 case INDEX_op_ld32s_i64: 3007 case INDEX_op_ld_i64: 3008 return &r_r; 3009 3010 case INDEX_op_st8_i32: 3011 case INDEX_op_st8_i64: 3012 return &qi_r; 3013 case INDEX_op_st16_i32: 3014 case INDEX_op_st16_i64: 3015 case INDEX_op_st_i32: 3016 case INDEX_op_st32_i64: 3017 return &ri_r; 3018 case INDEX_op_st_i64: 3019 return &re_r; 3020 3021 case INDEX_op_add_i32: 3022 case INDEX_op_add_i64: 3023 return &r_r_re; 3024 case INDEX_op_sub_i32: 3025 case INDEX_op_sub_i64: 3026 case INDEX_op_mul_i32: 3027 case INDEX_op_mul_i64: 3028 case INDEX_op_or_i32: 3029 case INDEX_op_or_i64: 3030 case INDEX_op_xor_i32: 3031 case INDEX_op_xor_i64: 3032 return &r_0_re; 3033 3034 case INDEX_op_and_i32: 3035 case INDEX_op_and_i64: 3036 { 3037 static const TCGTargetOpDef and 3038 = { .args_ct_str = { "r", "0", "reZ" } }; 3039 return ∧ 3040 } 3041 break; 3042 case INDEX_op_andc_i32: 3043 case INDEX_op_andc_i64: 3044 { 3045 static const TCGTargetOpDef andc 3046 = { .args_ct_str = { "r", "r", "rI" } }; 3047 return &andc; 3048 } 3049 break; 3050 3051 
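    /*
     * With BMI2 the shifts can use SHLX/SHRX/SARX, which are
     * non-destructive three-operand instructions and accept the count in
     * any register, hence "r", "r", "ri".  Without BMI2 the classic shift
     * instructions both overwrite their first operand and require a
     * variable count in CL, hence "r", "0", "ci".
     */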
case INDEX_op_shl_i32: 3052 case INDEX_op_shl_i64: 3053 case INDEX_op_shr_i32: 3054 case INDEX_op_shr_i64: 3055 case INDEX_op_sar_i32: 3056 case INDEX_op_sar_i64: 3057 return have_bmi2 ? &r_r_ri : &r_0_ci; 3058 case INDEX_op_rotl_i32: 3059 case INDEX_op_rotl_i64: 3060 case INDEX_op_rotr_i32: 3061 case INDEX_op_rotr_i64: 3062 return &r_0_ci; 3063 3064 case INDEX_op_brcond_i32: 3065 case INDEX_op_brcond_i64: 3066 return &r_re; 3067 3068 case INDEX_op_bswap16_i32: 3069 case INDEX_op_bswap16_i64: 3070 case INDEX_op_bswap32_i32: 3071 case INDEX_op_bswap32_i64: 3072 case INDEX_op_bswap64_i64: 3073 case INDEX_op_neg_i32: 3074 case INDEX_op_neg_i64: 3075 case INDEX_op_not_i32: 3076 case INDEX_op_not_i64: 3077 case INDEX_op_extrh_i64_i32: 3078 return &r_0; 3079 3080 case INDEX_op_ext8s_i32: 3081 case INDEX_op_ext8s_i64: 3082 case INDEX_op_ext8u_i32: 3083 case INDEX_op_ext8u_i64: 3084 return &r_q; 3085 case INDEX_op_ext16s_i32: 3086 case INDEX_op_ext16s_i64: 3087 case INDEX_op_ext16u_i32: 3088 case INDEX_op_ext16u_i64: 3089 case INDEX_op_ext32s_i64: 3090 case INDEX_op_ext32u_i64: 3091 case INDEX_op_ext_i32_i64: 3092 case INDEX_op_extu_i32_i64: 3093 case INDEX_op_extrl_i64_i32: 3094 case INDEX_op_extract_i32: 3095 case INDEX_op_extract_i64: 3096 case INDEX_op_sextract_i32: 3097 case INDEX_op_ctpop_i32: 3098 case INDEX_op_ctpop_i64: 3099 return &r_r; 3100 case INDEX_op_extract2_i32: 3101 case INDEX_op_extract2_i64: 3102 return &r_0_r; 3103 3104 case INDEX_op_deposit_i32: 3105 case INDEX_op_deposit_i64: 3106 { 3107 static const TCGTargetOpDef dep 3108 = { .args_ct_str = { "Q", "0", "Q" } }; 3109 return &dep; 3110 } 3111 case INDEX_op_setcond_i32: 3112 case INDEX_op_setcond_i64: 3113 { 3114 static const TCGTargetOpDef setc 3115 = { .args_ct_str = { "q", "r", "re" } }; 3116 return &setc; 3117 } 3118 case INDEX_op_movcond_i32: 3119 case INDEX_op_movcond_i64: 3120 { 3121 static const TCGTargetOpDef movc 3122 = { .args_ct_str = { "r", "r", "re", "r", "0" } }; 3123 return &movc; 3124 } 3125 case INDEX_op_div2_i32: 3126 case INDEX_op_div2_i64: 3127 case INDEX_op_divu2_i32: 3128 case INDEX_op_divu2_i64: 3129 { 3130 static const TCGTargetOpDef div2 3131 = { .args_ct_str = { "a", "d", "0", "1", "r" } }; 3132 return &div2; 3133 } 3134 case INDEX_op_mulu2_i32: 3135 case INDEX_op_mulu2_i64: 3136 case INDEX_op_muls2_i32: 3137 case INDEX_op_muls2_i64: 3138 { 3139 static const TCGTargetOpDef mul2 3140 = { .args_ct_str = { "a", "d", "a", "r" } }; 3141 return &mul2; 3142 } 3143 case INDEX_op_add2_i32: 3144 case INDEX_op_add2_i64: 3145 case INDEX_op_sub2_i32: 3146 case INDEX_op_sub2_i64: 3147 { 3148 static const TCGTargetOpDef arith2 3149 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } }; 3150 return &arith2; 3151 } 3152 case INDEX_op_ctz_i32: 3153 case INDEX_op_ctz_i64: 3154 { 3155 static const TCGTargetOpDef ctz[2] = { 3156 { .args_ct_str = { "&r", "r", "r" } }, 3157 { .args_ct_str = { "&r", "r", "rW" } }, 3158 }; 3159 return &ctz[have_bmi1]; 3160 } 3161 case INDEX_op_clz_i32: 3162 case INDEX_op_clz_i64: 3163 { 3164 static const TCGTargetOpDef clz[2] = { 3165 { .args_ct_str = { "&r", "r", "r" } }, 3166 { .args_ct_str = { "&r", "r", "rW" } }, 3167 }; 3168 return &clz[have_lzcnt]; 3169 } 3170 3171 case INDEX_op_qemu_ld_i32: 3172 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L; 3173 case INDEX_op_qemu_st_i32: 3174 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L; 3175 case INDEX_op_qemu_st8_i32: 3176 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 
&s_L : &s_L_L; 3177 case INDEX_op_qemu_ld_i64: 3178 return (TCG_TARGET_REG_BITS == 64 ? &r_L 3179 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L 3180 : &r_r_L_L); 3181 case INDEX_op_qemu_st_i64: 3182 return (TCG_TARGET_REG_BITS == 64 ? &L_L 3183 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L 3184 : &L_L_L_L); 3185 3186 case INDEX_op_brcond2_i32: 3187 { 3188 static const TCGTargetOpDef b2 3189 = { .args_ct_str = { "r", "r", "ri", "ri" } }; 3190 return &b2; 3191 } 3192 case INDEX_op_setcond2_i32: 3193 { 3194 static const TCGTargetOpDef s2 3195 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } }; 3196 return &s2; 3197 } 3198 3199 case INDEX_op_ld_vec: 3200 case INDEX_op_st_vec: 3201 case INDEX_op_dupm_vec: 3202 return &x_r; 3203 3204 case INDEX_op_add_vec: 3205 case INDEX_op_sub_vec: 3206 case INDEX_op_mul_vec: 3207 case INDEX_op_and_vec: 3208 case INDEX_op_or_vec: 3209 case INDEX_op_xor_vec: 3210 case INDEX_op_andc_vec: 3211 case INDEX_op_ssadd_vec: 3212 case INDEX_op_usadd_vec: 3213 case INDEX_op_sssub_vec: 3214 case INDEX_op_ussub_vec: 3215 case INDEX_op_smin_vec: 3216 case INDEX_op_umin_vec: 3217 case INDEX_op_smax_vec: 3218 case INDEX_op_umax_vec: 3219 case INDEX_op_shlv_vec: 3220 case INDEX_op_shrv_vec: 3221 case INDEX_op_sarv_vec: 3222 case INDEX_op_shls_vec: 3223 case INDEX_op_shrs_vec: 3224 case INDEX_op_sars_vec: 3225 case INDEX_op_rotls_vec: 3226 case INDEX_op_cmp_vec: 3227 case INDEX_op_x86_shufps_vec: 3228 case INDEX_op_x86_blend_vec: 3229 case INDEX_op_x86_packss_vec: 3230 case INDEX_op_x86_packus_vec: 3231 case INDEX_op_x86_vperm2i128_vec: 3232 case INDEX_op_x86_punpckl_vec: 3233 case INDEX_op_x86_punpckh_vec: 3234#if TCG_TARGET_REG_BITS == 32 3235 case INDEX_op_dup2_vec: 3236#endif 3237 return &x_x_x; 3238 case INDEX_op_abs_vec: 3239 case INDEX_op_dup_vec: 3240 case INDEX_op_shli_vec: 3241 case INDEX_op_shri_vec: 3242 case INDEX_op_sari_vec: 3243 case INDEX_op_x86_psrldq_vec: 3244 return &x_x; 3245 case INDEX_op_x86_vpblendvb_vec: 3246 return &x_x_x_x; 3247 3248 default: 3249 break; 3250 } 3251 return NULL; 3252} 3253 3254int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3255{ 3256 switch (opc) { 3257 case INDEX_op_add_vec: 3258 case INDEX_op_sub_vec: 3259 case INDEX_op_and_vec: 3260 case INDEX_op_or_vec: 3261 case INDEX_op_xor_vec: 3262 case INDEX_op_andc_vec: 3263 return 1; 3264 case INDEX_op_rotli_vec: 3265 case INDEX_op_cmp_vec: 3266 case INDEX_op_cmpsel_vec: 3267 return -1; 3268 3269 case INDEX_op_shli_vec: 3270 case INDEX_op_shri_vec: 3271 /* We must expand the operation for MO_8. */ 3272 return vece == MO_8 ? -1 : 1; 3273 3274 case INDEX_op_sari_vec: 3275 /* We must expand the operation for MO_8. */ 3276 if (vece == MO_8) { 3277 return -1; 3278 } 3279 /* We can emulate this for MO_64, but it does not pay off 3280 unless we're producing at least 4 values. */ 3281 if (vece == MO_64) { 3282 return type >= TCG_TYPE_V256 ? -1 : 0; 3283 } 3284 return 1; 3285 3286 case INDEX_op_shls_vec: 3287 case INDEX_op_shrs_vec: 3288 return vece >= MO_16; 3289 case INDEX_op_sars_vec: 3290 return vece >= MO_16 && vece <= MO_32; 3291 case INDEX_op_rotls_vec: 3292 return vece >= MO_16 ? -1 : 0; 3293 3294 case INDEX_op_shlv_vec: 3295 case INDEX_op_shrv_vec: 3296 return have_avx2 && vece >= MO_32; 3297 case INDEX_op_sarv_vec: 3298 return have_avx2 && vece == MO_32; 3299 case INDEX_op_rotlv_vec: 3300 case INDEX_op_rotrv_vec: 3301 return have_avx2 && vece >= MO_32 ? 
-1 : 0; 3302 3303 case INDEX_op_mul_vec: 3304 if (vece == MO_8) { 3305 /* We can expand the operation for MO_8. */ 3306 return -1; 3307 } 3308 if (vece == MO_64) { 3309 return 0; 3310 } 3311 return 1; 3312 3313 case INDEX_op_ssadd_vec: 3314 case INDEX_op_usadd_vec: 3315 case INDEX_op_sssub_vec: 3316 case INDEX_op_ussub_vec: 3317 return vece <= MO_16; 3318 case INDEX_op_smin_vec: 3319 case INDEX_op_smax_vec: 3320 case INDEX_op_umin_vec: 3321 case INDEX_op_umax_vec: 3322 case INDEX_op_abs_vec: 3323 return vece <= MO_32; 3324 3325 default: 3326 return 0; 3327 } 3328} 3329 3330static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc, 3331 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3332{ 3333 TCGv_vec t1, t2; 3334 3335 tcg_debug_assert(vece == MO_8); 3336 3337 t1 = tcg_temp_new_vec(type); 3338 t2 = tcg_temp_new_vec(type); 3339 3340 /* 3341 * Unpack to W, shift, and repack. Tricky bits: 3342 * (1) Use punpck*bw x,x to produce DDCCBBAA, 3343 * i.e. duplicate in other half of the 16-bit lane. 3344 * (2) For right-shift, add 8 so that the high half of the lane 3345 * becomes zero. For left-shift, and left-rotate, we must 3346 * shift up and down again. 3347 * (3) Step 2 leaves high half zero such that PACKUSWB 3348 * (pack with unsigned saturation) does not modify 3349 * the quantity. 3350 */ 3351 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3352 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3353 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3354 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3355 3356 if (opc != INDEX_op_rotli_vec) { 3357 imm += 8; 3358 } 3359 if (opc == INDEX_op_shri_vec) { 3360 tcg_gen_shri_vec(MO_16, t1, t1, imm); 3361 tcg_gen_shri_vec(MO_16, t2, t2, imm); 3362 } else { 3363 tcg_gen_shli_vec(MO_16, t1, t1, imm); 3364 tcg_gen_shli_vec(MO_16, t2, t2, imm); 3365 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3366 tcg_gen_shri_vec(MO_16, t2, t2, 8); 3367 } 3368 3369 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3370 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3371 tcg_temp_free_vec(t1); 3372 tcg_temp_free_vec(t2); 3373} 3374 3375static void expand_vec_sari(TCGType type, unsigned vece, 3376 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3377{ 3378 TCGv_vec t1, t2; 3379 3380 switch (vece) { 3381 case MO_8: 3382 /* Unpack to W, shift, and repack, as in expand_vec_shi. */ 3383 t1 = tcg_temp_new_vec(type); 3384 t2 = tcg_temp_new_vec(type); 3385 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3386 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3387 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3388 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3389 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3390 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3391 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3392 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3393 tcg_temp_free_vec(t1); 3394 tcg_temp_free_vec(t2); 3395 break; 3396 3397 case MO_64: 3398 if (imm <= 32) { 3399 /* 3400 * We can emulate a small sign extend by performing an arithmetic 3401 * 32-bit shift and overwriting the high half of a 64-bit logical 3402 * shift. Note that the ISA says shift of 32 is valid, but TCG 3403 * does not, so we have to bound the smaller shift -- we get the 3404 * same result in the high half either way. 
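             *
             * Concretely, for 0 < imm <= 32:
             *   - the 64-bit logical shift gives the correct low 32 bits,
             *     since no sign bits reach the low half when imm <= 32;
             *   - psrad $MIN(imm, 31) on the high 32-bit lanes gives the
             *     correct high 32 bits (at imm == 32 the high half is all
             *     sign bits, which a shift by 31 also produces);
             *   - the 0xaa blend then keeps the odd (high) 32-bit lanes
             *     from t1 and the even (low) lanes from the logical shift.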
3405 */ 3406 t1 = tcg_temp_new_vec(type); 3407 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3408 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3409 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3410 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3411 tcgv_vec_arg(t1), 0xaa); 3412 tcg_temp_free_vec(t1); 3413 } else { 3414 /* Otherwise we will need to use a compare vs 0 to produce 3415 * the sign-extend, shift and merge. 3416 */ 3417 t1 = tcg_const_zeros_vec(type); 3418 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1); 3419 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3420 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3421 tcg_gen_or_vec(MO_64, v0, v0, t1); 3422 tcg_temp_free_vec(t1); 3423 } 3424 break; 3425 3426 default: 3427 g_assert_not_reached(); 3428 } 3429} 3430 3431static void expand_vec_rotli(TCGType type, unsigned vece, 3432 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3433{ 3434 TCGv_vec t; 3435 3436 if (vece == MO_8) { 3437 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm); 3438 return; 3439 } 3440 3441 t = tcg_temp_new_vec(type); 3442 tcg_gen_shli_vec(vece, t, v1, imm); 3443 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3444 tcg_gen_or_vec(vece, v0, v0, t); 3445 tcg_temp_free_vec(t); 3446} 3447 3448static void expand_vec_rotls(TCGType type, unsigned vece, 3449 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3450{ 3451 TCGv_i32 rsh; 3452 TCGv_vec t; 3453 3454 tcg_debug_assert(vece != MO_8); 3455 3456 t = tcg_temp_new_vec(type); 3457 rsh = tcg_temp_new_i32(); 3458 3459 tcg_gen_neg_i32(rsh, lsh); 3460 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3461 tcg_gen_shls_vec(vece, t, v1, lsh); 3462 tcg_gen_shrs_vec(vece, v0, v1, rsh); 3463 tcg_gen_or_vec(vece, v0, v0, t); 3464 tcg_temp_free_vec(t); 3465 tcg_temp_free_i32(rsh); 3466} 3467 3468static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3469 TCGv_vec v1, TCGv_vec sh, bool right) 3470{ 3471 TCGv_vec t = tcg_temp_new_vec(type); 3472 3473 tcg_gen_dupi_vec(vece, t, 8 << vece); 3474 tcg_gen_sub_vec(vece, t, t, sh); 3475 if (right) { 3476 tcg_gen_shlv_vec(vece, t, v1, t); 3477 tcg_gen_shrv_vec(vece, v0, v1, sh); 3478 } else { 3479 tcg_gen_shrv_vec(vece, t, v1, t); 3480 tcg_gen_shlv_vec(vece, v0, v1, sh); 3481 } 3482 tcg_gen_or_vec(vece, v0, v0, t); 3483 tcg_temp_free_vec(t); 3484} 3485 3486static void expand_vec_mul(TCGType type, unsigned vece, 3487 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3488{ 3489 TCGv_vec t1, t2, t3, t4, zero; 3490 3491 tcg_debug_assert(vece == MO_8); 3492 3493 /* 3494 * Unpack v1 bytes to words, 0 | x. 3495 * Unpack v2 bytes to words, y | 0. 3496 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3497 * Shift logical right by 8 bits to clear the high 8 bytes before 3498 * using an unsigned saturated pack. 3499 * 3500 * The difference between the V64, V128 and V256 cases is merely how 3501 * we distribute the expansion between temporaries. 
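     *
     * A worked example for one byte lane, x = 3, y = 5:
     *   unpack:     t1 word = 0x0003           (0 | x)
     *               t2 word = 0x0500           (y | 0)
     *   multiply:   t1 word = 0x0f00           ((x * y) << 8, 16-bit)
     *   shift >> 8: t1 word = 0x000f           (x * y, high byte clear)
     *   packus then narrows the words back to bytes; no saturation
     *   occurs since every word is <= 0xff.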
3502 */ 3503 switch (type) { 3504 case TCG_TYPE_V64: 3505 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 3506 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 3507 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3508 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3509 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3510 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3511 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3512 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3513 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3514 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 3515 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 3516 tcg_temp_free_vec(t1); 3517 tcg_temp_free_vec(t2); 3518 break; 3519 3520 case TCG_TYPE_V128: 3521 case TCG_TYPE_V256: 3522 t1 = tcg_temp_new_vec(type); 3523 t2 = tcg_temp_new_vec(type); 3524 t3 = tcg_temp_new_vec(type); 3525 t4 = tcg_temp_new_vec(type); 3526 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0); 3527 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3528 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3529 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3530 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3531 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3532 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero)); 3533 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3534 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2)); 3535 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3536 tcg_gen_mul_vec(MO_16, t3, t3, t4); 3537 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3538 tcg_gen_shri_vec(MO_16, t3, t3, 8); 3539 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3540 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 3541 tcg_temp_free_vec(t1); 3542 tcg_temp_free_vec(t2); 3543 tcg_temp_free_vec(t3); 3544 tcg_temp_free_vec(t4); 3545 break; 3546 3547 default: 3548 g_assert_not_reached(); 3549 } 3550} 3551 3552static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, 3553 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3554{ 3555 enum { 3556 NEED_INV = 1, 3557 NEED_SWAP = 2, 3558 NEED_BIAS = 4, 3559 NEED_UMIN = 8, 3560 NEED_UMAX = 16, 3561 }; 3562 TCGv_vec t1, t2, t3; 3563 uint8_t fixup; 3564 3565 switch (cond) { 3566 case TCG_COND_EQ: 3567 case TCG_COND_GT: 3568 fixup = 0; 3569 break; 3570 case TCG_COND_NE: 3571 case TCG_COND_LE: 3572 fixup = NEED_INV; 3573 break; 3574 case TCG_COND_LT: 3575 fixup = NEED_SWAP; 3576 break; 3577 case TCG_COND_GE: 3578 fixup = NEED_SWAP | NEED_INV; 3579 break; 3580 case TCG_COND_LEU: 3581 if (vece <= MO_32) { 3582 fixup = NEED_UMIN; 3583 } else { 3584 fixup = NEED_BIAS | NEED_INV; 3585 } 3586 break; 3587 case TCG_COND_GTU: 3588 if (vece <= MO_32) { 3589 fixup = NEED_UMIN | NEED_INV; 3590 } else { 3591 fixup = NEED_BIAS; 3592 } 3593 break; 3594 case TCG_COND_GEU: 3595 if (vece <= MO_32) { 3596 fixup = NEED_UMAX; 3597 } else { 3598 fixup = NEED_BIAS | NEED_SWAP | NEED_INV; 3599 } 3600 break; 3601 case TCG_COND_LTU: 3602 if (vece <= MO_32) { 3603 fixup = NEED_UMAX | NEED_INV; 3604 } else { 3605 fixup = NEED_BIAS | NEED_SWAP; 3606 } 3607 break; 3608 default: 3609 g_assert_not_reached(); 3610 } 3611 3612 if (fixup & NEED_INV) { 3613 cond = tcg_invert_cond(cond); 3614 } 3615 if (fixup & NEED_SWAP) { 3616 t1 = v1, v1 = v2, v2 = t1; 3617 cond = tcg_swap_cond(cond); 3618 } 3619 3620 t1 = t2 = NULL; 3621 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3622 t1 = tcg_temp_new_vec(type); 3623 if (fixup & NEED_UMIN) { 3624 tcg_gen_umin_vec(vece, t1, v1, v2); 3625 } else { 3626 tcg_gen_umax_vec(vece, t1, v1, v2); 3627 } 3628 v2 
= t1; 3629 cond = TCG_COND_EQ; 3630 } else if (fixup & NEED_BIAS) { 3631 t1 = tcg_temp_new_vec(type); 3632 t2 = tcg_temp_new_vec(type); 3633 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1)); 3634 tcg_gen_sub_vec(vece, t1, v1, t3); 3635 tcg_gen_sub_vec(vece, t2, v2, t3); 3636 v1 = t1; 3637 v2 = t2; 3638 cond = tcg_signed_cond(cond); 3639 } 3640 3641 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); 3642 /* Expand directly; do not recurse. */ 3643 vec_gen_4(INDEX_op_cmp_vec, type, vece, 3644 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); 3645 3646 if (t1) { 3647 tcg_temp_free_vec(t1); 3648 if (t2) { 3649 tcg_temp_free_vec(t2); 3650 } 3651 } 3652 return fixup & NEED_INV; 3653} 3654 3655static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, 3656 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3657{ 3658 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) { 3659 tcg_gen_not_vec(vece, v0, v0); 3660 } 3661} 3662 3663static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0, 3664 TCGv_vec c1, TCGv_vec c2, 3665 TCGv_vec v3, TCGv_vec v4, TCGCond cond) 3666{ 3667 TCGv_vec t = tcg_temp_new_vec(type); 3668 3669 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) { 3670 /* Invert the sense of the compare by swapping arguments. */ 3671 TCGv_vec x; 3672 x = v3, v3 = v4, v4 = x; 3673 } 3674 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece, 3675 tcgv_vec_arg(v0), tcgv_vec_arg(v4), 3676 tcgv_vec_arg(v3), tcgv_vec_arg(t)); 3677 tcg_temp_free_vec(t); 3678} 3679 3680void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 3681 TCGArg a0, ...) 3682{ 3683 va_list va; 3684 TCGArg a2; 3685 TCGv_vec v0, v1, v2, v3, v4; 3686 3687 va_start(va, a0); 3688 v0 = temp_tcgv_vec(arg_temp(a0)); 3689 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3690 a2 = va_arg(va, TCGArg); 3691 3692 switch (opc) { 3693 case INDEX_op_shli_vec: 3694 case INDEX_op_shri_vec: 3695 expand_vec_shi(type, vece, opc, v0, v1, a2); 3696 break; 3697 3698 case INDEX_op_sari_vec: 3699 expand_vec_sari(type, vece, v0, v1, a2); 3700 break; 3701 3702 case INDEX_op_rotli_vec: 3703 expand_vec_rotli(type, vece, v0, v1, a2); 3704 break; 3705 3706 case INDEX_op_rotls_vec: 3707 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 3708 break; 3709 3710 case INDEX_op_rotlv_vec: 3711 v2 = temp_tcgv_vec(arg_temp(a2)); 3712 expand_vec_rotv(type, vece, v0, v1, v2, false); 3713 break; 3714 case INDEX_op_rotrv_vec: 3715 v2 = temp_tcgv_vec(arg_temp(a2)); 3716 expand_vec_rotv(type, vece, v0, v1, v2, true); 3717 break; 3718 3719 case INDEX_op_mul_vec: 3720 v2 = temp_tcgv_vec(arg_temp(a2)); 3721 expand_vec_mul(type, vece, v0, v1, v2); 3722 break; 3723 3724 case INDEX_op_cmp_vec: 3725 v2 = temp_tcgv_vec(arg_temp(a2)); 3726 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); 3727 break; 3728 3729 case INDEX_op_cmpsel_vec: 3730 v2 = temp_tcgv_vec(arg_temp(a2)); 3731 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3732 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3733 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg)); 3734 break; 3735 3736 default: 3737 break; 3738 } 3739 3740 va_end(va); 3741} 3742 3743static const int tcg_target_callee_save_regs[] = { 3744#if TCG_TARGET_REG_BITS == 64 3745 TCG_REG_RBP, 3746 TCG_REG_RBX, 3747#if defined(_WIN64) 3748 TCG_REG_RDI, 3749 TCG_REG_RSI, 3750#endif 3751 TCG_REG_R12, 3752 TCG_REG_R13, 3753 TCG_REG_R14, /* Currently used for the global env. 
*/ 3754 TCG_REG_R15, 3755#else 3756 TCG_REG_EBP, /* Currently used for the global env. */ 3757 TCG_REG_EBX, 3758 TCG_REG_ESI, 3759 TCG_REG_EDI, 3760#endif 3761}; 3762 3763/* Compute frame size via macros, to share between tcg_target_qemu_prologue 3764 and tcg_register_jit. */ 3765 3766#define PUSH_SIZE \ 3767 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 3768 * (TCG_TARGET_REG_BITS / 8)) 3769 3770#define FRAME_SIZE \ 3771 ((PUSH_SIZE \ 3772 + TCG_STATIC_CALL_ARGS_SIZE \ 3773 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 3774 + TCG_TARGET_STACK_ALIGN - 1) \ 3775 & ~(TCG_TARGET_STACK_ALIGN - 1)) 3776 3777/* Generate global QEMU prologue and epilogue code */ 3778static void tcg_target_qemu_prologue(TCGContext *s) 3779{ 3780 int i, stack_addend; 3781 3782 /* TB prologue */ 3783 3784 /* Reserve some stack space, also for TCG temps. */ 3785 stack_addend = FRAME_SIZE - PUSH_SIZE; 3786 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 3787 CPU_TEMP_BUF_NLONGS * sizeof(long)); 3788 3789 /* Save all callee saved registers. */ 3790 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 3791 tcg_out_push(s, tcg_target_callee_save_regs[i]); 3792 } 3793 3794#if TCG_TARGET_REG_BITS == 32 3795 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 3796 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 3797 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3798 /* jmp *tb. */ 3799 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, 3800 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 3801 + stack_addend); 3802#else 3803# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64 3804 if (guest_base) { 3805 int seg = setup_guest_base_seg(); 3806 if (seg != 0) { 3807 x86_guest_base_seg = seg; 3808 } else if (guest_base == (int32_t)guest_base) { 3809 x86_guest_base_offset = guest_base; 3810 } else { 3811 /* Choose R12 because, as a base, it requires a SIB byte. */ 3812 x86_guest_base_index = TCG_REG_R12; 3813 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base); 3814 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index); 3815 } 3816 } 3817# endif 3818 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); 3819 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3820 /* jmp *tb. */ 3821 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); 3822#endif 3823 3824 /* 3825 * Return path for goto_ptr. Set return value to 0, a-la exit_tb, 3826 * and fall through to the rest of the epilogue. 3827 */ 3828 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); 3829 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 3830 3831 /* TB epilogue */ 3832 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 3833 3834 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 3835 3836 if (have_avx2) { 3837 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 3838 } 3839 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 3840 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 3841 } 3842 tcg_out_opc(s, OPC_RET, 0, 0, 0); 3843} 3844 3845static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 3846{ 3847 memset(p, 0x90, count); 3848} 3849 3850static void tcg_target_init(TCGContext *s) 3851{ 3852#ifdef CONFIG_CPUID_H 3853 unsigned a, b, c, d, b7 = 0; 3854 int max = __get_cpuid_max(0, 0); 3855 3856 if (max >= 7) { 3857 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. 
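           Both BMI1 and BMI2 (and AVX2, tested further down) are reported
           in CPUID leaf 7, sub-leaf 0, EBX, which is why b7 is captured
           here and reused for the AVX2 check below.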
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base_seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base_offset = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte.  */
            x86_guest_base_index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
        }
    }
# endif
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr.  Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);  /* 0x90 is the one-byte x86 NOP */
}

static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
    unsigned a, b, c, d, b7 = 0;
    int max = __get_cpuid_max(0, 0);

    if (max >= 7) {
        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
        __cpuid_count(7, 0, a, b7, c, d);
        have_bmi1 = (b7 & bit_BMI) != 0;
        have_bmi2 = (b7 & bit_BMI2) != 0;
    }

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch.  */
        have_cmov = (d & bit_CMOV) != 0;
#endif

        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;

        /* There are a number of things we must check before we can be
           sure of not hitting an invalid opcode.  */
        if (c & bit_OSXSAVE) {
            unsigned xcrl, xcrh;
            /* The xgetbv instruction is not available to older versions of
             * the assembler, so we encode the instruction manually.
             */
            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
            /* XCR0 bits 1 and 2 must both be set, i.e. the OS has enabled
               saving of the SSE and AVX register state.  */
            if ((xcrl & 6) == 6) {
                have_avx1 = (c & bit_AVX) != 0;
                have_avx2 = (b7 & bit_AVX2) != 0;
            }
        }
    }

    max = __get_cpuid_max(0x80000000, 0);
    if (max >= 0x80000001) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
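/*
 * For example, a FRAME_SIZE of 0x1a8 (an illustrative value) is encoded
 * below as the uleb128 byte pair 0xa8 0x03: the low seven bits with the
 * continuation bit set, followed by the remaining high bits.  The build
 * assertion above guarantees that two bytes are always enough.
 */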
#if !defined(__ELF__)
    /* Host machine without ELF.  */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif