/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed. */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_movbe;

#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        tcg_abort();
    }
    return true;
}

#if TCG_TARGET_REG_BITS == 64
#define ALL_GENERAL_REGS 0x0000ffffu
#define ALL_VECTOR_REGS  0xffff0000u
#else
#define ALL_GENERAL_REGS 0x000000ffu
#define ALL_VECTOR_REGS  0x00ff0000u
#endif

/* parse target specific constraints */
static const char *target_parse_constraint(TCGArgConstraint *ct,
                                           const char *ct_str, TCGType type)
{
    switch(*ct_str++) {
    case 'a':
        tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
        break;
    case 'b':
        tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
        break;
    case 'c':
        tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
        break;
    case 'd':
        tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
        break;
    case 'S':
        tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
        break;
    case 'D':
        tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
        break;
    case 'q':
        /* A register that can be used as a byte operand.  */
        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
        break;
    case 'Q':
        /* A register with an addressable second byte (e.g. %ah).  */
        ct->regs = 0xf;
        break;
    case 'r':
        /* A general register.  */
        ct->regs |= ALL_GENERAL_REGS;
        break;
    case 'W':
        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
        ct->ct |= TCG_CT_CONST_WSZ;
        break;
    case 'x':
        /* A vector register.  */
        ct->regs |= ALL_VECTOR_REGS;
        break;

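    /* The literal masks used by the constraint letters in this switch
       name registers by index, like ALL_GENERAL_REGS above: 0xffff is
       all 16 general registers on x86_64, 0xff all 8 on i386, and 0xf
       only the byte-addressable %eax/%ecx/%edx/%ebx.  */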
    case 'L':
        /* qemu_ld/st data+address constraint */
        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
#ifdef CONFIG_SOFTMMU
        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
#endif
        break;
    case 's':
        /* qemu_st8_i32 data constraint */
        ct->regs = 0xf;
#ifdef CONFIG_SOFTMMU
        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
#endif
        break;

    case 'e':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
        break;
    case 'Z':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
        break;
    case 'I':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
        break;

    default:
        return NULL;
    }
    return ct_str;
}

/* test if a constant matches the constraint */
static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
                                         const TCGArgConstraint *arg_ct)
{
    int ct = arg_ct->ct;
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x) ((x) & 7)

#define P_EXT 0x100 /* 0x0f opcode prefix */
#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
#define P_DATA16 0x400 /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW 0x1000 /* Set REX.W = 1 */
# define P_REXB_R 0x2000 /* REG field as byte register */
# define P_REXB_RM 0x4000 /* R/M field as byte register */
# define P_GS 0x8000 /* gs segment override */
#else
# define P_REXW 0
# define P_REXB_R 0
# define P_REXB_RM 0
# define P_GS 0
#endif
#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
#define P_VEXL 0x80000 /* Set VEX.L = 1 */

#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN (0xf2 | P_EXT38)
#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF (0xbc | P_EXT)
#define OPC_BSR (0xbd | P_EXT)
#define OPC_BSWAP (0xc8 | P_EXT)
#define OPC_CALL_Jz (0xe8)
#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32 (0x48)
#define OPC_IMUL_GvEv (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32 (0x40)
#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
#define OPC_JCC_short (0x70) /* ... plus condition code */
#define OPC_JMP_long (0xe9)
#define OPC_JMP_short (0xeb)
#define OPC_LEA (0x8d)
#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv (0x88) /* stores, more or less */
#define OPC_MOVL_EvGv (0x89) /* stores, more or less */
#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
#define OPC_MOVB_EvIz (0xc6)
#define OPC_MOVL_EvIz (0xc7)
#define OPC_MOVL_Iv (0xb8)
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
#define OPC_PAND (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32 (0x50)
#define OPC_PUSH_Iv (0x68)
#define OPC_PUSH_Ib (0x6a)
#define OPC_RET (0xc3)
#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1 (0xd1)
#define OPC_SHIFT_Ib (0xc1)
#define OPC_SHIFT_cl (0xd3)
#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS (0xc6 | P_EXT)
#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib (0xac | P_EXT)
#define OPC_TESTL (0x85)
#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2 (0x0b | P_EXT)
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

#define OPC_GRP3_Ev (0xf7)
#define OPC_GRP5 (0xff)
#define OPC_GRP14 (0x73 | P_EXT | P_DATA16)

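/* For illustration: OPC_MOVZWL is (0xb7 | P_EXT), so tcg_out_opc emits the
   0x0f escape byte followed by 0xb7; OR-ing P_REXW into an opcode requests
   an additional REX.W prefix (64-bit operand size) on x86_64.  */
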
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_NOT  2
#define EXT3_NEG  3
#define EXT3_MUL  4
#define EXT3_IMUL 5
#define EXT3_DIV  6
#define EXT3_IDIV 7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev   0
#define EXT5_DEC_Ev   1
#define EXT5_CALLN_Ev 2
#define EXT5_JMPN_Ev  4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);        /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);       /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);   /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);      /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);    /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                        /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                        /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                        /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;               /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM or INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

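/* For example, tgen_arithr(s, ARITH_SUB + P_REXW, TCG_REG_RAX, TCG_REG_RBX)
   below produces "subq %rbx, %rax": REX.W (0x48), opcode 0x2b
   (OPC_ARITH_GvEv + (ARITH_SUB << 3)), ModRM 0xc3.  */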
/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

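/* Load a constant, replicated across all lanes, into a vector register.
   Zero and all-ones use single-instruction idioms (PXOR / PCMPEQB of the
   register with itself); other values go through the constant pool and
   are broadcast from memory.  */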
static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 64) {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            break;
        }
        /* fallthru */
    case TCG_TYPE_V64:
    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16);
        tcg_out_dupi_vec(s, type, ret, arg);
        return;
    default:
        g_assert_not_reached();
    }

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

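/* Emit "r0 C= val" for a group 1 ALU operation C with an immediate operand.
   A non-zero CF means the carry flag produced by the operation is live
   (e.g. between the halves of an add2/sub2), which rules out the INC/DEC
   shortcut below, since INC and DEC do not update CF.  */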
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                tcg_abort();
            }
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross-basic-block temporaries */
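/* The double-word compare-and-branch below works high word first: compare
   the high words with the signedness of the requested condition and branch
   to the target when that alone decides it, skip past the low-word test
   when the high words differ, and otherwise compare the low words unsigned.
   EQ and NE need only equality tests on both halves.  */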
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 1, dest);
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

#if defined(CONFIG_SOFTMMU)
#include "../tcg-ldst.c.inc"

/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB] = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ] = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ] = helper_be_ldq_mmu,
};

/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB] = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ] = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ] = helper_be_stq_mmu,
};

/* Perform the TLB load and compare.

   Inputs:
   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX and S_BITS are the memory context and log2 size of the load.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

   Second argument register is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.

   First argument register is clobbered.  */
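
/* Illustrative shape of the fast path emitted below, for a 64-bit guest on
   a 64-bit host (register and offset names are placeholders):

       movq    addrlo, %r0
       shrq    $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), %r0
       andq    mask_offset(%areg0), %r0
       addq    table_offset(%areg0), %r0     # r0 = &CPUTLBEntry
       leaq    s_mask-a_mask(addrlo), %r1    # or movq when alignment suffices
       andq    $(TARGET_PAGE_MASK | a_mask), %r1
       cmpq    which(%r0), %r1
       movq    addrlo, %r1
       jne     slow_path
       addq    addend(%r0), %r1              # r1 = host address  */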

static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, MemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
{
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;

    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    tcg_out_mov(s, tlbtype, r0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, mask));

    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, table));

    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
    }
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);

    /* cmp 0(r0), r1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);

    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  */
    tcg_out_mov(s, ttype, r1, addrlo);

    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        /* cmp 4(r0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */

    /* add addend(r0), r1 */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                         offsetof(CPUTLBEntry, addend));
}

/*
 * Record the context of a call to the out of line helper code for the slow path
 * for a load or store, so that we can later generate the correct helper code
 */
static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
                                TCGMemOpIdx oi,
                                TCGReg datalo, TCGReg datahi,
                                TCGReg addrlo, TCGReg addrhi,
                                tcg_insn_unit *raddr,
                                tcg_insn_unit **label_ptr)
{
    TCGLabelQemuLdst *label = new_ldst_label(s);

    label->is_ld = is_ld;
    label->oi = oi;
    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
    label->datalo_reg = datalo;
    label->datahi_reg = datahi;
    label->addrlo_reg = addrlo;
    label->addrhi_reg = addrhi;
    /* TODO: Cast goes away when all hosts converted */
    label->raddr = (void *)tcg_splitwx_to_rx(raddr);
    label->label_ptr[0] = label_ptr[0];
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        label->label_ptr[1] = label_ptr[1];
    }
}

/*
 * Generate code for the slow path for a load at the end of the block
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    TCGMemOpIdx oi = l->oi;
    MemOp opc = get_memop(oi);
    TCGReg data_reg;
    tcg_insn_unit **label_ptr = &l->label_ptr[0];
    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    if (TCG_TARGET_REG_BITS == 32) {
        int ofs = 0;

        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
        ofs += 4;

        if (TARGET_LONG_BITS == 64) {
            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
            ofs += 4;
        }

        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
        /* The second argument is already loaded with addrlo.  */
        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
                     (uintptr_t)l->raddr);
    }

    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);

    data_reg = l->datalo_reg;
    switch (opc & MO_SSIZE) {
    case MO_SB:
        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
        break;
    case MO_SW:
        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
        break;
#endif
    case MO_UB:
    case MO_UW:
        /* Note that the helpers have zero-extended to tcg_target_long.  */
*/ 1867 case MO_UL: 1868 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); 1869 break; 1870 case MO_Q: 1871 if (TCG_TARGET_REG_BITS == 64) { 1872 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX); 1873 } else if (data_reg == TCG_REG_EDX) { 1874 /* xchg %edx, %eax */ 1875 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); 1876 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX); 1877 } else { 1878 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); 1879 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX); 1880 } 1881 break; 1882 default: 1883 tcg_abort(); 1884 } 1885 1886 /* Jump to the code corresponding to next IR of qemu_st */ 1887 tcg_out_jmp(s, l->raddr); 1888 return true; 1889} 1890 1891/* 1892 * Generate code for the slow path for a store at the end of block 1893 */ 1894static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 1895{ 1896 TCGMemOpIdx oi = l->oi; 1897 MemOp opc = get_memop(oi); 1898 MemOp s_bits = opc & MO_SIZE; 1899 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 1900 TCGReg retaddr; 1901 1902 /* resolve label address */ 1903 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 1904 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { 1905 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 1906 } 1907 1908 if (TCG_TARGET_REG_BITS == 32) { 1909 int ofs = 0; 1910 1911 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs); 1912 ofs += 4; 1913 1914 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs); 1915 ofs += 4; 1916 1917 if (TARGET_LONG_BITS == 64) { 1918 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs); 1919 ofs += 4; 1920 } 1921 1922 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs); 1923 ofs += 4; 1924 1925 if (s_bits == MO_64) { 1926 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs); 1927 ofs += 4; 1928 } 1929 1930 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs); 1931 ofs += 4; 1932 1933 retaddr = TCG_REG_EAX; 1934 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 1935 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs); 1936 } else { 1937 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); 1938 /* The second argument is already loaded with addrlo. */ 1939 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32), 1940 tcg_target_call_iarg_regs[2], l->datalo_reg); 1941 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi); 1942 1943 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) { 1944 retaddr = tcg_target_call_iarg_regs[4]; 1945 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 1946 } else { 1947 retaddr = TCG_REG_RAX; 1948 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr); 1949 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, 1950 TCG_TARGET_CALL_STACK_OFFSET); 1951 } 1952 } 1953 1954 /* "Tail call" to the helper, with the return address back inline. 
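       Pushing retaddr supplies the return address for the jump below, so the helper's own return instruction resumes execution at l->raddr in the translated code.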
*/ 1955 tcg_out_push(s, retaddr); 1956 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]); 1957 return true; 1958} 1959#elif TCG_TARGET_REG_BITS == 32 1960# define x86_guest_base_seg 0 1961# define x86_guest_base_index -1 1962# define x86_guest_base_offset guest_base 1963#else 1964static int x86_guest_base_seg; 1965static int x86_guest_base_index = -1; 1966static int32_t x86_guest_base_offset; 1967# if defined(__x86_64__) && defined(__linux__) 1968# include <asm/prctl.h> 1969# include <sys/prctl.h> 1970int arch_prctl(int code, unsigned long addr); 1971static inline int setup_guest_base_seg(void) 1972{ 1973 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 1974 return P_GS; 1975 } 1976 return 0; 1977} 1978# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__) 1979# include <machine/sysarch.h> 1980static inline int setup_guest_base_seg(void) 1981{ 1982 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 1983 return P_GS; 1984 } 1985 return 0; 1986} 1987# else 1988static inline int setup_guest_base_seg(void) 1989{ 1990 return 0; 1991} 1992# endif 1993#endif /* SOFTMMU */ 1994 1995static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 1996 TCGReg base, int index, intptr_t ofs, 1997 int seg, bool is64, MemOp memop) 1998{ 1999 bool use_movbe = false; 2000 int rexw = is64 * P_REXW; 2001 int movop = OPC_MOVL_GvEv; 2002 2003 /* Do big-endian loads with movbe. */ 2004 if (memop & MO_BSWAP) { 2005 tcg_debug_assert(have_movbe); 2006 use_movbe = true; 2007 movop = OPC_MOVBE_GyMy; 2008 } 2009 2010 switch (memop & MO_SSIZE) { 2011 case MO_UB: 2012 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo, 2013 base, index, 0, ofs); 2014 break; 2015 case MO_SB: 2016 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo, 2017 base, index, 0, ofs); 2018 break; 2019 case MO_UW: 2020 if (use_movbe) { 2021 /* There is no extending movbe; only low 16-bits are modified. */ 2022 if (datalo != base && datalo != index) { 2023 /* XOR breaks dependency chains. 
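       Zeroing datalo first also provides the zero-extension that the 16-bit movbe below does not perform.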
*/ 2024 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2025 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2026 datalo, base, index, 0, ofs); 2027 } else { 2028 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2029 datalo, base, index, 0, ofs); 2030 tcg_out_ext16u(s, datalo, datalo); 2031 } 2032 } else { 2033 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo, 2034 base, index, 0, ofs); 2035 } 2036 break; 2037 case MO_SW: 2038 if (use_movbe) { 2039 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, 2040 datalo, base, index, 0, ofs); 2041 tcg_out_ext16s(s, datalo, datalo, rexw); 2042 } else { 2043 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg, 2044 datalo, base, index, 0, ofs); 2045 } 2046 break; 2047 case MO_UL: 2048 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs); 2049 break; 2050#if TCG_TARGET_REG_BITS == 64 2051 case MO_SL: 2052 if (use_movbe) { 2053 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo, 2054 base, index, 0, ofs); 2055 tcg_out_ext32s(s, datalo, datalo); 2056 } else { 2057 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo, 2058 base, index, 0, ofs); 2059 } 2060 break; 2061#endif 2062 case MO_Q: 2063 if (TCG_TARGET_REG_BITS == 64) { 2064 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo, 2065 base, index, 0, ofs); 2066 } else { 2067 if (use_movbe) { 2068 TCGReg t = datalo; 2069 datalo = datahi; 2070 datahi = t; 2071 } 2072 if (base != datalo) { 2073 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2074 base, index, 0, ofs); 2075 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2076 base, index, 0, ofs + 4); 2077 } else { 2078 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2079 base, index, 0, ofs + 4); 2080 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2081 base, index, 0, ofs); 2082 } 2083 } 2084 break; 2085 default: 2086 g_assert_not_reached(); 2087 } 2088} 2089 2090/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and 2091 EAX. It will be useful once fixed registers globals are less 2092 common. */ 2093static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64) 2094{ 2095 TCGReg datalo, datahi, addrlo; 2096 TCGReg addrhi __attribute__((unused)); 2097 TCGMemOpIdx oi; 2098 MemOp opc; 2099#if defined(CONFIG_SOFTMMU) 2100 int mem_index; 2101 tcg_insn_unit *label_ptr[2]; 2102#endif 2103 2104 datalo = *args++; 2105 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0); 2106 addrlo = *args++; 2107 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0); 2108 oi = *args++; 2109 opc = get_memop(oi); 2110 2111#if defined(CONFIG_SOFTMMU) 2112 mem_index = get_mmuidx(oi); 2113 2114 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, 2115 label_ptr, offsetof(CPUTLBEntry, addr_read)); 2116 2117 /* TLB Hit. */ 2118 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc); 2119 2120 /* Record the current context of a load into ldst label */ 2121 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi, 2122 s->code_ptr, label_ptr); 2123#else 2124 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index, 2125 x86_guest_base_offset, x86_guest_base_seg, 2126 is64, opc); 2127#endif 2128} 2129 2130static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2131 TCGReg base, int index, intptr_t ofs, 2132 int seg, MemOp memop) 2133{ 2134 bool use_movbe = false; 2135 int movop = OPC_MOVL_EvGv; 2136 2137 /* 2138 * Do big-endian stores with movbe or softmmu. 
2139 * User-only without movbe will have its swapping done generically. 2140 */ 2141 if (memop & MO_BSWAP) { 2142 tcg_debug_assert(have_movbe); 2143 use_movbe = true; 2144 movop = OPC_MOVBE_MyGy; 2145 } 2146 2147 switch (memop & MO_SIZE) { 2148 case MO_8: 2149 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */ 2150 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2151 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg, 2152 datalo, base, index, 0, ofs); 2153 break; 2154 case MO_16: 2155 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo, 2156 base, index, 0, ofs); 2157 break; 2158 case MO_32: 2159 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs); 2160 break; 2161 case MO_64: 2162 if (TCG_TARGET_REG_BITS == 64) { 2163 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo, 2164 base, index, 0, ofs); 2165 } else { 2166 if (use_movbe) { 2167 TCGReg t = datalo; 2168 datalo = datahi; 2169 datahi = t; 2170 } 2171 tcg_out_modrm_sib_offset(s, movop + seg, datalo, 2172 base, index, 0, ofs); 2173 tcg_out_modrm_sib_offset(s, movop + seg, datahi, 2174 base, index, 0, ofs + 4); 2175 } 2176 break; 2177 default: 2178 g_assert_not_reached(); 2179 } 2180} 2181 2182static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) 2183{ 2184 TCGReg datalo, datahi, addrlo; 2185 TCGReg addrhi __attribute__((unused)); 2186 TCGMemOpIdx oi; 2187 MemOp opc; 2188#if defined(CONFIG_SOFTMMU) 2189 int mem_index; 2190 tcg_insn_unit *label_ptr[2]; 2191#endif 2192 2193 datalo = *args++; 2194 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0); 2195 addrlo = *args++; 2196 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0); 2197 oi = *args++; 2198 opc = get_memop(oi); 2199 2200#if defined(CONFIG_SOFTMMU) 2201 mem_index = get_mmuidx(oi); 2202 2203 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, 2204 label_ptr, offsetof(CPUTLBEntry, addr_write)); 2205 2206 /* TLB Hit. */ 2207 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc); 2208 2209 /* Record the current context of a store into ldst label */ 2210 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi, 2211 s->code_ptr, label_ptr); 2212#else 2213 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index, 2214 x86_guest_base_offset, x86_guest_base_seg, opc); 2215#endif 2216} 2217 2218static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2219 const TCGArg *args, const int *const_args) 2220{ 2221 TCGArg a0, a1, a2; 2222 int c, const_a2, vexop, rexw = 0; 2223 2224#if TCG_TARGET_REG_BITS == 64 2225# define OP_32_64(x) \ 2226 case glue(glue(INDEX_op_, x), _i64): \ 2227 rexw = P_REXW; /* FALLTHRU */ \ 2228 case glue(glue(INDEX_op_, x), _i32) 2229#else 2230# define OP_32_64(x) \ 2231 case glue(glue(INDEX_op_, x), _i32) 2232#endif 2233 2234 /* Hoist the loads of the most common arguments. */ 2235 a0 = args[0]; 2236 a1 = args[1]; 2237 a2 = args[2]; 2238 const_a2 = const_args[2]; 2239 2240 switch (opc) { 2241 case INDEX_op_exit_tb: 2242 /* Reuse the zeroing that exists for goto_ptr. 
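       An exit_tb with a zero return value can jump straight to tcg_code_gen_epilogue, which already loads 0 into EAX before falling through to the epilogue.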
*/ 2243 if (a0 == 0) { 2244 tcg_out_jmp(s, tcg_code_gen_epilogue); 2245 } else { 2246 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2247 tcg_out_jmp(s, tb_ret_addr); 2248 } 2249 break; 2250 case INDEX_op_goto_tb: 2251 if (s->tb_jmp_insn_offset) { 2252 /* direct jump method */ 2253 int gap; 2254 /* jump displacement must be aligned for atomic patching; 2255 * see if we need to add extra nops before jump 2256 */ 2257 gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2258 if (gap != 1) { 2259 tcg_out_nopn(s, gap - 1); 2260 } 2261 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2262 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); 2263 tcg_out32(s, 0); 2264 } else { 2265 /* indirect jump method */ 2266 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1, 2267 (intptr_t)(s->tb_jmp_target_addr + a0)); 2268 } 2269 set_jmp_reset_offset(s, a0); 2270 break; 2271 case INDEX_op_goto_ptr: 2272 /* jmp to the given host address (could be epilogue) */ 2273 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2274 break; 2275 case INDEX_op_br: 2276 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2277 break; 2278 OP_32_64(ld8u): 2279 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2280 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2281 break; 2282 OP_32_64(ld8s): 2283 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2284 break; 2285 OP_32_64(ld16u): 2286 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2287 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2288 break; 2289 OP_32_64(ld16s): 2290 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2291 break; 2292#if TCG_TARGET_REG_BITS == 64 2293 case INDEX_op_ld32u_i64: 2294#endif 2295 case INDEX_op_ld_i32: 2296 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2297 break; 2298 2299 OP_32_64(st8): 2300 if (const_args[0]) { 2301 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2302 tcg_out8(s, a0); 2303 } else { 2304 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2305 } 2306 break; 2307 OP_32_64(st16): 2308 if (const_args[0]) { 2309 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2310 tcg_out16(s, a0); 2311 } else { 2312 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2313 } 2314 break; 2315#if TCG_TARGET_REG_BITS == 64 2316 case INDEX_op_st32_i64: 2317#endif 2318 case INDEX_op_st_i32: 2319 if (const_args[0]) { 2320 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2321 tcg_out32(s, a0); 2322 } else { 2323 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2324 } 2325 break; 2326 2327 OP_32_64(add): 2328 /* For 3-operand addition, use LEA. */ 2329 if (a0 != a1) { 2330 TCGArg c3 = 0; 2331 if (const_a2) { 2332 c3 = a2, a2 = -1; 2333 } else if (a0 == a2) { 2334 /* Watch out for dest = src + dest, since we've removed 2335 the matching constraint on the add. */ 2336 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2337 break; 2338 } 2339 2340 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2341 break; 2342 } 2343 c = ARITH_ADD; 2344 goto gen_arith; 2345 OP_32_64(sub): 2346 c = ARITH_SUB; 2347 goto gen_arith; 2348 OP_32_64(and): 2349 c = ARITH_AND; 2350 goto gen_arith; 2351 OP_32_64(or): 2352 c = ARITH_OR; 2353 goto gen_arith; 2354 OP_32_64(xor): 2355 c = ARITH_XOR; 2356 goto gen_arith; 2357 gen_arith: 2358 if (const_a2) { 2359 tgen_arithi(s, c + rexw, a0, a2, 0); 2360 } else { 2361 tgen_arithr(s, c + rexw, a0, a2); 2362 } 2363 break; 2364 2365 OP_32_64(andc): 2366 if (const_a2) { 2367 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2368 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2369 } else { 2370 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2371 } 2372 break; 2373 2374 OP_32_64(mul): 2375 if (const_a2) { 2376 int32_t val; 2377 val = a2; 2378 if (val == (int8_t)val) { 2379 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2380 tcg_out8(s, val); 2381 } else { 2382 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2383 tcg_out32(s, val); 2384 } 2385 } else { 2386 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2387 } 2388 break; 2389 2390 OP_32_64(div2): 2391 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2392 break; 2393 OP_32_64(divu2): 2394 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2395 break; 2396 2397 OP_32_64(shl): 2398 /* For small constant 3-operand shift, use LEA. */ 2399 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2400 if (a2 - 1 == 0) { 2401 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2402 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2403 } else { 2404 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2405 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2406 } 2407 break; 2408 } 2409 c = SHIFT_SHL; 2410 vexop = OPC_SHLX; 2411 goto gen_shift_maybe_vex; 2412 OP_32_64(shr): 2413 c = SHIFT_SHR; 2414 vexop = OPC_SHRX; 2415 goto gen_shift_maybe_vex; 2416 OP_32_64(sar): 2417 c = SHIFT_SAR; 2418 vexop = OPC_SARX; 2419 goto gen_shift_maybe_vex; 2420 OP_32_64(rotl): 2421 c = SHIFT_ROL; 2422 goto gen_shift; 2423 OP_32_64(rotr): 2424 c = SHIFT_ROR; 2425 goto gen_shift; 2426 gen_shift_maybe_vex: 2427 if (have_bmi2) { 2428 if (!const_a2) { 2429 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2430 break; 2431 } 2432 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2433 } 2434 /* FALLTHRU */ 2435 gen_shift: 2436 if (const_a2) { 2437 tcg_out_shifti(s, c + rexw, a0, a2); 2438 } else { 2439 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2440 } 2441 break; 2442 2443 OP_32_64(ctz): 2444 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2445 break; 2446 OP_32_64(clz): 2447 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2448 break; 2449 OP_32_64(ctpop): 2450 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2451 break; 2452 2453 case INDEX_op_brcond_i32: 2454 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2455 break; 2456 case INDEX_op_setcond_i32: 2457 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2); 2458 break; 2459 case INDEX_op_movcond_i32: 2460 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]); 2461 break; 2462 2463 OP_32_64(bswap16): 2464 tcg_out_rolw_8(s, a0); 2465 break; 2466 OP_32_64(bswap32): 2467 tcg_out_bswap32(s, a0); 2468 break; 2469 2470 OP_32_64(neg): 2471 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2472 break; 2473 OP_32_64(not): 2474 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2475 break; 2476 2477 OP_32_64(ext8s): 2478 tcg_out_ext8s(s, a0, a1, rexw); 2479 break; 2480 OP_32_64(ext16s): 2481 tcg_out_ext16s(s, a0, a1, rexw); 2482 break; 2483 OP_32_64(ext8u): 2484 tcg_out_ext8u(s, a0, a1); 2485 break; 2486 OP_32_64(ext16u): 2487 tcg_out_ext16u(s, a0, a1); 2488 break; 2489 2490 case INDEX_op_qemu_ld_i32: 2491 tcg_out_qemu_ld(s, args, 0); 2492 break; 2493 case INDEX_op_qemu_ld_i64: 2494 tcg_out_qemu_ld(s, args, 1); 2495 break; 2496 case INDEX_op_qemu_st_i32: 2497 case INDEX_op_qemu_st8_i32: 2498 tcg_out_qemu_st(s, args, 0); 2499 break; 2500 case INDEX_op_qemu_st_i64: 2501 tcg_out_qemu_st(s, args, 1); 2502 break; 2503 2504 OP_32_64(mulu2): 
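        /* Widening multiply: the one-operand MUL uses the A register as the
           implicit multiplicand and writes the double-width product to D:A,
           matching the 'a'/'d' output constraints in tcg_target_op_def. */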
2505 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2506 break; 2507 OP_32_64(muls2): 2508 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2509 break; 2510 OP_32_64(add2): 2511 if (const_args[4]) { 2512 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2513 } else { 2514 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2515 } 2516 if (const_args[5]) { 2517 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2518 } else { 2519 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2520 } 2521 break; 2522 OP_32_64(sub2): 2523 if (const_args[4]) { 2524 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2525 } else { 2526 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2527 } 2528 if (const_args[5]) { 2529 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2530 } else { 2531 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2532 } 2533 break; 2534 2535#if TCG_TARGET_REG_BITS == 32 2536 case INDEX_op_brcond2_i32: 2537 tcg_out_brcond2(s, args, const_args, 0); 2538 break; 2539 case INDEX_op_setcond2_i32: 2540 tcg_out_setcond2(s, args, const_args); 2541 break; 2542#else /* TCG_TARGET_REG_BITS == 64 */ 2543 case INDEX_op_ld32s_i64: 2544 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2545 break; 2546 case INDEX_op_ld_i64: 2547 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2548 break; 2549 case INDEX_op_st_i64: 2550 if (const_args[0]) { 2551 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2552 tcg_out32(s, a0); 2553 } else { 2554 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2555 } 2556 break; 2557 2558 case INDEX_op_brcond_i64: 2559 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0); 2560 break; 2561 case INDEX_op_setcond_i64: 2562 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2); 2563 break; 2564 case INDEX_op_movcond_i64: 2565 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]); 2566 break; 2567 2568 case INDEX_op_bswap64_i64: 2569 tcg_out_bswap64(s, a0); 2570 break; 2571 case INDEX_op_extu_i32_i64: 2572 case INDEX_op_ext32u_i64: 2573 case INDEX_op_extrl_i64_i32: 2574 tcg_out_ext32u(s, a0, a1); 2575 break; 2576 case INDEX_op_ext_i32_i64: 2577 case INDEX_op_ext32s_i64: 2578 tcg_out_ext32s(s, a0, a1); 2579 break; 2580 case INDEX_op_extrh_i64_i32: 2581 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2582 break; 2583#endif 2584 2585 OP_32_64(deposit): 2586 if (args[3] == 0 && args[4] == 8) { 2587 /* load bits 0..7 */ 2588 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2589 } else if (args[3] == 8 && args[4] == 8) { 2590 /* load bits 8..15 */ 2591 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2592 } else if (args[3] == 0 && args[4] == 16) { 2593 /* load bits 0..15 */ 2594 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2595 } else { 2596 tcg_abort(); 2597 } 2598 break; 2599 2600 case INDEX_op_extract_i64: 2601 if (a2 + args[3] == 32) { 2602 /* This is a 32-bit zero-extending right shift. */ 2603 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 2604 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 2605 break; 2606 } 2607 /* FALLTHRU */ 2608 case INDEX_op_extract_i32: 2609 /* On the off-chance that we can use the high-byte registers. 2610 Otherwise we emit the same ext16 + shift pattern that we 2611 would have gotten from the normal tcg-op.c expansion. 
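       When both registers qualify, the byte at bits 8..15 is fetched with a single movzbl from %ah/%ch/%dh/%bh.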
*/ 2612 tcg_debug_assert(a2 == 8 && args[3] == 8); 2613 if (a1 < 4 && a0 < 8) { 2614 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 2615 } else { 2616 tcg_out_ext16u(s, a0, a1); 2617 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 2618 } 2619 break; 2620 2621 case INDEX_op_sextract_i32: 2622 /* We don't implement sextract_i64, as we cannot sign-extend to 2623 64-bits without using the REX prefix that explicitly excludes 2624 access to the high-byte registers. */ 2625 tcg_debug_assert(a2 == 8 && args[3] == 8); 2626 if (a1 < 4 && a0 < 8) { 2627 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 2628 } else { 2629 tcg_out_ext16s(s, a0, a1, 0); 2630 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 2631 } 2632 break; 2633 2634 OP_32_64(extract2): 2635 /* Note that SHRD outputs to the r/m operand. */ 2636 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 2637 tcg_out8(s, args[3]); 2638 break; 2639 2640 case INDEX_op_mb: 2641 tcg_out_mb(s, a0); 2642 break; 2643 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 2644 case INDEX_op_mov_i64: 2645 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */ 2646 case INDEX_op_movi_i64: 2647 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 2648 default: 2649 tcg_abort(); 2650 } 2651 2652#undef OP_32_64 2653} 2654 2655static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 2656 unsigned vecl, unsigned vece, 2657 const TCGArg *args, const int *const_args) 2658{ 2659 static int const add_insn[4] = { 2660 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 2661 }; 2662 static int const ssadd_insn[4] = { 2663 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 2664 }; 2665 static int const usadd_insn[4] = { 2666 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 2667 }; 2668 static int const sub_insn[4] = { 2669 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 2670 }; 2671 static int const sssub_insn[4] = { 2672 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 2673 }; 2674 static int const ussub_insn[4] = { 2675 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 2676 }; 2677 static int const mul_insn[4] = { 2678 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2 2679 }; 2680 static int const shift_imm_insn[4] = { 2681 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 2682 }; 2683 static int const cmpeq_insn[4] = { 2684 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 2685 }; 2686 static int const cmpgt_insn[4] = { 2687 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 2688 }; 2689 static int const punpckl_insn[4] = { 2690 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 2691 }; 2692 static int const punpckh_insn[4] = { 2693 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 2694 }; 2695 static int const packss_insn[4] = { 2696 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 2697 }; 2698 static int const packus_insn[4] = { 2699 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 2700 }; 2701 static int const smin_insn[4] = { 2702 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2 2703 }; 2704 static int const smax_insn[4] = { 2705 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2 2706 }; 2707 static int const umin_insn[4] = { 2708 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2 2709 }; 2710 static int const umax_insn[4] = { 2711 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2 2712 }; 2713 static int const shlv_insn[4] = { 2714 /* TODO: AVX512 adds support for MO_16. */ 2715 OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ 2716 }; 2717 static int const shrv_insn[4] = { 2718 /* TODO: AVX512 adds support for MO_16. 
*/ 2719 OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ 2720 }; 2721 static int const sarv_insn[4] = { 2722 /* TODO: AVX512 adds support for MO_16, MO_64. */ 2723 OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2 2724 }; 2725 static int const shls_insn[4] = { 2726 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 2727 }; 2728 static int const shrs_insn[4] = { 2729 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 2730 }; 2731 static int const sars_insn[4] = { 2732 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2 2733 }; 2734 static int const abs_insn[4] = { 2735 /* TODO: AVX512 adds support for MO_64. */ 2736 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2 2737 }; 2738 2739 TCGType type = vecl + TCG_TYPE_V64; 2740 int insn, sub; 2741 TCGArg a0, a1, a2; 2742 2743 a0 = args[0]; 2744 a1 = args[1]; 2745 a2 = args[2]; 2746 2747 switch (opc) { 2748 case INDEX_op_add_vec: 2749 insn = add_insn[vece]; 2750 goto gen_simd; 2751 case INDEX_op_ssadd_vec: 2752 insn = ssadd_insn[vece]; 2753 goto gen_simd; 2754 case INDEX_op_usadd_vec: 2755 insn = usadd_insn[vece]; 2756 goto gen_simd; 2757 case INDEX_op_sub_vec: 2758 insn = sub_insn[vece]; 2759 goto gen_simd; 2760 case INDEX_op_sssub_vec: 2761 insn = sssub_insn[vece]; 2762 goto gen_simd; 2763 case INDEX_op_ussub_vec: 2764 insn = ussub_insn[vece]; 2765 goto gen_simd; 2766 case INDEX_op_mul_vec: 2767 insn = mul_insn[vece]; 2768 goto gen_simd; 2769 case INDEX_op_and_vec: 2770 insn = OPC_PAND; 2771 goto gen_simd; 2772 case INDEX_op_or_vec: 2773 insn = OPC_POR; 2774 goto gen_simd; 2775 case INDEX_op_xor_vec: 2776 insn = OPC_PXOR; 2777 goto gen_simd; 2778 case INDEX_op_smin_vec: 2779 insn = smin_insn[vece]; 2780 goto gen_simd; 2781 case INDEX_op_umin_vec: 2782 insn = umin_insn[vece]; 2783 goto gen_simd; 2784 case INDEX_op_smax_vec: 2785 insn = smax_insn[vece]; 2786 goto gen_simd; 2787 case INDEX_op_umax_vec: 2788 insn = umax_insn[vece]; 2789 goto gen_simd; 2790 case INDEX_op_shlv_vec: 2791 insn = shlv_insn[vece]; 2792 goto gen_simd; 2793 case INDEX_op_shrv_vec: 2794 insn = shrv_insn[vece]; 2795 goto gen_simd; 2796 case INDEX_op_sarv_vec: 2797 insn = sarv_insn[vece]; 2798 goto gen_simd; 2799 case INDEX_op_shls_vec: 2800 insn = shls_insn[vece]; 2801 goto gen_simd; 2802 case INDEX_op_shrs_vec: 2803 insn = shrs_insn[vece]; 2804 goto gen_simd; 2805 case INDEX_op_sars_vec: 2806 insn = sars_insn[vece]; 2807 goto gen_simd; 2808 case INDEX_op_x86_punpckl_vec: 2809 insn = punpckl_insn[vece]; 2810 goto gen_simd; 2811 case INDEX_op_x86_punpckh_vec: 2812 insn = punpckh_insn[vece]; 2813 goto gen_simd; 2814 case INDEX_op_x86_packss_vec: 2815 insn = packss_insn[vece]; 2816 goto gen_simd; 2817 case INDEX_op_x86_packus_vec: 2818 insn = packus_insn[vece]; 2819 goto gen_simd; 2820#if TCG_TARGET_REG_BITS == 32 2821 case INDEX_op_dup2_vec: 2822 /* First merge the two 32-bit inputs to a single 64-bit element. */ 2823 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 2824 /* Then replicate the 64-bit elements across the rest of the vector. 
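       A V64 result consists only of the element just formed, so no further dup is needed.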
*/ 2825 if (type != TCG_TYPE_V64) { 2826 tcg_out_dup_vec(s, type, MO_64, a0, a0); 2827 } 2828 break; 2829#endif 2830 case INDEX_op_abs_vec: 2831 insn = abs_insn[vece]; 2832 a2 = a1; 2833 a1 = 0; 2834 goto gen_simd; 2835 gen_simd: 2836 tcg_debug_assert(insn != OPC_UD2); 2837 if (type == TCG_TYPE_V256) { 2838 insn |= P_VEXL; 2839 } 2840 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2841 break; 2842 2843 case INDEX_op_cmp_vec: 2844 sub = args[3]; 2845 if (sub == TCG_COND_EQ) { 2846 insn = cmpeq_insn[vece]; 2847 } else if (sub == TCG_COND_GT) { 2848 insn = cmpgt_insn[vece]; 2849 } else { 2850 g_assert_not_reached(); 2851 } 2852 goto gen_simd; 2853 2854 case INDEX_op_andc_vec: 2855 insn = OPC_PANDN; 2856 if (type == TCG_TYPE_V256) { 2857 insn |= P_VEXL; 2858 } 2859 tcg_out_vex_modrm(s, insn, a0, a2, a1); 2860 break; 2861 2862 case INDEX_op_shli_vec: 2863 sub = 6; 2864 goto gen_shift; 2865 case INDEX_op_shri_vec: 2866 sub = 2; 2867 goto gen_shift; 2868 case INDEX_op_sari_vec: 2869 tcg_debug_assert(vece != MO_64); 2870 sub = 4; 2871 gen_shift: 2872 tcg_debug_assert(vece != MO_8); 2873 insn = shift_imm_insn[vece]; 2874 if (type == TCG_TYPE_V256) { 2875 insn |= P_VEXL; 2876 } 2877 tcg_out_vex_modrm(s, insn, sub, a0, a1); 2878 tcg_out8(s, a2); 2879 break; 2880 2881 case INDEX_op_ld_vec: 2882 tcg_out_ld(s, type, a0, a1, a2); 2883 break; 2884 case INDEX_op_st_vec: 2885 tcg_out_st(s, type, a0, a1, a2); 2886 break; 2887 case INDEX_op_dupm_vec: 2888 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 2889 break; 2890 2891 case INDEX_op_x86_shufps_vec: 2892 insn = OPC_SHUFPS; 2893 sub = args[3]; 2894 goto gen_simd_imm8; 2895 case INDEX_op_x86_blend_vec: 2896 if (vece == MO_16) { 2897 insn = OPC_PBLENDW; 2898 } else if (vece == MO_32) { 2899 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 2900 } else { 2901 g_assert_not_reached(); 2902 } 2903 sub = args[3]; 2904 goto gen_simd_imm8; 2905 case INDEX_op_x86_vperm2i128_vec: 2906 insn = OPC_VPERM2I128; 2907 sub = args[3]; 2908 goto gen_simd_imm8; 2909 gen_simd_imm8: 2910 if (type == TCG_TYPE_V256) { 2911 insn |= P_VEXL; 2912 } 2913 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2914 tcg_out8(s, sub); 2915 break; 2916 2917 case INDEX_op_x86_vpblendvb_vec: 2918 insn = OPC_VPBLENDVB; 2919 if (type == TCG_TYPE_V256) { 2920 insn |= P_VEXL; 2921 } 2922 tcg_out_vex_modrm(s, insn, a0, a1, a2); 2923 tcg_out8(s, args[3] << 4); 2924 break; 2925 2926 case INDEX_op_x86_psrldq_vec: 2927 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 2928 tcg_out8(s, a2); 2929 break; 2930 2931 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ 2932 case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi. */ 2933 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ 2934 default: 2935 g_assert_not_reached(); 2936 } 2937} 2938 2939static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) 2940{ 2941 static const TCGTargetOpDef r = { .args_ct_str = { "r" } }; 2942 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } }; 2943 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } }; 2944 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } }; 2945 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } }; 2946 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } }; 2947 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } }; 2948 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } }; 2949 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } }; 2950 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } }; 2951 static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } }; 2952 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } }; 2953 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } }; 2954 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } }; 2955 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } }; 2956 static const TCGTargetOpDef s_L = { .args_ct_str = { "s", "L" } }; 2957 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } }; 2958 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } }; 2959 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } }; 2960 static const TCGTargetOpDef s_L_L = { .args_ct_str = { "s", "L", "L" } }; 2961 static const TCGTargetOpDef r_r_L_L 2962 = { .args_ct_str = { "r", "r", "L", "L" } }; 2963 static const TCGTargetOpDef L_L_L_L 2964 = { .args_ct_str = { "L", "L", "L", "L" } }; 2965 static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } }; 2966 static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } }; 2967 static const TCGTargetOpDef x_x_x_x 2968 = { .args_ct_str = { "x", "x", "x", "x" } }; 2969 static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } }; 2970 2971 switch (op) { 2972 case INDEX_op_goto_ptr: 2973 return &r; 2974 2975 case INDEX_op_ld8u_i32: 2976 case INDEX_op_ld8u_i64: 2977 case INDEX_op_ld8s_i32: 2978 case INDEX_op_ld8s_i64: 2979 case INDEX_op_ld16u_i32: 2980 case INDEX_op_ld16u_i64: 2981 case INDEX_op_ld16s_i32: 2982 case INDEX_op_ld16s_i64: 2983 case INDEX_op_ld_i32: 2984 case INDEX_op_ld32u_i64: 2985 case INDEX_op_ld32s_i64: 2986 case INDEX_op_ld_i64: 2987 return &r_r; 2988 2989 case INDEX_op_st8_i32: 2990 case INDEX_op_st8_i64: 2991 return &qi_r; 2992 case INDEX_op_st16_i32: 2993 case INDEX_op_st16_i64: 2994 case INDEX_op_st_i32: 2995 case INDEX_op_st32_i64: 2996 return &ri_r; 2997 case INDEX_op_st_i64: 2998 return &re_r; 2999 3000 case INDEX_op_add_i32: 3001 case INDEX_op_add_i64: 3002 return &r_r_re; 3003 case INDEX_op_sub_i32: 3004 case INDEX_op_sub_i64: 3005 case INDEX_op_mul_i32: 3006 case INDEX_op_mul_i64: 3007 case INDEX_op_or_i32: 3008 case INDEX_op_or_i64: 3009 case INDEX_op_xor_i32: 3010 case INDEX_op_xor_i64: 3011 return &r_0_re; 3012 3013 case INDEX_op_and_i32: 3014 case INDEX_op_and_i64: 3015 { 3016 static const TCGTargetOpDef and 3017 = { .args_ct_str = { "r", "0", "reZ" } }; 3018 return ∧ 3019 } 3020 break; 3021 case INDEX_op_andc_i32: 3022 case INDEX_op_andc_i64: 3023 { 3024 static const TCGTargetOpDef andc 3025 = { .args_ct_str = { "r", "r", "rI" } }; 3026 return &andc; 3027 } 3028 break; 3029 3030 
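    /* With BMI2, SHLX/SHRX/SARX accept the shift count in any register, so no
       matching-operand or CL constraint is needed; without it the count must
       be an immediate or live in CL, and the destination must match the
       first source. */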
case INDEX_op_shl_i32: 3031 case INDEX_op_shl_i64: 3032 case INDEX_op_shr_i32: 3033 case INDEX_op_shr_i64: 3034 case INDEX_op_sar_i32: 3035 case INDEX_op_sar_i64: 3036 return have_bmi2 ? &r_r_ri : &r_0_ci; 3037 case INDEX_op_rotl_i32: 3038 case INDEX_op_rotl_i64: 3039 case INDEX_op_rotr_i32: 3040 case INDEX_op_rotr_i64: 3041 return &r_0_ci; 3042 3043 case INDEX_op_brcond_i32: 3044 case INDEX_op_brcond_i64: 3045 return &r_re; 3046 3047 case INDEX_op_bswap16_i32: 3048 case INDEX_op_bswap16_i64: 3049 case INDEX_op_bswap32_i32: 3050 case INDEX_op_bswap32_i64: 3051 case INDEX_op_bswap64_i64: 3052 case INDEX_op_neg_i32: 3053 case INDEX_op_neg_i64: 3054 case INDEX_op_not_i32: 3055 case INDEX_op_not_i64: 3056 case INDEX_op_extrh_i64_i32: 3057 return &r_0; 3058 3059 case INDEX_op_ext8s_i32: 3060 case INDEX_op_ext8s_i64: 3061 case INDEX_op_ext8u_i32: 3062 case INDEX_op_ext8u_i64: 3063 return &r_q; 3064 case INDEX_op_ext16s_i32: 3065 case INDEX_op_ext16s_i64: 3066 case INDEX_op_ext16u_i32: 3067 case INDEX_op_ext16u_i64: 3068 case INDEX_op_ext32s_i64: 3069 case INDEX_op_ext32u_i64: 3070 case INDEX_op_ext_i32_i64: 3071 case INDEX_op_extu_i32_i64: 3072 case INDEX_op_extrl_i64_i32: 3073 case INDEX_op_extract_i32: 3074 case INDEX_op_extract_i64: 3075 case INDEX_op_sextract_i32: 3076 case INDEX_op_ctpop_i32: 3077 case INDEX_op_ctpop_i64: 3078 return &r_r; 3079 case INDEX_op_extract2_i32: 3080 case INDEX_op_extract2_i64: 3081 return &r_0_r; 3082 3083 case INDEX_op_deposit_i32: 3084 case INDEX_op_deposit_i64: 3085 { 3086 static const TCGTargetOpDef dep 3087 = { .args_ct_str = { "Q", "0", "Q" } }; 3088 return &dep; 3089 } 3090 case INDEX_op_setcond_i32: 3091 case INDEX_op_setcond_i64: 3092 { 3093 static const TCGTargetOpDef setc 3094 = { .args_ct_str = { "q", "r", "re" } }; 3095 return &setc; 3096 } 3097 case INDEX_op_movcond_i32: 3098 case INDEX_op_movcond_i64: 3099 { 3100 static const TCGTargetOpDef movc 3101 = { .args_ct_str = { "r", "r", "re", "r", "0" } }; 3102 return &movc; 3103 } 3104 case INDEX_op_div2_i32: 3105 case INDEX_op_div2_i64: 3106 case INDEX_op_divu2_i32: 3107 case INDEX_op_divu2_i64: 3108 { 3109 static const TCGTargetOpDef div2 3110 = { .args_ct_str = { "a", "d", "0", "1", "r" } }; 3111 return &div2; 3112 } 3113 case INDEX_op_mulu2_i32: 3114 case INDEX_op_mulu2_i64: 3115 case INDEX_op_muls2_i32: 3116 case INDEX_op_muls2_i64: 3117 { 3118 static const TCGTargetOpDef mul2 3119 = { .args_ct_str = { "a", "d", "a", "r" } }; 3120 return &mul2; 3121 } 3122 case INDEX_op_add2_i32: 3123 case INDEX_op_add2_i64: 3124 case INDEX_op_sub2_i32: 3125 case INDEX_op_sub2_i64: 3126 { 3127 static const TCGTargetOpDef arith2 3128 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } }; 3129 return &arith2; 3130 } 3131 case INDEX_op_ctz_i32: 3132 case INDEX_op_ctz_i64: 3133 { 3134 static const TCGTargetOpDef ctz[2] = { 3135 { .args_ct_str = { "&r", "r", "r" } }, 3136 { .args_ct_str = { "&r", "r", "rW" } }, 3137 }; 3138 return &ctz[have_bmi1]; 3139 } 3140 case INDEX_op_clz_i32: 3141 case INDEX_op_clz_i64: 3142 { 3143 static const TCGTargetOpDef clz[2] = { 3144 { .args_ct_str = { "&r", "r", "r" } }, 3145 { .args_ct_str = { "&r", "r", "rW" } }, 3146 }; 3147 return &clz[have_lzcnt]; 3148 } 3149 3150 case INDEX_op_qemu_ld_i32: 3151 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L; 3152 case INDEX_op_qemu_st_i32: 3153 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L; 3154 case INDEX_op_qemu_st8_i32: 3155 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 
&s_L : &s_L_L; 3156 case INDEX_op_qemu_ld_i64: 3157 return (TCG_TARGET_REG_BITS == 64 ? &r_L 3158 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L 3159 : &r_r_L_L); 3160 case INDEX_op_qemu_st_i64: 3161 return (TCG_TARGET_REG_BITS == 64 ? &L_L 3162 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L 3163 : &L_L_L_L); 3164 3165 case INDEX_op_brcond2_i32: 3166 { 3167 static const TCGTargetOpDef b2 3168 = { .args_ct_str = { "r", "r", "ri", "ri" } }; 3169 return &b2; 3170 } 3171 case INDEX_op_setcond2_i32: 3172 { 3173 static const TCGTargetOpDef s2 3174 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } }; 3175 return &s2; 3176 } 3177 3178 case INDEX_op_ld_vec: 3179 case INDEX_op_st_vec: 3180 case INDEX_op_dupm_vec: 3181 return &x_r; 3182 3183 case INDEX_op_add_vec: 3184 case INDEX_op_sub_vec: 3185 case INDEX_op_mul_vec: 3186 case INDEX_op_and_vec: 3187 case INDEX_op_or_vec: 3188 case INDEX_op_xor_vec: 3189 case INDEX_op_andc_vec: 3190 case INDEX_op_ssadd_vec: 3191 case INDEX_op_usadd_vec: 3192 case INDEX_op_sssub_vec: 3193 case INDEX_op_ussub_vec: 3194 case INDEX_op_smin_vec: 3195 case INDEX_op_umin_vec: 3196 case INDEX_op_smax_vec: 3197 case INDEX_op_umax_vec: 3198 case INDEX_op_shlv_vec: 3199 case INDEX_op_shrv_vec: 3200 case INDEX_op_sarv_vec: 3201 case INDEX_op_shls_vec: 3202 case INDEX_op_shrs_vec: 3203 case INDEX_op_sars_vec: 3204 case INDEX_op_rotls_vec: 3205 case INDEX_op_cmp_vec: 3206 case INDEX_op_x86_shufps_vec: 3207 case INDEX_op_x86_blend_vec: 3208 case INDEX_op_x86_packss_vec: 3209 case INDEX_op_x86_packus_vec: 3210 case INDEX_op_x86_vperm2i128_vec: 3211 case INDEX_op_x86_punpckl_vec: 3212 case INDEX_op_x86_punpckh_vec: 3213#if TCG_TARGET_REG_BITS == 32 3214 case INDEX_op_dup2_vec: 3215#endif 3216 return &x_x_x; 3217 case INDEX_op_abs_vec: 3218 case INDEX_op_dup_vec: 3219 case INDEX_op_shli_vec: 3220 case INDEX_op_shri_vec: 3221 case INDEX_op_sari_vec: 3222 case INDEX_op_x86_psrldq_vec: 3223 return &x_x; 3224 case INDEX_op_x86_vpblendvb_vec: 3225 return &x_x_x_x; 3226 3227 default: 3228 break; 3229 } 3230 return NULL; 3231} 3232 3233int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) 3234{ 3235 switch (opc) { 3236 case INDEX_op_add_vec: 3237 case INDEX_op_sub_vec: 3238 case INDEX_op_and_vec: 3239 case INDEX_op_or_vec: 3240 case INDEX_op_xor_vec: 3241 case INDEX_op_andc_vec: 3242 return 1; 3243 case INDEX_op_rotli_vec: 3244 case INDEX_op_cmp_vec: 3245 case INDEX_op_cmpsel_vec: 3246 return -1; 3247 3248 case INDEX_op_shli_vec: 3249 case INDEX_op_shri_vec: 3250 /* We must expand the operation for MO_8. */ 3251 return vece == MO_8 ? -1 : 1; 3252 3253 case INDEX_op_sari_vec: 3254 /* We must expand the operation for MO_8. */ 3255 if (vece == MO_8) { 3256 return -1; 3257 } 3258 /* We can emulate this for MO_64, but it does not pay off 3259 unless we're producing at least 4 values. */ 3260 if (vece == MO_64) { 3261 return type >= TCG_TYPE_V256 ? -1 : 0; 3262 } 3263 return 1; 3264 3265 case INDEX_op_shls_vec: 3266 case INDEX_op_shrs_vec: 3267 return vece >= MO_16; 3268 case INDEX_op_sars_vec: 3269 return vece >= MO_16 && vece <= MO_32; 3270 case INDEX_op_rotls_vec: 3271 return vece >= MO_16 ? -1 : 0; 3272 3273 case INDEX_op_shlv_vec: 3274 case INDEX_op_shrv_vec: 3275 return have_avx2 && vece >= MO_32; 3276 case INDEX_op_sarv_vec: 3277 return have_avx2 && vece == MO_32; 3278 case INDEX_op_rotlv_vec: 3279 case INDEX_op_rotrv_vec: 3280 return have_avx2 && vece >= MO_32 ? 
-1 : 0; 3281 3282 case INDEX_op_mul_vec: 3283 if (vece == MO_8) { 3284 /* We can expand the operation for MO_8. */ 3285 return -1; 3286 } 3287 if (vece == MO_64) { 3288 return 0; 3289 } 3290 return 1; 3291 3292 case INDEX_op_ssadd_vec: 3293 case INDEX_op_usadd_vec: 3294 case INDEX_op_sssub_vec: 3295 case INDEX_op_ussub_vec: 3296 return vece <= MO_16; 3297 case INDEX_op_smin_vec: 3298 case INDEX_op_smax_vec: 3299 case INDEX_op_umin_vec: 3300 case INDEX_op_umax_vec: 3301 case INDEX_op_abs_vec: 3302 return vece <= MO_32; 3303 3304 default: 3305 return 0; 3306 } 3307} 3308 3309static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc, 3310 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3311{ 3312 TCGv_vec t1, t2; 3313 3314 tcg_debug_assert(vece == MO_8); 3315 3316 t1 = tcg_temp_new_vec(type); 3317 t2 = tcg_temp_new_vec(type); 3318 3319 /* 3320 * Unpack to W, shift, and repack. Tricky bits: 3321 * (1) Use punpck*bw x,x to produce DDCCBBAA, 3322 * i.e. duplicate in other half of the 16-bit lane. 3323 * (2) For right-shift, add 8 so that the high half of the lane 3324 * becomes zero. For left-shift, and left-rotate, we must 3325 * shift up and down again. 3326 * (3) Step 2 leaves high half zero such that PACKUSWB 3327 * (pack with unsigned saturation) does not modify 3328 * the quantity. 3329 */ 3330 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3331 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3332 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3333 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3334 3335 if (opc != INDEX_op_rotli_vec) { 3336 imm += 8; 3337 } 3338 if (opc == INDEX_op_shri_vec) { 3339 tcg_gen_shri_vec(MO_16, t1, t1, imm); 3340 tcg_gen_shri_vec(MO_16, t2, t2, imm); 3341 } else { 3342 tcg_gen_shli_vec(MO_16, t1, t1, imm); 3343 tcg_gen_shli_vec(MO_16, t2, t2, imm); 3344 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3345 tcg_gen_shri_vec(MO_16, t2, t2, 8); 3346 } 3347 3348 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3349 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3350 tcg_temp_free_vec(t1); 3351 tcg_temp_free_vec(t2); 3352} 3353 3354static void expand_vec_sari(TCGType type, unsigned vece, 3355 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3356{ 3357 TCGv_vec t1, t2; 3358 3359 switch (vece) { 3360 case MO_8: 3361 /* Unpack to W, shift, and repack, as in expand_vec_shi. */ 3362 t1 = tcg_temp_new_vec(type); 3363 t2 = tcg_temp_new_vec(type); 3364 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3365 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3366 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3367 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); 3368 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); 3369 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); 3370 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, 3371 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); 3372 tcg_temp_free_vec(t1); 3373 tcg_temp_free_vec(t2); 3374 break; 3375 3376 case MO_64: 3377 if (imm <= 32) { 3378 /* 3379 * We can emulate a small sign extend by performing an arithmetic 3380 * 32-bit shift and overwriting the high half of a 64-bit logical 3381 * shift. Note that the ISA says shift of 32 is valid, but TCG 3382 * does not, so we have to bound the smaller shift -- we get the 3383 * same result in the high half either way. 
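             * The 32-bit blend with immediate 0xaa below then keeps the arithmetic result in the odd (upper) 32-bit elements and the logical result in the even (lower) ones.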
3384 */ 3385 t1 = tcg_temp_new_vec(type); 3386 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31)); 3387 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3388 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, 3389 tcgv_vec_arg(v0), tcgv_vec_arg(v0), 3390 tcgv_vec_arg(t1), 0xaa); 3391 tcg_temp_free_vec(t1); 3392 } else { 3393 /* Otherwise we will need to use a compare vs 0 to produce 3394 * the sign-extend, shift and merge. 3395 */ 3396 t1 = tcg_const_zeros_vec(type); 3397 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1); 3398 tcg_gen_shri_vec(MO_64, v0, v1, imm); 3399 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); 3400 tcg_gen_or_vec(MO_64, v0, v0, t1); 3401 tcg_temp_free_vec(t1); 3402 } 3403 break; 3404 3405 default: 3406 g_assert_not_reached(); 3407 } 3408} 3409 3410static void expand_vec_rotli(TCGType type, unsigned vece, 3411 TCGv_vec v0, TCGv_vec v1, TCGArg imm) 3412{ 3413 TCGv_vec t; 3414 3415 if (vece == MO_8) { 3416 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm); 3417 return; 3418 } 3419 3420 t = tcg_temp_new_vec(type); 3421 tcg_gen_shli_vec(vece, t, v1, imm); 3422 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); 3423 tcg_gen_or_vec(vece, v0, v0, t); 3424 tcg_temp_free_vec(t); 3425} 3426 3427static void expand_vec_rotls(TCGType type, unsigned vece, 3428 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) 3429{ 3430 TCGv_i32 rsh; 3431 TCGv_vec t; 3432 3433 tcg_debug_assert(vece != MO_8); 3434 3435 t = tcg_temp_new_vec(type); 3436 rsh = tcg_temp_new_i32(); 3437 3438 tcg_gen_neg_i32(rsh, lsh); 3439 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); 3440 tcg_gen_shls_vec(vece, t, v1, lsh); 3441 tcg_gen_shrs_vec(vece, v0, v1, rsh); 3442 tcg_gen_or_vec(vece, v0, v0, t); 3443 tcg_temp_free_vec(t); 3444 tcg_temp_free_i32(rsh); 3445} 3446 3447static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, 3448 TCGv_vec v1, TCGv_vec sh, bool right) 3449{ 3450 TCGv_vec t = tcg_temp_new_vec(type); 3451 3452 tcg_gen_dupi_vec(vece, t, 8 << vece); 3453 tcg_gen_sub_vec(vece, t, t, sh); 3454 if (right) { 3455 tcg_gen_shlv_vec(vece, t, v1, t); 3456 tcg_gen_shrv_vec(vece, v0, v1, sh); 3457 } else { 3458 tcg_gen_shrv_vec(vece, t, v1, t); 3459 tcg_gen_shlv_vec(vece, v0, v1, sh); 3460 } 3461 tcg_gen_or_vec(vece, v0, v0, t); 3462 tcg_temp_free_vec(t); 3463} 3464 3465static void expand_vec_mul(TCGType type, unsigned vece, 3466 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) 3467{ 3468 TCGv_vec t1, t2, t3, t4; 3469 3470 tcg_debug_assert(vece == MO_8); 3471 3472 /* 3473 * Unpack v1 bytes to words, 0 | x. 3474 * Unpack v2 bytes to words, y | 0. 3475 * This leaves the 8-bit result, x * y, with 8 bits of right padding. 3476 * Shift logical right by 8 bits to clear the high 8 bytes before 3477 * using an unsigned saturated pack. 3478 * 3479 * The difference between the V64, V128 and V256 cases is merely how 3480 * we distribute the expansion between temporaries. 
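     * For V64 only the low 8 bytes are in use, so a single unpack-low, multiply and pack (into 128-bit temporaries) suffices; V128 and V256 also process the unpack-high half.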
3481 */ 3482 switch (type) { 3483 case TCG_TYPE_V64: 3484 t1 = tcg_temp_new_vec(TCG_TYPE_V128); 3485 t2 = tcg_temp_new_vec(TCG_TYPE_V128); 3486 tcg_gen_dup16i_vec(t2, 0); 3487 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3488 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2)); 3489 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, 3490 tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2)); 3491 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3492 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3493 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, 3494 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); 3495 tcg_temp_free_vec(t1); 3496 tcg_temp_free_vec(t2); 3497 break; 3498 3499 case TCG_TYPE_V128: 3500 case TCG_TYPE_V256: 3501 t1 = tcg_temp_new_vec(type); 3502 t2 = tcg_temp_new_vec(type); 3503 t3 = tcg_temp_new_vec(type); 3504 t4 = tcg_temp_new_vec(type); 3505 tcg_gen_dup16i_vec(t4, 0); 3506 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3507 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); 3508 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, 3509 tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); 3510 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3511 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); 3512 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, 3513 tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); 3514 tcg_gen_mul_vec(MO_16, t1, t1, t2); 3515 tcg_gen_mul_vec(MO_16, t3, t3, t4); 3516 tcg_gen_shri_vec(MO_16, t1, t1, 8); 3517 tcg_gen_shri_vec(MO_16, t3, t3, 8); 3518 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, 3519 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); 3520 tcg_temp_free_vec(t1); 3521 tcg_temp_free_vec(t2); 3522 tcg_temp_free_vec(t3); 3523 tcg_temp_free_vec(t4); 3524 break; 3525 3526 default: 3527 g_assert_not_reached(); 3528 } 3529} 3530 3531static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, 3532 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3533{ 3534 enum { 3535 NEED_INV = 1, 3536 NEED_SWAP = 2, 3537 NEED_BIAS = 4, 3538 NEED_UMIN = 8, 3539 NEED_UMAX = 16, 3540 }; 3541 TCGv_vec t1, t2; 3542 uint8_t fixup; 3543 3544 switch (cond) { 3545 case TCG_COND_EQ: 3546 case TCG_COND_GT: 3547 fixup = 0; 3548 break; 3549 case TCG_COND_NE: 3550 case TCG_COND_LE: 3551 fixup = NEED_INV; 3552 break; 3553 case TCG_COND_LT: 3554 fixup = NEED_SWAP; 3555 break; 3556 case TCG_COND_GE: 3557 fixup = NEED_SWAP | NEED_INV; 3558 break; 3559 case TCG_COND_LEU: 3560 if (vece <= MO_32) { 3561 fixup = NEED_UMIN; 3562 } else { 3563 fixup = NEED_BIAS | NEED_INV; 3564 } 3565 break; 3566 case TCG_COND_GTU: 3567 if (vece <= MO_32) { 3568 fixup = NEED_UMIN | NEED_INV; 3569 } else { 3570 fixup = NEED_BIAS; 3571 } 3572 break; 3573 case TCG_COND_GEU: 3574 if (vece <= MO_32) { 3575 fixup = NEED_UMAX; 3576 } else { 3577 fixup = NEED_BIAS | NEED_SWAP | NEED_INV; 3578 } 3579 break; 3580 case TCG_COND_LTU: 3581 if (vece <= MO_32) { 3582 fixup = NEED_UMAX | NEED_INV; 3583 } else { 3584 fixup = NEED_BIAS | NEED_SWAP; 3585 } 3586 break; 3587 default: 3588 g_assert_not_reached(); 3589 } 3590 3591 if (fixup & NEED_INV) { 3592 cond = tcg_invert_cond(cond); 3593 } 3594 if (fixup & NEED_SWAP) { 3595 t1 = v1, v1 = v2, v2 = t1; 3596 cond = tcg_swap_cond(cond); 3597 } 3598 3599 t1 = t2 = NULL; 3600 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3601 t1 = tcg_temp_new_vec(type); 3602 if (fixup & NEED_UMIN) { 3603 tcg_gen_umin_vec(vece, t1, v1, v2); 3604 } else { 3605 tcg_gen_umax_vec(vece, t1, v1, v2); 3606 } 3607 v2 = t1; 3608 cond = TCG_COND_EQ; 3609 } else if (fixup & 
NEED_BIAS) { 3610 t1 = tcg_temp_new_vec(type); 3611 t2 = tcg_temp_new_vec(type); 3612 tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); 3613 tcg_gen_sub_vec(vece, t1, v1, t2); 3614 tcg_gen_sub_vec(vece, t2, v2, t2); 3615 v1 = t1; 3616 v2 = t2; 3617 cond = tcg_signed_cond(cond); 3618 } 3619 3620 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); 3621 /* Expand directly; do not recurse. */ 3622 vec_gen_4(INDEX_op_cmp_vec, type, vece, 3623 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); 3624 3625 if (t1) { 3626 tcg_temp_free_vec(t1); 3627 if (t2) { 3628 tcg_temp_free_vec(t2); 3629 } 3630 } 3631 return fixup & NEED_INV; 3632} 3633 3634static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, 3635 TCGv_vec v1, TCGv_vec v2, TCGCond cond) 3636{ 3637 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) { 3638 tcg_gen_not_vec(vece, v0, v0); 3639 } 3640} 3641 3642static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0, 3643 TCGv_vec c1, TCGv_vec c2, 3644 TCGv_vec v3, TCGv_vec v4, TCGCond cond) 3645{ 3646 TCGv_vec t = tcg_temp_new_vec(type); 3647 3648 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) { 3649 /* Invert the sense of the compare by swapping arguments. */ 3650 TCGv_vec x; 3651 x = v3, v3 = v4, v4 = x; 3652 } 3653 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece, 3654 tcgv_vec_arg(v0), tcgv_vec_arg(v4), 3655 tcgv_vec_arg(v3), tcgv_vec_arg(t)); 3656 tcg_temp_free_vec(t); 3657} 3658 3659void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, 3660 TCGArg a0, ...) 3661{ 3662 va_list va; 3663 TCGArg a2; 3664 TCGv_vec v0, v1, v2, v3, v4; 3665 3666 va_start(va, a0); 3667 v0 = temp_tcgv_vec(arg_temp(a0)); 3668 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3669 a2 = va_arg(va, TCGArg); 3670 3671 switch (opc) { 3672 case INDEX_op_shli_vec: 3673 case INDEX_op_shri_vec: 3674 expand_vec_shi(type, vece, opc, v0, v1, a2); 3675 break; 3676 3677 case INDEX_op_sari_vec: 3678 expand_vec_sari(type, vece, v0, v1, a2); 3679 break; 3680 3681 case INDEX_op_rotli_vec: 3682 expand_vec_rotli(type, vece, v0, v1, a2); 3683 break; 3684 3685 case INDEX_op_rotls_vec: 3686 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2))); 3687 break; 3688 3689 case INDEX_op_rotlv_vec: 3690 v2 = temp_tcgv_vec(arg_temp(a2)); 3691 expand_vec_rotv(type, vece, v0, v1, v2, false); 3692 break; 3693 case INDEX_op_rotrv_vec: 3694 v2 = temp_tcgv_vec(arg_temp(a2)); 3695 expand_vec_rotv(type, vece, v0, v1, v2, true); 3696 break; 3697 3698 case INDEX_op_mul_vec: 3699 v2 = temp_tcgv_vec(arg_temp(a2)); 3700 expand_vec_mul(type, vece, v0, v1, v2); 3701 break; 3702 3703 case INDEX_op_cmp_vec: 3704 v2 = temp_tcgv_vec(arg_temp(a2)); 3705 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); 3706 break; 3707 3708 case INDEX_op_cmpsel_vec: 3709 v2 = temp_tcgv_vec(arg_temp(a2)); 3710 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3711 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); 3712 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg)); 3713 break; 3714 3715 default: 3716 break; 3717 } 3718 3719 va_end(va); 3720} 3721 3722static const int tcg_target_callee_save_regs[] = { 3723#if TCG_TARGET_REG_BITS == 64 3724 TCG_REG_RBP, 3725 TCG_REG_RBX, 3726#if defined(_WIN64) 3727 TCG_REG_RDI, 3728 TCG_REG_RSI, 3729#endif 3730 TCG_REG_R12, 3731 TCG_REG_R13, 3732 TCG_REG_R14, /* Currently used for the global env. */ 3733 TCG_REG_R15, 3734#else 3735 TCG_REG_EBP, /* Currently used for the global env. 
*/ 3736 TCG_REG_EBX, 3737 TCG_REG_ESI, 3738 TCG_REG_EDI, 3739#endif 3740}; 3741 3742/* Compute frame size via macros, to share between tcg_target_qemu_prologue 3743 and tcg_register_jit. */ 3744 3745#define PUSH_SIZE \ 3746 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \ 3747 * (TCG_TARGET_REG_BITS / 8)) 3748 3749#define FRAME_SIZE \ 3750 ((PUSH_SIZE \ 3751 + TCG_STATIC_CALL_ARGS_SIZE \ 3752 + CPU_TEMP_BUF_NLONGS * sizeof(long) \ 3753 + TCG_TARGET_STACK_ALIGN - 1) \ 3754 & ~(TCG_TARGET_STACK_ALIGN - 1)) 3755 3756/* Generate global QEMU prologue and epilogue code */ 3757static void tcg_target_qemu_prologue(TCGContext *s) 3758{ 3759 int i, stack_addend; 3760 3761 /* TB prologue */ 3762 3763 /* Reserve some stack space, also for TCG temps. */ 3764 stack_addend = FRAME_SIZE - PUSH_SIZE; 3765 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, 3766 CPU_TEMP_BUF_NLONGS * sizeof(long)); 3767 3768 /* Save all callee saved registers. */ 3769 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { 3770 tcg_out_push(s, tcg_target_callee_save_regs[i]); 3771 } 3772 3773#if TCG_TARGET_REG_BITS == 32 3774 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 3775 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); 3776 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3777 /* jmp *tb. */ 3778 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, 3779 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 3780 + stack_addend); 3781#else 3782# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64 3783 if (guest_base) { 3784 int seg = setup_guest_base_seg(); 3785 if (seg != 0) { 3786 x86_guest_base_seg = seg; 3787 } else if (guest_base == (int32_t)guest_base) { 3788 x86_guest_base_offset = guest_base; 3789 } else { 3790 /* Choose R12 because, as a base, it requires a SIB byte. */ 3791 x86_guest_base_index = TCG_REG_R12; 3792 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base); 3793 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index); 3794 } 3795 } 3796# endif 3797 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); 3798 tcg_out_addi(s, TCG_REG_ESP, -stack_addend); 3799 /* jmp *tb. */ 3800 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); 3801#endif 3802 3803 /* 3804 * Return path for goto_ptr. Set return value to 0, a-la exit_tb, 3805 * and fall through to the rest of the epilogue. 3806 */ 3807 /* TODO: Cast goes away when all hosts converted */ 3808 tcg_code_gen_epilogue = (void *)tcg_splitwx_to_rx(s->code_ptr); 3809 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0); 3810 3811 /* TB epilogue */ 3812 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr); 3813 3814 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); 3815 3816 if (have_avx2) { 3817 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); 3818 } 3819 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { 3820 tcg_out_pop(s, tcg_target_callee_save_regs[i]); 3821 } 3822 tcg_out_opc(s, OPC_RET, 0, 0, 0); 3823} 3824 3825static void tcg_out_nop_fill(tcg_insn_unit *p, int count) 3826{ 3827 memset(p, 0x90, count); 3828} 3829 3830static void tcg_target_init(TCGContext *s) 3831{ 3832#ifdef CONFIG_CPUID_H 3833 unsigned a, b, c, d, b7 = 0; 3834 int max = __get_cpuid_max(0, 0); 3835 3836 if (max >= 7) { 3837 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. 
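           Both BMI1 and BMI2 are reported in EBX of CPUID leaf 7, sub-leaf 0.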
        __cpuid_count(7, 0, a, b7, c, d);
        have_bmi1 = (b7 & bit_BMI) != 0;
        have_bmi2 = (b7 & bit_BMI2) != 0;
    }

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch.  */
        have_cmov = (d & bit_CMOV) != 0;
#endif

        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;

        /* There are a number of things we must check before we can be
           sure of not hitting an invalid opcode.  */
        if (c & bit_OSXSAVE) {
            unsigned xcrl, xcrh;
            /* The xgetbv instruction is not available to older versions of
             * the assembler, so we encode the instruction manually.
             */
            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
            if ((xcrl & 6) == 6) {
                have_avx1 = (c & bit_AVX) != 0;
                have_avx2 = (b7 & bit_AVX2) != 0;
            }
        }
    }

    max = __get_cpuid_max(0x80000000, 0);
    if (max >= 1) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
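        /*
         * The CFA offset is FRAME_SIZE encoded as a uleb128: the low seven
         * bits with the continuation bit set, then the remaining bits.
         * Two bytes are enough because the QEMU_BUILD_BUG_ON above
         * guarantees FRAME_SIZE < (1 << 14).
         */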
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif