1/* 2 * Tiny Code Generator for QEMU 3 * 4 * Copyright (c) 2008 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25#include "../tcg-ldst.c.inc" 26#include "../tcg-pool.c.inc" 27 28#ifdef CONFIG_DEBUG_TCG 29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { 30#if TCG_TARGET_REG_BITS == 64 31 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", 32#else 33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", 34#endif 35 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", 36 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", 37#if TCG_TARGET_REG_BITS == 64 38 "%xmm8", "%xmm9", "%xmm10", "%xmm11", 39 "%xmm12", "%xmm13", "%xmm14", "%xmm15", 40#endif 41}; 42#endif 43 44static const int tcg_target_reg_alloc_order[] = { 45#if TCG_TARGET_REG_BITS == 64 46 TCG_REG_RBP, 47 TCG_REG_RBX, 48 TCG_REG_R12, 49 TCG_REG_R13, 50 TCG_REG_R14, 51 TCG_REG_R15, 52 TCG_REG_R10, 53 TCG_REG_R11, 54 TCG_REG_R9, 55 TCG_REG_R8, 56 TCG_REG_RCX, 57 TCG_REG_RDX, 58 TCG_REG_RSI, 59 TCG_REG_RDI, 60 TCG_REG_RAX, 61#else 62 TCG_REG_EBX, 63 TCG_REG_ESI, 64 TCG_REG_EDI, 65 TCG_REG_EBP, 66 TCG_REG_ECX, 67 TCG_REG_EDX, 68 TCG_REG_EAX, 69#endif 70 TCG_REG_XMM0, 71 TCG_REG_XMM1, 72 TCG_REG_XMM2, 73 TCG_REG_XMM3, 74 TCG_REG_XMM4, 75 TCG_REG_XMM5, 76#ifndef _WIN64 77 /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save 78 any of them. Therefore only allow xmm0-xmm5 to be allocated. */ 79 TCG_REG_XMM6, 80 TCG_REG_XMM7, 81#if TCG_TARGET_REG_BITS == 64 82 TCG_REG_XMM8, 83 TCG_REG_XMM9, 84 TCG_REG_XMM10, 85 TCG_REG_XMM11, 86 TCG_REG_XMM12, 87 TCG_REG_XMM13, 88 TCG_REG_XMM14, 89 TCG_REG_XMM15, 90#endif 91#endif 92}; 93 94#define TCG_TMP_VEC TCG_REG_XMM5 95 96static const int tcg_target_call_iarg_regs[] = { 97#if TCG_TARGET_REG_BITS == 64 98#if defined(_WIN64) 99 TCG_REG_RCX, 100 TCG_REG_RDX, 101#else 102 TCG_REG_RDI, 103 TCG_REG_RSI, 104 TCG_REG_RDX, 105 TCG_REG_RCX, 106#endif 107 TCG_REG_R8, 108 TCG_REG_R9, 109#else 110 /* 32 bit mode uses stack based calling convention (GCC default). */ 111#endif 112}; 113 114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) 115{ 116 switch (kind) { 117 case TCG_CALL_RET_NORMAL: 118 tcg_debug_assert(slot >= 0 && slot <= 1); 119 return slot ? 
TCG_REG_EDX : TCG_REG_EAX; 120#ifdef _WIN64 121 case TCG_CALL_RET_BY_VEC: 122 tcg_debug_assert(slot == 0); 123 return TCG_REG_XMM0; 124#endif 125 default: 126 g_assert_not_reached(); 127 } 128} 129 130/* Constants we accept. */ 131#define TCG_CT_CONST_S32 0x100 132#define TCG_CT_CONST_U32 0x200 133#define TCG_CT_CONST_I32 0x400 134#define TCG_CT_CONST_WSZ 0x800 135#define TCG_CT_CONST_TST 0x1000 136#define TCG_CT_CONST_ZERO 0x2000 137 138/* Registers used with L constraint, which are the first argument 139 registers on x86_64, and two random call clobbered registers on 140 i386. */ 141#if TCG_TARGET_REG_BITS == 64 142# define TCG_REG_L0 tcg_target_call_iarg_regs[0] 143# define TCG_REG_L1 tcg_target_call_iarg_regs[1] 144#else 145# define TCG_REG_L0 TCG_REG_EAX 146# define TCG_REG_L1 TCG_REG_EDX 147#endif 148 149#if TCG_TARGET_REG_BITS == 64 150# define ALL_GENERAL_REGS 0x0000ffffu 151# define ALL_VECTOR_REGS 0xffff0000u 152# define ALL_BYTEL_REGS ALL_GENERAL_REGS 153#else 154# define ALL_GENERAL_REGS 0x000000ffu 155# define ALL_VECTOR_REGS 0x00ff0000u 156# define ALL_BYTEL_REGS 0x0000000fu 157#endif 158#define SOFTMMU_RESERVE_REGS \ 159 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) 160 161#define have_bmi2 (cpuinfo & CPUINFO_BMI2) 162#define have_lzcnt (cpuinfo & CPUINFO_LZCNT) 163 164static const tcg_insn_unit *tb_ret_addr; 165 166static bool patch_reloc(tcg_insn_unit *code_ptr, int type, 167 intptr_t value, intptr_t addend) 168{ 169 value += addend; 170 switch(type) { 171 case R_386_PC32: 172 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 173 if (value != (int32_t)value) { 174 return false; 175 } 176 /* FALLTHRU */ 177 case R_386_32: 178 tcg_patch32(code_ptr, value); 179 break; 180 case R_386_PC8: 181 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr); 182 if (value != (int8_t)value) { 183 return false; 184 } 185 tcg_patch8(code_ptr, value); 186 break; 187 default: 188 g_assert_not_reached(); 189 } 190 return true; 191} 192 193/* test if a constant matches the constraint */ 194static bool tcg_target_const_match(int64_t val, int ct, 195 TCGType type, TCGCond cond, int vece) 196{ 197 if (ct & TCG_CT_CONST) { 198 return 1; 199 } 200 if (type == TCG_TYPE_I32) { 201 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | 202 TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) { 203 return 1; 204 } 205 } else { 206 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { 207 return 1; 208 } 209 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { 210 return 1; 211 } 212 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { 213 return 1; 214 } 215 /* 216 * This will be used in combination with TCG_CT_CONST_S32, 217 * so "normal" TESTQ is already matched. Also accept: 218 * TESTQ -> TESTL (uint32_t) 219 * TESTQ -> BT (is_power_of_2) 220 */ 221 if ((ct & TCG_CT_CONST_TST) 222 && is_tst_cond(cond) 223 && (val == (uint32_t)val || is_power_of_2(val))) { 224 return 1; 225 } 226 } 227 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 
32 : 64)) { 228 return 1; 229 } 230 if ((ct & TCG_CT_CONST_ZERO) && val == 0) { 231 return 1; 232 } 233 return 0; 234} 235 236# define LOWREGMASK(x) ((x) & 7) 237 238#define P_EXT 0x100 /* 0x0f opcode prefix */ 239#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ 240#define P_DATA16 0x400 /* 0x66 opcode prefix */ 241#define P_VEXW 0x1000 /* Set VEX.W = 1 */ 242#if TCG_TARGET_REG_BITS == 64 243# define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */ 244# define P_REXB_R 0x2000 /* REG field as byte register */ 245# define P_REXB_RM 0x4000 /* R/M field as byte register */ 246# define P_GS 0x8000 /* gs segment override */ 247#else 248# define P_REXW 0 249# define P_REXB_R 0 250# define P_REXB_RM 0 251# define P_GS 0 252#endif 253#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ 254#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ 255#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ 256#define P_VEXL 0x80000 /* Set VEX.L = 1 */ 257#define P_EVEX 0x100000 /* Requires EVEX encoding */ 258 259#define OPC_ARITH_EbIb (0x80) 260#define OPC_ARITH_EvIz (0x81) 261#define OPC_ARITH_EvIb (0x83) 262#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ 263#define OPC_ANDN (0xf2 | P_EXT38) 264#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) 265#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) 266#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) 267#define OPC_BSF (0xbc | P_EXT) 268#define OPC_BSR (0xbd | P_EXT) 269#define OPC_BSWAP (0xc8 | P_EXT) 270#define OPC_CALL_Jz (0xe8) 271#define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */ 272#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) 273#define OPC_DEC_r32 (0x48) 274#define OPC_IMUL_GvEv (0xaf | P_EXT) 275#define OPC_IMUL_GvEvIb (0x6b) 276#define OPC_IMUL_GvEvIz (0x69) 277#define OPC_INC_r32 (0x40) 278#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ 279#define OPC_JCC_short (0x70) /* ... 
plus condition code */ 280#define OPC_JMP_long (0xe9) 281#define OPC_JMP_short (0xeb) 282#define OPC_LEA (0x8d) 283#define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3) 284#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ 285#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ 286#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ 287#define OPC_MOVB_EvIz (0xc6) 288#define OPC_MOVL_EvIz (0xc7) 289#define OPC_MOVB_Ib (0xb0) 290#define OPC_MOVL_Iv (0xb8) 291#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) 292#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) 293#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) 294#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) 295#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) 296#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) 297#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) 298#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) 299#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) 300#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) 301#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) 302#define OPC_MOVSBL (0xbe | P_EXT) 303#define OPC_MOVSWL (0xbf | P_EXT) 304#define OPC_MOVSLQ (0x63 | P_REXW) 305#define OPC_MOVZBL (0xb6 | P_EXT) 306#define OPC_MOVZWL (0xb7 | P_EXT) 307#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) 308#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) 309#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) 310#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 311#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) 312#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) 313#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) 314#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) 315#define OPC_PADDB (0xfc | P_EXT | P_DATA16) 316#define OPC_PADDW (0xfd | P_EXT | P_DATA16) 317#define OPC_PADDD (0xfe | P_EXT | P_DATA16) 318#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) 319#define OPC_PADDSB (0xec | P_EXT | P_DATA16) 320#define OPC_PADDSW (0xed | P_EXT | P_DATA16) 321#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) 322#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) 323#define OPC_PAND (0xdb | P_EXT | P_DATA16) 324#define OPC_PANDN (0xdf | P_EXT | P_DATA16) 325#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) 326#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) 327#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) 328#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) 329#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) 330#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) 331#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) 332#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) 333#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) 334#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) 335#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) 336#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) 337#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) 338#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) 339#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 340#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) 341#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) 342#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) 343#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 344#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) 345#define OPC_PMINSW (0xea | P_EXT | P_DATA16) 346#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) 347#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 348#define OPC_PMINUB (0xda | P_EXT | P_DATA16) 349#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) 350#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) 351#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 352#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) 
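
/*
 * Each OPC_* value here packs the final opcode byte in the low 8 bits and
 * ORs in P_* flags for the required escape and prefix bytes.  E.g.
 * OPC_PADDD is 0xfe | P_EXT | P_DATA16, which tcg_out_modrm() emits as
 * 66 0f fe /r (paddd), and which the VEX path maps to pp=1 (0x66) and
 * m-mmmm=1 (0x0f).
 */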
353#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) 354#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) 355#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) 356#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) 357#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) 358#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) 359#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) 360#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 361#define OPC_POR (0xeb | P_EXT | P_DATA16) 362#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) 363#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) 364#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) 365#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) 366#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ 367#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ 368#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ 369#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) 370#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) 371#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) 372#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) 373#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) 374#define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) 375#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) 376#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) 377#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) 378#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) 379#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) 380#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) 381#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) 382#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) 383#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) 384#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) 385#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) 386#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) 387#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) 388#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) 389#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) 390#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) 391#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) 392#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) 393#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) 394#define OPC_PXOR (0xef | P_EXT | P_DATA16) 395#define OPC_POP_r32 (0x58) 396#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) 397#define OPC_PUSH_r32 (0x50) 398#define OPC_PUSH_Iv (0x68) 399#define OPC_PUSH_Ib (0x6a) 400#define OPC_RET (0xc3) 401#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... 
plus cc */ 402#define OPC_SHIFT_1 (0xd1) 403#define OPC_SHIFT_Ib (0xc1) 404#define OPC_SHIFT_cl (0xd3) 405#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) 406#define OPC_SHUFPS (0xc6 | P_EXT) 407#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) 408#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) 409#define OPC_SHRD_Ib (0xac | P_EXT) 410#define OPC_TESTB (0x84) 411#define OPC_TESTL (0x85) 412#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) 413#define OPC_UD2 (0x0b | P_EXT) 414#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) 415#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) 416#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX) 417#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 418#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX) 419#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 420#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) 421#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) 422#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 423#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 424#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX) 425#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX) 426#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 427#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 428#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16) 429#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16) 430#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16) 431#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16) 432#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) 433#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) 434#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) 435#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) 436#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX) 437#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 438#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX) 439#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 440#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) 441#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) 442#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) 443#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 444#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) 445#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 446#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 447#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) 448#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 449#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 450#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) 451#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 452#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 453#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) 454#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 455#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 456#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) 457#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) 458#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 459#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) 460#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 461#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 462#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) 463#define OPC_VPSRLVQ (0x45 | P_EXT38 | 
P_DATA16 | P_VEXW) 464#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) 465#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX) 466#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 467#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX) 468#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) 469#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX) 470#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 471#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX) 472#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) 473#define OPC_VZEROUPPER (0x77 | P_EXT) 474#define OPC_XCHG_ax_r32 (0x90) 475#define OPC_XCHG_EvGv (0x87) 476 477#define OPC_GRP3_Eb (0xf6) 478#define OPC_GRP3_Ev (0xf7) 479#define OPC_GRP5 (0xff) 480#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) 481#define OPC_GRPBT (0xba | P_EXT) 482 483#define OPC_GRPBT_BT 4 484#define OPC_GRPBT_BTS 5 485#define OPC_GRPBT_BTR 6 486#define OPC_GRPBT_BTC 7 487 488/* Group 1 opcode extensions for 0x80-0x83. 489 These are also used as modifiers for OPC_ARITH. */ 490#define ARITH_ADD 0 491#define ARITH_OR 1 492#define ARITH_ADC 2 493#define ARITH_SBB 3 494#define ARITH_AND 4 495#define ARITH_SUB 5 496#define ARITH_XOR 6 497#define ARITH_CMP 7 498 499/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ 500#define SHIFT_ROL 0 501#define SHIFT_ROR 1 502#define SHIFT_SHL 4 503#define SHIFT_SHR 5 504#define SHIFT_SAR 7 505 506/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ 507#define EXT3_TESTi 0 508#define EXT3_NOT 2 509#define EXT3_NEG 3 510#define EXT3_MUL 4 511#define EXT3_IMUL 5 512#define EXT3_DIV 6 513#define EXT3_IDIV 7 514 515/* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */ 516#define EXT5_INC_Ev 0 517#define EXT5_DEC_Ev 1 518#define EXT5_CALLN_Ev 2 519#define EXT5_JMPN_Ev 4 520 521/* Condition codes to be added to OPC_JCC_{long,short}. */ 522#define JCC_JMP (-1) 523#define JCC_JO 0x0 524#define JCC_JNO 0x1 525#define JCC_JB 0x2 526#define JCC_JAE 0x3 527#define JCC_JE 0x4 528#define JCC_JNE 0x5 529#define JCC_JBE 0x6 530#define JCC_JA 0x7 531#define JCC_JS 0x8 532#define JCC_JNS 0x9 533#define JCC_JP 0xa 534#define JCC_JNP 0xb 535#define JCC_JL 0xc 536#define JCC_JGE 0xd 537#define JCC_JLE 0xe 538#define JCC_JG 0xf 539 540static const uint8_t tcg_cond_to_jcc[] = { 541 [TCG_COND_EQ] = JCC_JE, 542 [TCG_COND_NE] = JCC_JNE, 543 [TCG_COND_LT] = JCC_JL, 544 [TCG_COND_GE] = JCC_JGE, 545 [TCG_COND_LE] = JCC_JLE, 546 [TCG_COND_GT] = JCC_JG, 547 [TCG_COND_LTU] = JCC_JB, 548 [TCG_COND_GEU] = JCC_JAE, 549 [TCG_COND_LEU] = JCC_JBE, 550 [TCG_COND_GTU] = JCC_JA, 551 [TCG_COND_TSTEQ] = JCC_JE, 552 [TCG_COND_TSTNE] = JCC_JNE, 553}; 554 555#if TCG_TARGET_REG_BITS == 64 556static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) 557{ 558 int rex; 559 560 if (opc & P_GS) { 561 tcg_out8(s, 0x65); 562 } 563 if (opc & P_DATA16) { 564 /* We should never be asking for both 16 and 64-bit operation. */ 565 tcg_debug_assert((opc & P_REXW) == 0); 566 tcg_out8(s, 0x66); 567 } 568 if (opc & P_SIMDF3) { 569 tcg_out8(s, 0xf3); 570 } else if (opc & P_SIMDF2) { 571 tcg_out8(s, 0xf2); 572 } 573 574 rex = 0; 575 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ 576 rex |= (r & 8) >> 1; /* REX.R */ 577 rex |= (x & 8) >> 2; /* REX.X */ 578 rex |= (rm & 8) >> 3; /* REX.B */ 579 580 /* P_REXB_{R,RM} indicates that the given register is the low byte. 
581 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, 582 as otherwise the encoding indicates %[abcd]h. Note that the values 583 that are ORed in merely indicate that the REX byte must be present; 584 those bits get discarded in output. */ 585 rex |= opc & (r >= 4 ? P_REXB_R : 0); 586 rex |= opc & (rm >= 4 ? P_REXB_RM : 0); 587 588 if (rex) { 589 tcg_out8(s, (uint8_t)(rex | 0x40)); 590 } 591 592 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 593 tcg_out8(s, 0x0f); 594 if (opc & P_EXT38) { 595 tcg_out8(s, 0x38); 596 } else if (opc & P_EXT3A) { 597 tcg_out8(s, 0x3a); 598 } 599 } 600 601 tcg_out8(s, opc); 602} 603#else 604static void tcg_out_opc(TCGContext *s, int opc) 605{ 606 if (opc & P_DATA16) { 607 tcg_out8(s, 0x66); 608 } 609 if (opc & P_SIMDF3) { 610 tcg_out8(s, 0xf3); 611 } else if (opc & P_SIMDF2) { 612 tcg_out8(s, 0xf2); 613 } 614 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { 615 tcg_out8(s, 0x0f); 616 if (opc & P_EXT38) { 617 tcg_out8(s, 0x38); 618 } else if (opc & P_EXT3A) { 619 tcg_out8(s, 0x3a); 620 } 621 } 622 tcg_out8(s, opc); 623} 624/* Discard the register arguments to tcg_out_opc early, so as not to penalize 625 the 32-bit compilation paths. This method works with all versions of gcc, 626 whereas relying on optimization may not be able to exclude them. */ 627#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) 628#endif 629 630static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) 631{ 632 tcg_out_opc(s, opc, r, rm, 0); 633 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 634} 635 636static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, 637 int rm, int index) 638{ 639 int tmp; 640 641 if (opc & P_GS) { 642 tcg_out8(s, 0x65); 643 } 644 /* Use the two byte form if possible, which cannot encode 645 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ 646 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT 647 && ((rm | index) & 8) == 0) { 648 /* Two byte VEX prefix. */ 649 tcg_out8(s, 0xc5); 650 651 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ 652 } else { 653 /* Three byte VEX prefix. */ 654 tcg_out8(s, 0xc4); 655 656 /* VEX.m-mmmm */ 657 if (opc & P_EXT3A) { 658 tmp = 3; 659 } else if (opc & P_EXT38) { 660 tmp = 2; 661 } else if (opc & P_EXT) { 662 tmp = 1; 663 } else { 664 g_assert_not_reached(); 665 } 666 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ 667 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ 668 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ 669 tcg_out8(s, tmp); 670 671 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */ 672 } 673 674 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ 675 /* VEX.pp */ 676 if (opc & P_DATA16) { 677 tmp |= 1; /* 0x66 */ 678 } else if (opc & P_SIMDF3) { 679 tmp |= 2; /* 0xf3 */ 680 } else if (opc & P_SIMDF2) { 681 tmp |= 3; /* 0xf2 */ 682 } 683 tmp |= (~v & 15) << 3; /* VEX.vvvv */ 684 tcg_out8(s, tmp); 685 tcg_out8(s, opc); 686} 687 688static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, 689 int rm, int index, int aaa, bool z) 690{ 691 /* The entire 4-byte evex prefix; with R' and V' set. 
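   Emitted little-endian by tcg_out32() that is: the 0x62 escape byte,
   then P0 with the inverted R' bit preset, P1 with its fixed '1' bit,
   and P2 with the inverted V' bit preset; the deposit32() calls below
   fill in mm, RXB, pp, vvvv, W, aaa, L'L and z.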
*/ 692 uint32_t p = 0x08041062; 693 int mm, pp; 694 695 tcg_debug_assert(have_avx512vl); 696 697 /* EVEX.mm */ 698 if (opc & P_EXT3A) { 699 mm = 3; 700 } else if (opc & P_EXT38) { 701 mm = 2; 702 } else if (opc & P_EXT) { 703 mm = 1; 704 } else { 705 g_assert_not_reached(); 706 } 707 708 /* EVEX.pp */ 709 if (opc & P_DATA16) { 710 pp = 1; /* 0x66 */ 711 } else if (opc & P_SIMDF3) { 712 pp = 2; /* 0xf3 */ 713 } else if (opc & P_SIMDF2) { 714 pp = 3; /* 0xf2 */ 715 } else { 716 pp = 0; 717 } 718 719 p = deposit32(p, 8, 2, mm); 720 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ 721 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ 722 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ 723 p = deposit32(p, 16, 2, pp); 724 p = deposit32(p, 19, 4, ~v); 725 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); 726 p = deposit32(p, 24, 3, aaa); 727 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); 728 p = deposit32(p, 31, 1, z); 729 730 tcg_out32(s, p); 731 tcg_out8(s, opc); 732} 733 734static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) 735{ 736 if (opc & P_EVEX) { 737 tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false); 738 } else { 739 tcg_out_vex_opc(s, opc, r, v, rm, 0); 740 } 741 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 742} 743 744static void tcg_out_vex_modrm_type(TCGContext *s, int opc, 745 int r, int v, int rm, TCGType type) 746{ 747 if (type == TCG_TYPE_V256) { 748 opc |= P_VEXL; 749 } 750 tcg_out_vex_modrm(s, opc, r, v, rm); 751} 752 753static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v, 754 int rm, int aaa, bool z, TCGType type) 755{ 756 if (type == TCG_TYPE_V256) { 757 opc |= P_VEXL; 758 } 759 tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z); 760 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 761} 762 763/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. 764 We handle either RM and INDEX missing with a negative value. In 64-bit 765 mode for absolute addresses, ~RM is the size of the immediate operand 766 that will follow the instruction. */ 767 768static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, 769 int shift, intptr_t offset) 770{ 771 int mod, len; 772 773 if (index < 0 && rm < 0) { 774 if (TCG_TARGET_REG_BITS == 64) { 775 /* Try for a rip-relative addressing mode. This has replaced 776 the 32-bit-mode absolute addressing encoding. */ 777 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; 778 intptr_t disp = offset - pc; 779 if (disp == (int32_t)disp) { 780 tcg_out8(s, (LOWREGMASK(r) << 3) | 5); 781 tcg_out32(s, disp); 782 return; 783 } 784 785 /* Try for an absolute address encoding. This requires the 786 use of the MODRM+SIB encoding and is therefore larger than 787 rip-relative addressing. */ 788 if (offset == (int32_t)offset) { 789 tcg_out8(s, (LOWREGMASK(r) << 3) | 4); 790 tcg_out8(s, (4 << 3) | 5); 791 tcg_out32(s, offset); 792 return; 793 } 794 795 /* ??? The memory isn't directly addressable. */ 796 g_assert_not_reached(); 797 } else { 798 /* Absolute address. */ 799 tcg_out8(s, (r << 3) | 5); 800 tcg_out32(s, offset); 801 return; 802 } 803 } 804 805 /* Find the length of the immediate addend. Note that the encoding 806 that would be used for (%ebp) indicates absolute addressing. 
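   mod=00 takes no displacement, mod=01 an 8-bit and mod=10 a 32-bit
   displacement; a zero offset from %ebp/%rbp therefore still needs
   the one-byte displacement chosen below.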
*/ 807 if (rm < 0) { 808 mod = 0, len = 4, rm = 5; 809 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { 810 mod = 0, len = 0; 811 } else if (offset == (int8_t)offset) { 812 mod = 0x40, len = 1; 813 } else { 814 mod = 0x80, len = 4; 815 } 816 817 /* Use a single byte MODRM format if possible. Note that the encoding 818 that would be used for %esp is the escape to the two byte form. */ 819 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { 820 /* Single byte MODRM format. */ 821 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); 822 } else { 823 /* Two byte MODRM+SIB format. */ 824 825 /* Note that the encoding that would place %esp into the index 826 field indicates no index register. In 64-bit mode, the REX.X 827 bit counts, so %r12 can be used as the index. */ 828 if (index < 0) { 829 index = 4; 830 } else { 831 tcg_debug_assert(index != TCG_REG_ESP); 832 } 833 834 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); 835 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); 836 } 837 838 if (len == 1) { 839 tcg_out8(s, offset); 840 } else if (len == 4) { 841 tcg_out32(s, offset); 842 } 843} 844 845static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, 846 int index, int shift, intptr_t offset) 847{ 848 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 849 tcg_out_sib_offset(s, r, rm, index, shift, offset); 850} 851 852static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, 853 int rm, int index, int shift, 854 intptr_t offset) 855{ 856 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); 857 tcg_out_sib_offset(s, r, rm, index, shift, offset); 858} 859 860/* A simplification of the above with no index or shift. */ 861static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, 862 int rm, intptr_t offset) 863{ 864 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); 865} 866 867static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, 868 int v, int rm, intptr_t offset) 869{ 870 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); 871} 872 873/* Output an opcode with an expected reference to the constant pool. */ 874static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) 875{ 876 tcg_out_opc(s, opc, r, 0, 0); 877 /* Absolute for 32-bit, pc-relative for 64-bit. */ 878 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 879 tcg_out32(s, 0); 880} 881 882/* Output an opcode with an expected reference to the constant pool. */ 883static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) 884{ 885 tcg_out_vex_opc(s, opc, r, 0, 0, 0); 886 /* Absolute for 32-bit, pc-relative for 64-bit. */ 887 tcg_out8(s, LOWREGMASK(r) << 3 | 5); 888 tcg_out32(s, 0); 889} 890 891/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ 892static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) 893{ 894 /* Propagate an opcode prefix, such as P_REXW. 
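   E.g. tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RAX, TCG_REG_RCX)
   resolves to opcode 0x03 (add Gv,Ev) plus REX.W, emitting the bytes
   48 03 c1, i.e. "addq %rcx, %rax".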
*/ 895 int ext = subop & ~0x7; 896 subop &= 0x7; 897 898 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); 899} 900 901static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) 902{ 903 int rexw = 0; 904 905 if (arg == ret) { 906 return true; 907 } 908 switch (type) { 909 case TCG_TYPE_I64: 910 rexw = P_REXW; 911 /* fallthru */ 912 case TCG_TYPE_I32: 913 if (ret < 16) { 914 if (arg < 16) { 915 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); 916 } else { 917 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); 918 } 919 } else { 920 if (arg < 16) { 921 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); 922 } else { 923 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 924 } 925 } 926 break; 927 928 case TCG_TYPE_V64: 929 tcg_debug_assert(ret >= 16 && arg >= 16); 930 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); 931 break; 932 case TCG_TYPE_V128: 933 tcg_debug_assert(ret >= 16 && arg >= 16); 934 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); 935 break; 936 case TCG_TYPE_V256: 937 tcg_debug_assert(ret >= 16 && arg >= 16); 938 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); 939 break; 940 941 default: 942 g_assert_not_reached(); 943 } 944 return true; 945} 946 947static const int avx2_dup_insn[4] = { 948 OPC_VPBROADCASTB, OPC_VPBROADCASTW, 949 OPC_VPBROADCASTD, OPC_VPBROADCASTQ, 950}; 951 952static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, 953 TCGReg r, TCGReg a) 954{ 955 if (have_avx2) { 956 tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type); 957 } else { 958 switch (vece) { 959 case MO_8: 960 /* ??? With zero in a register, use PSHUFB. */ 961 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a); 962 a = r; 963 /* FALLTHRU */ 964 case MO_16: 965 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a); 966 a = r; 967 /* FALLTHRU */ 968 case MO_32: 969 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); 970 /* imm8 operand: all output lanes selected from input lane 0. */ 971 tcg_out8(s, 0); 972 break; 973 case MO_64: 974 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a); 975 break; 976 default: 977 g_assert_not_reached(); 978 } 979 } 980 return true; 981} 982 983static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, 984 TCGReg r, TCGReg base, intptr_t offset) 985{ 986 if (have_avx2) { 987 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); 988 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l, 989 r, 0, base, offset); 990 } else { 991 switch (vece) { 992 case MO_64: 993 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset); 994 break; 995 case MO_32: 996 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset); 997 break; 998 case MO_16: 999 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset); 1000 tcg_out8(s, 0); /* imm8 */ 1001 tcg_out_dup_vec(s, type, vece, r, r); 1002 break; 1003 case MO_8: 1004 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset); 1005 tcg_out8(s, 0); /* imm8 */ 1006 tcg_out_dup_vec(s, type, vece, r, r); 1007 break; 1008 default: 1009 g_assert_not_reached(); 1010 } 1011 } 1012 return true; 1013} 1014 1015static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, 1016 TCGReg ret, int64_t arg) 1017{ 1018 int vex_l = (type == TCG_TYPE_V256 ? 
P_VEXL : 0); 1019 1020 if (arg == 0) { 1021 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1022 return; 1023 } 1024 if (arg == -1) { 1025 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); 1026 return; 1027 } 1028 1029 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) { 1030 if (have_avx2) { 1031 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); 1032 } else { 1033 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret); 1034 } 1035 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1036 } else { 1037 if (type == TCG_TYPE_V64) { 1038 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); 1039 } else if (have_avx2) { 1040 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); 1041 } else { 1042 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); 1043 } 1044 if (TCG_TARGET_REG_BITS == 64) { 1045 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1046 } else { 1047 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32); 1048 } 1049 } 1050} 1051 1052static void tcg_out_movi_vec(TCGContext *s, TCGType type, 1053 TCGReg ret, tcg_target_long arg) 1054{ 1055 if (arg == 0) { 1056 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); 1057 return; 1058 } 1059 if (arg == -1) { 1060 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret); 1061 return; 1062 } 1063 1064 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 1065 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret); 1066 if (TCG_TARGET_REG_BITS == 64) { 1067 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); 1068 } else { 1069 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); 1070 } 1071} 1072 1073static void tcg_out_movi_int(TCGContext *s, TCGType type, 1074 TCGReg ret, tcg_target_long arg) 1075{ 1076 tcg_target_long diff; 1077 1078 if (arg == 0) { 1079 tgen_arithr(s, ARITH_XOR, ret, ret); 1080 return; 1081 } 1082 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { 1083 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); 1084 tcg_out32(s, arg); 1085 return; 1086 } 1087 if (arg == (int32_t)arg) { 1088 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); 1089 tcg_out32(s, arg); 1090 return; 1091 } 1092 1093 /* Try a 7 byte pc-relative lea before the 10 byte movq. */ 1094 diff = tcg_pcrel_diff(s, (const void *)arg) - 7; 1095 if (diff == (int32_t)diff) { 1096 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0); 1097 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5); 1098 tcg_out32(s, diff); 1099 return; 1100 } 1101 1102 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); 1103 tcg_out64(s, arg); 1104} 1105 1106static void tcg_out_movi(TCGContext *s, TCGType type, 1107 TCGReg ret, tcg_target_long arg) 1108{ 1109 switch (type) { 1110 case TCG_TYPE_I32: 1111#if TCG_TARGET_REG_BITS == 64 1112 case TCG_TYPE_I64: 1113#endif 1114 if (ret < 16) { 1115 tcg_out_movi_int(s, type, ret, arg); 1116 } else { 1117 tcg_out_movi_vec(s, type, ret, arg); 1118 } 1119 break; 1120 default: 1121 g_assert_not_reached(); 1122 } 1123} 1124 1125static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) 1126{ 1127 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1128 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2); 1129 return true; 1130} 1131 1132static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, 1133 tcg_target_long imm) 1134{ 1135 /* This function is only used for passing structs by reference. 
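   It emits a single LEA, e.g. "leaq 16(%rsp), %rdi" to pass the
   address of a stack slot.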
 */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need to care
       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)"
       is faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
1247 * 1248 * This specific instance is also used by TCG_CALL_RET_BY_VEC, 1249 * for _WIN64, which must have SSE2 but may not have AVX. 1250 */ 1251 tcg_debug_assert(arg >= 16); 1252 if (have_avx1) { 1253 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2); 1254 } else { 1255 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2); 1256 } 1257 break; 1258 case TCG_TYPE_V256: 1259 /* 1260 * The gvec infrastructure only requires 16-byte alignment, 1261 * so here we must use an unaligned store. 1262 */ 1263 tcg_debug_assert(arg >= 16); 1264 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, 1265 arg, 0, arg1, arg2); 1266 break; 1267 default: 1268 g_assert_not_reached(); 1269 } 1270} 1271 1272static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, 1273 TCGReg base, intptr_t ofs) 1274{ 1275 int rexw = 0; 1276 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) { 1277 if (val != (int32_t)val) { 1278 return false; 1279 } 1280 rexw = P_REXW; 1281 } else if (type != TCG_TYPE_I32) { 1282 return false; 1283 } 1284 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); 1285 tcg_out32(s, val); 1286 return true; 1287} 1288 1289static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) 1290{ 1291 /* Propagate an opcode prefix, such as P_DATA16. */ 1292 int ext = subopc & ~0x7; 1293 subopc &= 0x7; 1294 1295 if (count == 1) { 1296 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); 1297 } else { 1298 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); 1299 tcg_out8(s, count); 1300 } 1301} 1302 1303static inline void tcg_out_bswap32(TCGContext *s, int reg) 1304{ 1305 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); 1306} 1307 1308static inline void tcg_out_rolw_8(TCGContext *s, int reg) 1309{ 1310 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); 1311} 1312 1313static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src) 1314{ 1315 /* movzbl */ 1316 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1317 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); 1318} 1319 1320static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1321{ 1322 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1323 /* movsbl */ 1324 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64); 1325 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); 1326} 1327 1328static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src) 1329{ 1330 /* movzwl */ 1331 tcg_out_modrm(s, OPC_MOVZWL, dest, src); 1332} 1333 1334static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src) 1335{ 1336 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 1337 /* movsw[lq] */ 1338 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); 1339} 1340 1341static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src) 1342{ 1343 /* 32-bit mov zero extends. 
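   On x86-64, writing a 32-bit register clears the upper 32 bits of the
   full register, so a plain "movl" is all that is needed here.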
*/ 1344 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); 1345} 1346 1347static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src) 1348{ 1349 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 1350 tcg_out_modrm(s, OPC_MOVSLQ, dest, src); 1351} 1352 1353static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1354{ 1355 tcg_out_ext32s(s, dest, src); 1356} 1357 1358static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) 1359{ 1360 if (dest != src) { 1361 tcg_out_ext32u(s, dest, src); 1362 } 1363} 1364 1365static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) 1366{ 1367 tcg_out_ext32u(s, dest, src); 1368} 1369 1370static inline void tcg_out_bswap64(TCGContext *s, int reg) 1371{ 1372 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); 1373} 1374 1375static void tgen_arithi(TCGContext *s, int c, int r0, 1376 tcg_target_long val, int cf) 1377{ 1378 int rexw = 0; 1379 1380 if (TCG_TARGET_REG_BITS == 64) { 1381 rexw = c & -8; 1382 c &= 7; 1383 } 1384 1385 switch (c) { 1386 case ARITH_ADD: 1387 case ARITH_SUB: 1388 if (!cf) { 1389 /* 1390 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce 1391 * partial flags update stalls on Pentium4 and are not recommended 1392 * by current Intel optimization manuals. 1393 */ 1394 if (val == 1 || val == -1) { 1395 int is_inc = (c == ARITH_ADD) ^ (val < 0); 1396 if (TCG_TARGET_REG_BITS == 64) { 1397 /* 1398 * The single-byte increment encodings are re-tasked 1399 * as the REX prefixes. Use the MODRM encoding. 1400 */ 1401 tcg_out_modrm(s, OPC_GRP5 + rexw, 1402 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); 1403 } else { 1404 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); 1405 } 1406 return; 1407 } 1408 if (val == 128) { 1409 /* 1410 * Facilitate using an 8-bit immediate. Carry is inverted 1411 * by this transformation, so do it only if cf == 0. 1412 */ 1413 c ^= ARITH_ADD ^ ARITH_SUB; 1414 val = -128; 1415 } 1416 } 1417 break; 1418 1419 case ARITH_AND: 1420 if (TCG_TARGET_REG_BITS == 64) { 1421 if (val == 0xffffffffu) { 1422 tcg_out_ext32u(s, r0, r0); 1423 return; 1424 } 1425 if (val == (uint32_t)val) { 1426 /* AND with no high bits set can use a 32-bit operation. */ 1427 rexw = 0; 1428 } 1429 } 1430 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1431 tcg_out_ext8u(s, r0, r0); 1432 return; 1433 } 1434 if (val == 0xffffu) { 1435 tcg_out_ext16u(s, r0, r0); 1436 return; 1437 } 1438 break; 1439 1440 case ARITH_OR: 1441 case ARITH_XOR: 1442 if (val >= 0x80 && val <= 0xff 1443 && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { 1444 tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0); 1445 tcg_out8(s, val); 1446 return; 1447 } 1448 break; 1449 } 1450 1451 if (val == (int8_t)val) { 1452 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); 1453 tcg_out8(s, val); 1454 return; 1455 } 1456 if (rexw == 0 || val == (int32_t)val) { 1457 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); 1458 tcg_out32(s, val); 1459 return; 1460 } 1461 1462 g_assert_not_reached(); 1463} 1464 1465static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) 1466{ 1467 if (val != 0) { 1468 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); 1469 } 1470} 1471 1472/* Set SMALL to force a short forward branch. 
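   Short Jcc/JMP forms are opcode plus rel8 (2 bytes); the long forms
   are E9 rel32 (5 bytes) for JMP and 0F 8x rel32 (6 bytes) for Jcc,
   hence the -2, -5 and -6 displacement adjustments below.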
*/ 1473static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small) 1474{ 1475 int32_t val, val1; 1476 1477 if (l->has_value) { 1478 val = tcg_pcrel_diff(s, l->u.value_ptr); 1479 val1 = val - 2; 1480 if ((int8_t)val1 == val1) { 1481 if (opc == -1) { 1482 tcg_out8(s, OPC_JMP_short); 1483 } else { 1484 tcg_out8(s, OPC_JCC_short + opc); 1485 } 1486 tcg_out8(s, val1); 1487 } else { 1488 tcg_debug_assert(!small); 1489 if (opc == -1) { 1490 tcg_out8(s, OPC_JMP_long); 1491 tcg_out32(s, val - 5); 1492 } else { 1493 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1494 tcg_out32(s, val - 6); 1495 } 1496 } 1497 } else if (small) { 1498 if (opc == -1) { 1499 tcg_out8(s, OPC_JMP_short); 1500 } else { 1501 tcg_out8(s, OPC_JCC_short + opc); 1502 } 1503 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1); 1504 s->code_ptr += 1; 1505 } else { 1506 if (opc == -1) { 1507 tcg_out8(s, OPC_JMP_long); 1508 } else { 1509 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); 1510 } 1511 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4); 1512 s->code_ptr += 4; 1513 } 1514} 1515 1516static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1, 1517 TCGArg arg2, int const_arg2, int rexw) 1518{ 1519 int jz, js; 1520 1521 if (!is_tst_cond(cond)) { 1522 if (!const_arg2) { 1523 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); 1524 } else if (arg2 == 0) { 1525 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); 1526 } else { 1527 tcg_debug_assert(!rexw || arg2 == (int32_t)arg2); 1528 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); 1529 } 1530 return tcg_cond_to_jcc[cond]; 1531 } 1532 1533 jz = tcg_cond_to_jcc[cond]; 1534 js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS); 1535 1536 if (!const_arg2) { 1537 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2); 1538 return jz; 1539 } 1540 1541 if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) { 1542 if (arg2 == 0x80) { 1543 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1544 return js; 1545 } 1546 if (arg2 == 0xff) { 1547 tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1); 1548 return jz; 1549 } 1550 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1); 1551 tcg_out8(s, arg2); 1552 return jz; 1553 } 1554 1555 if ((arg2 & ~0xff00) == 0 && arg1 < 4) { 1556 if (arg2 == 0x8000) { 1557 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1558 return js; 1559 } 1560 if (arg2 == 0xff00) { 1561 tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4); 1562 return jz; 1563 } 1564 tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4); 1565 tcg_out8(s, arg2 >> 8); 1566 return jz; 1567 } 1568 1569 if (arg2 == 0xffff) { 1570 tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1); 1571 return jz; 1572 } 1573 if (arg2 == 0xffffffffu) { 1574 tcg_out_modrm(s, OPC_TESTL, arg1, arg1); 1575 return jz; 1576 } 1577 1578 if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) { 1579 int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE); 1580 int sh = ctz64(arg2); 1581 1582 rexw = (sh & 32 ? 
P_REXW : 0); 1583 if ((sh & 31) == 31) { 1584 tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1); 1585 return js; 1586 } else { 1587 tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1); 1588 tcg_out8(s, sh); 1589 return jc; 1590 } 1591 } 1592 1593 if (rexw) { 1594 if (arg2 == (uint32_t)arg2) { 1595 rexw = 0; 1596 } else { 1597 tcg_debug_assert(arg2 == (int32_t)arg2); 1598 } 1599 } 1600 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1); 1601 tcg_out32(s, arg2); 1602 return jz; 1603} 1604 1605static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond, 1606 TCGArg arg1, TCGArg arg2, int const_arg2, 1607 TCGLabel *label, bool small) 1608{ 1609 int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw); 1610 tcg_out_jxx(s, jcc, label, small); 1611} 1612 1613#if TCG_TARGET_REG_BITS == 32 1614static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, 1615 const int *const_args, bool small) 1616{ 1617 TCGLabel *label_next = gen_new_label(); 1618 TCGLabel *label_this = arg_label(args[5]); 1619 TCGCond cond = args[4]; 1620 1621 switch (cond) { 1622 case TCG_COND_EQ: 1623 case TCG_COND_TSTEQ: 1624 tcg_out_brcond(s, 0, tcg_invert_cond(cond), 1625 args[0], args[2], const_args[2], label_next, 1); 1626 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1627 label_this, small); 1628 break; 1629 case TCG_COND_NE: 1630 case TCG_COND_TSTNE: 1631 tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2], 1632 label_this, small); 1633 tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3], 1634 label_this, small); 1635 break; 1636 case TCG_COND_LT: 1637 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1638 label_this, small); 1639 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1640 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1641 label_this, small); 1642 break; 1643 case TCG_COND_LE: 1644 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3], 1645 label_this, small); 1646 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1647 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1648 label_this, small); 1649 break; 1650 case TCG_COND_GT: 1651 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1652 label_this, small); 1653 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1654 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1655 label_this, small); 1656 break; 1657 case TCG_COND_GE: 1658 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3], 1659 label_this, small); 1660 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1661 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1662 label_this, small); 1663 break; 1664 case TCG_COND_LTU: 1665 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1666 label_this, small); 1667 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1668 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2], 1669 label_this, small); 1670 break; 1671 case TCG_COND_LEU: 1672 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3], 1673 label_this, small); 1674 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1675 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2], 1676 label_this, small); 1677 break; 1678 case TCG_COND_GTU: 1679 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3], 1680 label_this, small); 1681 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1682 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2], 1683 label_this, small); 1684 break; 1685 case TCG_COND_GEU: 1686 tcg_out_brcond(s, 0, 
TCG_COND_GTU, args[1], args[3], const_args[3], 1687 label_this, small); 1688 tcg_out_jxx(s, JCC_JNE, label_next, 1); 1689 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2], 1690 label_this, small); 1691 break; 1692 default: 1693 g_assert_not_reached(); 1694 } 1695 tcg_out_label(s, label_next); 1696} 1697#endif 1698 1699static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond, 1700 TCGArg dest, TCGArg arg1, TCGArg arg2, 1701 int const_arg2, bool neg) 1702{ 1703 int cmp_rexw = rexw; 1704 bool inv = false; 1705 bool cleared; 1706 int jcc; 1707 1708 switch (cond) { 1709 case TCG_COND_NE: 1710 inv = true; 1711 /* fall through */ 1712 case TCG_COND_EQ: 1713 /* If arg2 is 0, convert to LTU/GEU vs 1. */ 1714 if (const_arg2 && arg2 == 0) { 1715 arg2 = 1; 1716 goto do_ltu; 1717 } 1718 break; 1719 1720 case TCG_COND_TSTNE: 1721 inv = true; 1722 /* fall through */ 1723 case TCG_COND_TSTEQ: 1724 /* If arg2 is -1, convert to LTU/GEU vs 1. */ 1725 if (const_arg2 && arg2 == 0xffffffffu) { 1726 arg2 = 1; 1727 cmp_rexw = 0; 1728 goto do_ltu; 1729 } 1730 break; 1731 1732 case TCG_COND_LEU: 1733 inv = true; 1734 /* fall through */ 1735 case TCG_COND_GTU: 1736 /* If arg2 is a register, swap for LTU/GEU. */ 1737 if (!const_arg2) { 1738 TCGReg t = arg1; 1739 arg1 = arg2; 1740 arg2 = t; 1741 goto do_ltu; 1742 } 1743 break; 1744 1745 case TCG_COND_GEU: 1746 inv = true; 1747 /* fall through */ 1748 case TCG_COND_LTU: 1749 do_ltu: 1750 /* 1751 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU. 1752 * We can then use NEG or INC to produce the desired result. 1753 * This is always smaller than the SETCC expansion. 1754 */ 1755 tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw); 1756 1757 /* X - X - C = -C = (C ? -1 : 0) */ 1758 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest); 1759 if (inv && neg) { 1760 /* ~(C ? -1 : 0) = (C ? 0 : -1) */ 1761 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1762 } else if (inv) { 1763 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */ 1764 tgen_arithi(s, ARITH_ADD, dest, 1, 0); 1765 } else if (!neg) { 1766 /* -(C ? -1 : 0) = (C ? 1 : 0) */ 1767 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest); 1768 } 1769 return; 1770 1771 case TCG_COND_GE: 1772 inv = true; 1773 /* fall through */ 1774 case TCG_COND_LT: 1775 /* If arg2 is 0, extract the sign bit. */ 1776 if (const_arg2 && arg2 == 0) { 1777 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1); 1778 if (inv) { 1779 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest); 1780 } 1781 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw, 1782 dest, rexw ? 63 : 31); 1783 return; 1784 } 1785 break; 1786 1787 default: 1788 break; 1789 } 1790 1791 /* 1792 * If dest does not overlap the inputs, clearing it first is preferred. 1793 * The XOR breaks any false dependency for the low-byte write to dest, 1794 * and is also one byte smaller than MOVZBL. 
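 * The sequence is then e.g. "xorl %eax,%eax; cmpl ...; sete %al"
 * rather than "cmpl ...; sete %al; movzbl %al,%eax".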
1795 */ 1796 cleared = false; 1797 if (dest != arg1 && (const_arg2 || dest != arg2)) { 1798 tgen_arithr(s, ARITH_XOR, dest, dest); 1799 cleared = true; 1800 } 1801 1802 jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw); 1803 tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest); 1804 1805 if (!cleared) { 1806 tcg_out_ext8u(s, dest, dest); 1807 } 1808 if (neg) { 1809 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest); 1810 } 1811} 1812 1813#if TCG_TARGET_REG_BITS == 32 1814static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, 1815 const int *const_args) 1816{ 1817 TCGArg new_args[6]; 1818 TCGLabel *label_true, *label_over; 1819 1820 memcpy(new_args, args+1, 5*sizeof(TCGArg)); 1821 1822 if (args[0] == args[1] || args[0] == args[2] 1823 || (!const_args[3] && args[0] == args[3]) 1824 || (!const_args[4] && args[0] == args[4])) { 1825 /* When the destination overlaps with one of the argument 1826 registers, don't do anything tricky. */ 1827 label_true = gen_new_label(); 1828 label_over = gen_new_label(); 1829 1830 new_args[5] = label_arg(label_true); 1831 tcg_out_brcond2(s, new_args, const_args+1, 1); 1832 1833 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1834 tcg_out_jxx(s, JCC_JMP, label_over, 1); 1835 tcg_out_label(s, label_true); 1836 1837 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); 1838 tcg_out_label(s, label_over); 1839 } else { 1840 /* When the destination does not overlap one of the arguments, 1841 clear the destination first, jump if cond false, and emit an 1842 increment in the true case. This results in smaller code. */ 1843 1844 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); 1845 1846 label_over = gen_new_label(); 1847 new_args[4] = tcg_invert_cond(new_args[4]); 1848 new_args[5] = label_arg(label_over); 1849 tcg_out_brcond2(s, new_args, const_args+1, 1); 1850 1851 tgen_arithi(s, ARITH_ADD, args[0], 1, 0); 1852 tcg_out_label(s, label_over); 1853 } 1854} 1855#endif 1856 1857static void tcg_out_cmov(TCGContext *s, int jcc, int rexw, 1858 TCGReg dest, TCGReg v1) 1859{ 1860 tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1); 1861} 1862 1863static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond, 1864 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2, 1865 TCGReg v1) 1866{ 1867 int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw); 1868 tcg_out_cmov(s, jcc, rexw, dest, v1); 1869} 1870 1871static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1872 TCGArg arg2, bool const_a2) 1873{ 1874 if (have_bmi1) { 1875 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1); 1876 if (const_a2) { 1877 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1878 } else { 1879 tcg_debug_assert(dest != arg2); 1880 tcg_out_cmov(s, JCC_JB, rexw, dest, arg2); 1881 } 1882 } else { 1883 tcg_debug_assert(dest != arg2); 1884 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1); 1885 tcg_out_cmov(s, JCC_JE, rexw, dest, arg2); 1886 } 1887} 1888 1889static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1, 1890 TCGArg arg2, bool const_a2) 1891{ 1892 if (have_lzcnt) { 1893 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1); 1894 if (const_a2) { 1895 tcg_debug_assert(arg2 == (rexw ? 64 : 32)); 1896 } else { 1897 tcg_debug_assert(dest != arg2); 1898 tcg_out_cmov(s, JCC_JB, rexw, dest, arg2); 1899 } 1900 } else { 1901 tcg_debug_assert(!const_a2); 1902 tcg_debug_assert(dest != arg1); 1903 tcg_debug_assert(dest != arg2); 1904 1905 /* Recall that the output of BSR is the index not the count. */ 1906 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1); 1907 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 
63 : 31, 0); 1908 1909 /* Since we have destroyed the flags from BSR, we have to re-test. */ 1910 int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw); 1911 tcg_out_cmov(s, jcc, rexw, dest, arg2); 1912 } 1913} 1914 1915static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest) 1916{ 1917 intptr_t disp = tcg_pcrel_diff(s, dest) - 5; 1918 1919 if (disp == (int32_t)disp) { 1920 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); 1921 tcg_out32(s, disp); 1922 } else { 1923 /* rip-relative addressing into the constant pool. 1924 This is 6 + 8 = 14 bytes, as compared to using an 1925 immediate load 10 + 6 = 16 bytes, plus we may 1926 be able to re-use the pool constant for more calls. */ 1927 tcg_out_opc(s, OPC_GRP5, 0, 0, 0); 1928 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5); 1929 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4); 1930 tcg_out32(s, 0); 1931 } 1932} 1933 1934static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest, 1935 const TCGHelperInfo *info) 1936{ 1937 tcg_out_branch(s, 1, dest); 1938 1939#ifndef _WIN32 1940 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) { 1941 /* 1942 * The sysv i386 abi for struct return places a reference as the 1943 * first argument of the stack, and pops that argument with the 1944 * return statement. Since we want to retain the aligned stack 1945 * pointer for the callee, we do not want to actually push that 1946 * argument before the call but rely on the normal store to the 1947 * stack slot. But we do need to compensate for the pop in order 1948 * to reset our correct stack pointer value. 1949 * Pushing a garbage value back onto the stack is quickest. 1950 */ 1951 tcg_out_push(s, TCG_REG_EAX); 1952 } 1953#endif 1954} 1955 1956static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest) 1957{ 1958 tcg_out_branch(s, 0, dest); 1959} 1960 1961static void tcg_out_nopn(TCGContext *s, int n) 1962{ 1963 int i; 1964 /* Emit 1 or 2 operand size prefixes for the standard one byte nop, 1965 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the 1966 * duplicate prefix, and all of the interesting recent cores can 1967 * decode and discard the duplicates in a single cycle. 1968 */ 1969 tcg_debug_assert(n >= 1); 1970 for (i = 1; i < n; ++i) { 1971 tcg_out8(s, 0x66); 1972 } 1973 tcg_out8(s, 0x90); 1974} 1975 1976typedef struct { 1977 TCGReg base; 1978 int index; 1979 int ofs; 1980 int seg; 1981 TCGAtomAlign aa; 1982} HostAddress; 1983 1984bool tcg_target_has_memory_bswap(MemOp memop) 1985{ 1986 TCGAtomAlign aa; 1987 1988 if (!have_movbe) { 1989 return false; 1990 } 1991 if ((memop & MO_SIZE) < MO_128) { 1992 return true; 1993 } 1994 1995 /* 1996 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, 1997 * but do allow a pair of 64-bit operations, i.e. MOVBEQ. 1998 */ 1999 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); 2000 return aa.atom < MO_128; 2001} 2002 2003/* 2004 * Because i686 has no register parameters and because x86_64 has xchg 2005 * to handle addr/data register overlap, we have placed all input arguments 2006 * before we need might need a scratch reg. 2007 * 2008 * Even then, a scratch is only needed for l->raddr. Rather than expose 2009 * a general-purpose scratch when we don't actually know it's available, 2010 * use the ra_gen hook to load into RAX if needed. 
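 *
 * Illustratively, when no free argument register is available the hook
 * below reduces to loading the raw pointer, roughly
 *     mov[abs] $l->raddr, %rax
 * and reporting TCG_REG_RAX as the location of the return address.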
2011 */ 2012#if TCG_TARGET_REG_BITS == 64 2013static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) 2014{ 2015 if (arg < 0) { 2016 arg = TCG_REG_RAX; 2017 } 2018 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr); 2019 return arg; 2020} 2021static const TCGLdstHelperParam ldst_helper_param = { 2022 .ra_gen = ldst_ra_gen 2023}; 2024#else 2025static const TCGLdstHelperParam ldst_helper_param = { }; 2026#endif 2027 2028static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, 2029 TCGReg l, TCGReg h, TCGReg v) 2030{ 2031 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2032 2033 /* vpmov{d,q} %v, %l */ 2034 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); 2035 /* vpextr{d,q} $1, %v, %h */ 2036 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); 2037 tcg_out8(s, 1); 2038} 2039 2040static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, 2041 TCGReg v, TCGReg l, TCGReg h) 2042{ 2043 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; 2044 2045 /* vmov{d,q} %l, %v */ 2046 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); 2047 /* vpinsr{d,q} $1, %h, %v, %v */ 2048 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); 2049 tcg_out8(s, 1); 2050} 2051 2052/* 2053 * Generate code for the slow path for a load at the end of block 2054 */ 2055static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2056{ 2057 MemOp opc = get_memop(l->oi); 2058 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2059 2060 /* resolve label address */ 2061 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2062 if (label_ptr[1]) { 2063 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2064 } 2065 2066 tcg_out_ld_helper_args(s, l, &ldst_helper_param); 2067 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); 2068 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); 2069 2070 tcg_out_jmp(s, l->raddr); 2071 return true; 2072} 2073 2074/* 2075 * Generate code for the slow path for a store at the end of block 2076 */ 2077static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) 2078{ 2079 MemOp opc = get_memop(l->oi); 2080 tcg_insn_unit **label_ptr = &l->label_ptr[0]; 2081 2082 /* resolve label address */ 2083 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); 2084 if (label_ptr[1]) { 2085 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); 2086 } 2087 2088 tcg_out_st_helper_args(s, l, &ldst_helper_param); 2089 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); 2090 2091 tcg_out_jmp(s, l->raddr); 2092 return true; 2093} 2094 2095#ifdef CONFIG_USER_ONLY 2096static HostAddress x86_guest_base = { 2097 .index = -1 2098}; 2099 2100#if defined(__x86_64__) && defined(__linux__) 2101# include <asm/prctl.h> 2102# include <sys/prctl.h> 2103int arch_prctl(int code, unsigned long addr); 2104static inline int setup_guest_base_seg(void) 2105{ 2106 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) { 2107 return P_GS; 2108 } 2109 return 0; 2110} 2111#define setup_guest_base_seg setup_guest_base_seg 2112#elif defined(__x86_64__) && \ 2113 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) 2114# include <machine/sysarch.h> 2115static inline int setup_guest_base_seg(void) 2116{ 2117 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) { 2118 return P_GS; 2119 } 2120 return 0; 2121} 2122#define setup_guest_base_seg setup_guest_base_seg 2123#endif 2124#else 2125# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) 2126#endif /* CONFIG_USER_ONLY */ 2127#ifndef setup_guest_base_seg 2128# define setup_guest_base_seg() 0 2129#endif 2130 
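/*
 * Illustration only -- a hypothetical helper, not used by the backend:
 * the integer arithmetic mirrored by the softmmu fast path emitted in
 * prepare_host_addr() below.  'mask' and 'table' stand for the
 * CPUTLBDescFast fields of the same names, which the generated code
 * loads from TCG_AREG0 instead of taking as parameters.
 */
static inline uintptr_t tlb_entry_sketch(uint64_t addr, unsigned page_bits,
                                         uintptr_t mask, uintptr_t table)
{
    /* shr $(page_bits - CPU_TLB_ENTRY_BITS), %l0 ; and mask, %l0 */
    uintptr_t ofs = (addr >> (page_bits - CPU_TLB_ENTRY_BITS)) & mask;
    /* add table, %l0 -- %l0 now points at the CPUTLBEntry to compare */
    return table + ofs;
}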
2131#define MIN_TLB_MASK_TABLE_OFS INT_MIN 2132 2133/* 2134 * For softmmu, perform the TLB load and compare. 2135 * For useronly, perform any required alignment tests. 2136 * In both cases, return a TCGLabelQemuLdst structure if the slow path 2137 * is required and fill in @h with the host address for the fast path. 2138 */ 2139static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, 2140 TCGReg addrlo, TCGReg addrhi, 2141 MemOpIdx oi, bool is_ld) 2142{ 2143 TCGLabelQemuLdst *ldst = NULL; 2144 MemOp opc = get_memop(oi); 2145 MemOp s_bits = opc & MO_SIZE; 2146 unsigned a_mask; 2147 2148 if (tcg_use_softmmu) { 2149 h->index = TCG_REG_L0; 2150 h->ofs = 0; 2151 h->seg = 0; 2152 } else { 2153 *h = x86_guest_base; 2154 } 2155 h->base = addrlo; 2156 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); 2157 a_mask = (1 << h->aa.align) - 1; 2158 2159 if (tcg_use_softmmu) { 2160 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) 2161 : offsetof(CPUTLBEntry, addr_write); 2162 TCGType ttype = TCG_TYPE_I32; 2163 TCGType tlbtype = TCG_TYPE_I32; 2164 int trexw = 0, hrexw = 0, tlbrexw = 0; 2165 unsigned mem_index = get_mmuidx(oi); 2166 unsigned s_mask = (1 << s_bits) - 1; 2167 int fast_ofs = tlb_mask_table_ofs(s, mem_index); 2168 int tlb_mask; 2169 2170 ldst = new_ldst_label(s); 2171 ldst->is_ld = is_ld; 2172 ldst->oi = oi; 2173 ldst->addrlo_reg = addrlo; 2174 ldst->addrhi_reg = addrhi; 2175 2176 if (TCG_TARGET_REG_BITS == 64) { 2177 ttype = s->addr_type; 2178 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); 2179 if (TCG_TYPE_PTR == TCG_TYPE_I64) { 2180 hrexw = P_REXW; 2181 if (s->page_bits + s->tlb_dyn_max_bits > 32) { 2182 tlbtype = TCG_TYPE_I64; 2183 tlbrexw = P_REXW; 2184 } 2185 } 2186 } 2187 2188 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); 2189 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, 2190 s->page_bits - CPU_TLB_ENTRY_BITS); 2191 2192 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, 2193 fast_ofs + offsetof(CPUTLBDescFast, mask)); 2194 2195 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, 2196 fast_ofs + offsetof(CPUTLBDescFast, table)); 2197 2198 /* 2199 * If the required alignment is at least as large as the access, 2200 * simply copy the address and mask. For lesser alignments, 2201 * check that we don't cross pages for the complete access. 2202 */ 2203 if (a_mask >= s_mask) { 2204 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); 2205 } else { 2206 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, 2207 addrlo, s_mask - a_mask); 2208 } 2209 tlb_mask = s->page_mask | a_mask; 2210 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); 2211 2212 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ 2213 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, 2214 TCG_REG_L1, TCG_REG_L0, cmp_ofs); 2215 2216 /* jne slow_path */ 2217 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2218 ldst->label_ptr[0] = s->code_ptr; 2219 s->code_ptr += 4; 2220 2221 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { 2222 /* cmp 4(TCG_REG_L0), addrhi */ 2223 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, 2224 TCG_REG_L0, cmp_ofs + 4); 2225 2226 /* jne slow_path */ 2227 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); 2228 ldst->label_ptr[1] = s->code_ptr; 2229 s->code_ptr += 4; 2230 } 2231 2232 /* TLB Hit. 
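 * Load the entry's addend; the host address is then formed by the normal
 * fast-path addressing below as guest address (base) + addend (index),
 * e.g. roughly "movl (%addr,%l0), %data" for a 4-byte load, with register
 * names purely illustrative.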
*/ 2233 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, 2234 offsetof(CPUTLBEntry, addend)); 2235 } else if (a_mask) { 2236 int jcc; 2237 2238 ldst = new_ldst_label(s); 2239 ldst->is_ld = is_ld; 2240 ldst->oi = oi; 2241 ldst->addrlo_reg = addrlo; 2242 ldst->addrhi_reg = addrhi; 2243 2244 /* jne slow_path */ 2245 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false); 2246 tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0); 2247 ldst->label_ptr[0] = s->code_ptr; 2248 s->code_ptr += 4; 2249 } 2250 2251 return ldst; 2252} 2253 2254static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2255 HostAddress h, TCGType type, MemOp memop) 2256{ 2257 bool use_movbe = false; 2258 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW); 2259 int movop = OPC_MOVL_GvEv; 2260 2261 /* Do big-endian loads with movbe. */ 2262 if (memop & MO_BSWAP) { 2263 tcg_debug_assert(have_movbe); 2264 use_movbe = true; 2265 movop = OPC_MOVBE_GyMy; 2266 } 2267 2268 switch (memop & MO_SSIZE) { 2269 case MO_UB: 2270 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo, 2271 h.base, h.index, 0, h.ofs); 2272 break; 2273 case MO_SB: 2274 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo, 2275 h.base, h.index, 0, h.ofs); 2276 break; 2277 case MO_UW: 2278 if (use_movbe) { 2279 /* There is no extending movbe; only low 16-bits are modified. */ 2280 if (datalo != h.base && datalo != h.index) { 2281 /* XOR breaks dependency chains. */ 2282 tgen_arithr(s, ARITH_XOR, datalo, datalo); 2283 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2284 datalo, h.base, h.index, 0, h.ofs); 2285 } else { 2286 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2287 datalo, h.base, h.index, 0, h.ofs); 2288 tcg_out_ext16u(s, datalo, datalo); 2289 } 2290 } else { 2291 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo, 2292 h.base, h.index, 0, h.ofs); 2293 } 2294 break; 2295 case MO_SW: 2296 if (use_movbe) { 2297 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg, 2298 datalo, h.base, h.index, 0, h.ofs); 2299 tcg_out_ext16s(s, type, datalo, datalo); 2300 } else { 2301 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg, 2302 datalo, h.base, h.index, 0, h.ofs); 2303 } 2304 break; 2305 case MO_UL: 2306 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2307 h.base, h.index, 0, h.ofs); 2308 break; 2309#if TCG_TARGET_REG_BITS == 64 2310 case MO_SL: 2311 if (use_movbe) { 2312 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo, 2313 h.base, h.index, 0, h.ofs); 2314 tcg_out_ext32s(s, datalo, datalo); 2315 } else { 2316 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo, 2317 h.base, h.index, 0, h.ofs); 2318 } 2319 break; 2320#endif 2321 case MO_UQ: 2322 if (TCG_TARGET_REG_BITS == 64) { 2323 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2324 h.base, h.index, 0, h.ofs); 2325 break; 2326 } 2327 if (use_movbe) { 2328 TCGReg t = datalo; 2329 datalo = datahi; 2330 datahi = t; 2331 } 2332 if (h.base == datalo || h.index == datalo) { 2333 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi, 2334 h.base, h.index, 0, h.ofs); 2335 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0); 2336 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4); 2337 } else { 2338 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2339 h.base, h.index, 0, h.ofs); 2340 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2341 h.base, h.index, 0, h.ofs + 4); 2342 } 2343 break; 2344 2345 case MO_128: 2346 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2347 2348 /* 2349 * Without 16-byte 
atomicity, use integer regs. 2350 * That is where we want the data, and it allows bswaps. 2351 */ 2352 if (h.aa.atom < MO_128) { 2353 if (use_movbe) { 2354 TCGReg t = datalo; 2355 datalo = datahi; 2356 datahi = t; 2357 } 2358 if (h.base == datalo || h.index == datalo) { 2359 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, 2360 h.base, h.index, 0, h.ofs); 2361 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2362 datalo, datahi, 0); 2363 tcg_out_modrm_offset(s, movop + P_REXW + h.seg, 2364 datahi, datahi, 8); 2365 } else { 2366 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2367 h.base, h.index, 0, h.ofs); 2368 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2369 h.base, h.index, 0, h.ofs + 8); 2370 } 2371 break; 2372 } 2373 2374 /* 2375 * With 16-byte atomicity, a vector load is required. 2376 * If we already have 16-byte alignment, then VMOVDQA always works. 2377 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2378 * Else use we require a runtime test for alignment for VMOVDQA; 2379 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2380 */ 2381 if (h.aa.align >= MO_128) { 2382 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2383 TCG_TMP_VEC, 0, 2384 h.base, h.index, 0, h.ofs); 2385 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2386 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2387 TCG_TMP_VEC, 0, 2388 h.base, h.index, 0, h.ofs); 2389 } else { 2390 TCGLabel *l1 = gen_new_label(); 2391 TCGLabel *l2 = gen_new_label(); 2392 int jcc; 2393 2394 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2395 tcg_out_jxx(s, jcc, l1, true); 2396 2397 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, 2398 TCG_TMP_VEC, 0, 2399 h.base, h.index, 0, h.ofs); 2400 tcg_out_jxx(s, JCC_JMP, l2, true); 2401 2402 tcg_out_label(s, l1); 2403 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, 2404 TCG_TMP_VEC, 0, 2405 h.base, h.index, 0, h.ofs); 2406 tcg_out_label(s, l2); 2407 } 2408 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); 2409 break; 2410 2411 default: 2412 g_assert_not_reached(); 2413 } 2414} 2415 2416static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, 2417 TCGReg addrlo, TCGReg addrhi, 2418 MemOpIdx oi, TCGType data_type) 2419{ 2420 TCGLabelQemuLdst *ldst; 2421 HostAddress h; 2422 2423 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); 2424 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi)); 2425 2426 if (ldst) { 2427 ldst->type = data_type; 2428 ldst->datalo_reg = datalo; 2429 ldst->datahi_reg = datahi; 2430 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2431 } 2432} 2433 2434static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, 2435 HostAddress h, MemOp memop) 2436{ 2437 bool use_movbe = false; 2438 int movop = OPC_MOVL_EvGv; 2439 2440 /* 2441 * Do big-endian stores with movbe or system-mode. 2442 * User-only without movbe will have its swapping done generically. 2443 */ 2444 if (memop & MO_BSWAP) { 2445 tcg_debug_assert(have_movbe); 2446 use_movbe = true; 2447 movop = OPC_MOVBE_MyGy; 2448 } 2449 2450 switch (memop & MO_SIZE) { 2451 case MO_8: 2452 /* This is handled with constraints on INDEX_op_qemu_st8_i32. 
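 * For example (illustrative): on i386 only %eax..%ebx have byte-addressable
 * low halves, so the 's' constraint keeps datalo below 4; on x86_64 the
 * P_REXB_R prefix lets any register be used, e.g. "movb %sil,(%rdi)".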
*/ 2453 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4); 2454 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg, 2455 datalo, h.base, h.index, 0, h.ofs); 2456 break; 2457 case MO_16: 2458 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo, 2459 h.base, h.index, 0, h.ofs); 2460 break; 2461 case MO_32: 2462 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2463 h.base, h.index, 0, h.ofs); 2464 break; 2465 case MO_64: 2466 if (TCG_TARGET_REG_BITS == 64) { 2467 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2468 h.base, h.index, 0, h.ofs); 2469 } else { 2470 if (use_movbe) { 2471 TCGReg t = datalo; 2472 datalo = datahi; 2473 datahi = t; 2474 } 2475 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo, 2476 h.base, h.index, 0, h.ofs); 2477 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi, 2478 h.base, h.index, 0, h.ofs + 4); 2479 } 2480 break; 2481 2482 case MO_128: 2483 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2484 2485 /* 2486 * Without 16-byte atomicity, use integer regs. 2487 * That is where we have the data, and it allows bswaps. 2488 */ 2489 if (h.aa.atom < MO_128) { 2490 if (use_movbe) { 2491 TCGReg t = datalo; 2492 datalo = datahi; 2493 datahi = t; 2494 } 2495 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, 2496 h.base, h.index, 0, h.ofs); 2497 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, 2498 h.base, h.index, 0, h.ofs + 8); 2499 break; 2500 } 2501 2502 /* 2503 * With 16-byte atomicity, a vector store is required. 2504 * If we already have 16-byte alignment, then VMOVDQA always works. 2505 * Else if VMOVDQU has atomicity with dynamic alignment, use that. 2506 * Else use we require a runtime test for alignment for VMOVDQA; 2507 * use VMOVDQU on the unaligned nonatomic path for simplicity. 2508 */ 2509 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); 2510 if (h.aa.align >= MO_128) { 2511 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2512 TCG_TMP_VEC, 0, 2513 h.base, h.index, 0, h.ofs); 2514 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { 2515 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2516 TCG_TMP_VEC, 0, 2517 h.base, h.index, 0, h.ofs); 2518 } else { 2519 TCGLabel *l1 = gen_new_label(); 2520 TCGLabel *l2 = gen_new_label(); 2521 int jcc; 2522 2523 jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false); 2524 tcg_out_jxx(s, jcc, l1, true); 2525 2526 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, 2527 TCG_TMP_VEC, 0, 2528 h.base, h.index, 0, h.ofs); 2529 tcg_out_jxx(s, JCC_JMP, l2, true); 2530 2531 tcg_out_label(s, l1); 2532 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, 2533 TCG_TMP_VEC, 0, 2534 h.base, h.index, 0, h.ofs); 2535 tcg_out_label(s, l2); 2536 } 2537 break; 2538 2539 default: 2540 g_assert_not_reached(); 2541 } 2542} 2543 2544static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, 2545 TCGReg addrlo, TCGReg addrhi, 2546 MemOpIdx oi, TCGType data_type) 2547{ 2548 TCGLabelQemuLdst *ldst; 2549 HostAddress h; 2550 2551 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); 2552 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi)); 2553 2554 if (ldst) { 2555 ldst->type = data_type; 2556 ldst->datalo_reg = datalo; 2557 ldst->datahi_reg = datahi; 2558 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); 2559 } 2560} 2561 2562static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) 2563{ 2564 /* Reuse the zeroing that exists for goto_ptr. 
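 * Sketch of the two cases (illustrative):
 *     a0 == 0:  jmp tcg_code_gen_epilogue        ; EAX already zeroed there
 *     a0 != 0:  mov $a0,%eax (movabs if needed)  ; then jmp tb_ret_addr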
*/ 2565 if (a0 == 0) { 2566 tcg_out_jmp(s, tcg_code_gen_epilogue); 2567 } else { 2568 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0); 2569 tcg_out_jmp(s, tb_ret_addr); 2570 } 2571} 2572 2573static void tcg_out_goto_tb(TCGContext *s, int which) 2574{ 2575 /* 2576 * Jump displacement must be aligned for atomic patching; 2577 * see if we need to add extra nops before jump 2578 */ 2579 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr; 2580 if (gap != 1) { 2581 tcg_out_nopn(s, gap - 1); 2582 } 2583 tcg_out8(s, OPC_JMP_long); /* jmp im */ 2584 set_jmp_insn_offset(s, which); 2585 tcg_out32(s, 0); 2586 set_jmp_reset_offset(s, which); 2587} 2588 2589void tb_target_set_jmp_target(const TranslationBlock *tb, int n, 2590 uintptr_t jmp_rx, uintptr_t jmp_rw) 2591{ 2592 /* patch the branch destination */ 2593 uintptr_t addr = tb->jmp_target_addr[n]; 2594 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4)); 2595 /* no need to flush icache explicitly */ 2596} 2597 2598static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, 2599 const TCGArg args[TCG_MAX_OP_ARGS], 2600 const int const_args[TCG_MAX_OP_ARGS]) 2601{ 2602 TCGArg a0, a1, a2; 2603 int c, const_a2, vexop, rexw = 0; 2604 2605#if TCG_TARGET_REG_BITS == 64 2606# define OP_32_64(x) \ 2607 case glue(glue(INDEX_op_, x), _i64): \ 2608 rexw = P_REXW; /* FALLTHRU */ \ 2609 case glue(glue(INDEX_op_, x), _i32) 2610#else 2611# define OP_32_64(x) \ 2612 case glue(glue(INDEX_op_, x), _i32) 2613#endif 2614 2615 /* Hoist the loads of the most common arguments. */ 2616 a0 = args[0]; 2617 a1 = args[1]; 2618 a2 = args[2]; 2619 const_a2 = const_args[2]; 2620 2621 switch (opc) { 2622 case INDEX_op_goto_ptr: 2623 /* jmp to the given host address (could be epilogue) */ 2624 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0); 2625 break; 2626 case INDEX_op_br: 2627 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0); 2628 break; 2629 OP_32_64(ld8u): 2630 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2631 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2); 2632 break; 2633 OP_32_64(ld8s): 2634 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2); 2635 break; 2636 OP_32_64(ld16u): 2637 /* Note that we can ignore REXW for the zero-extend to 64-bit. */ 2638 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2); 2639 break; 2640 OP_32_64(ld16s): 2641 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2); 2642 break; 2643#if TCG_TARGET_REG_BITS == 64 2644 case INDEX_op_ld32u_i64: 2645#endif 2646 case INDEX_op_ld_i32: 2647 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2); 2648 break; 2649 2650 OP_32_64(st8): 2651 if (const_args[0]) { 2652 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2); 2653 tcg_out8(s, a0); 2654 } else { 2655 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2); 2656 } 2657 break; 2658 OP_32_64(st16): 2659 if (const_args[0]) { 2660 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2); 2661 tcg_out16(s, a0); 2662 } else { 2663 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2); 2664 } 2665 break; 2666#if TCG_TARGET_REG_BITS == 64 2667 case INDEX_op_st32_i64: 2668#endif 2669 case INDEX_op_st_i32: 2670 if (const_args[0]) { 2671 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2); 2672 tcg_out32(s, a0); 2673 } else { 2674 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2); 2675 } 2676 break; 2677 2678 OP_32_64(add): 2679 /* For 3-operand addition, use LEA. 
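 * e.g. (illustrative):
 *     add a0, a1, $imm  ->  lea imm(%a1), %a0
 *     add a0, a1, a2    ->  lea (%a1,%a2), %a0
 * falling back to a plain ADD when the destination aliases an input.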
*/ 2680 if (a0 != a1) { 2681 TCGArg c3 = 0; 2682 if (const_a2) { 2683 c3 = a2, a2 = -1; 2684 } else if (a0 == a2) { 2685 /* Watch out for dest = src + dest, since we've removed 2686 the matching constraint on the add. */ 2687 tgen_arithr(s, ARITH_ADD + rexw, a0, a1); 2688 break; 2689 } 2690 2691 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); 2692 break; 2693 } 2694 c = ARITH_ADD; 2695 goto gen_arith; 2696 OP_32_64(sub): 2697 c = ARITH_SUB; 2698 goto gen_arith; 2699 OP_32_64(and): 2700 c = ARITH_AND; 2701 goto gen_arith; 2702 OP_32_64(or): 2703 c = ARITH_OR; 2704 goto gen_arith; 2705 OP_32_64(xor): 2706 c = ARITH_XOR; 2707 goto gen_arith; 2708 gen_arith: 2709 if (const_a2) { 2710 tgen_arithi(s, c + rexw, a0, a2, 0); 2711 } else { 2712 tgen_arithr(s, c + rexw, a0, a2); 2713 } 2714 break; 2715 2716 OP_32_64(andc): 2717 if (const_a2) { 2718 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2719 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0); 2720 } else { 2721 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1); 2722 } 2723 break; 2724 2725 OP_32_64(mul): 2726 if (const_a2) { 2727 int32_t val; 2728 val = a2; 2729 if (val == (int8_t)val) { 2730 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0); 2731 tcg_out8(s, val); 2732 } else { 2733 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0); 2734 tcg_out32(s, val); 2735 } 2736 } else { 2737 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2); 2738 } 2739 break; 2740 2741 OP_32_64(div2): 2742 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); 2743 break; 2744 OP_32_64(divu2): 2745 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); 2746 break; 2747 2748 OP_32_64(shl): 2749 /* For small constant 3-operand shift, use LEA. */ 2750 if (const_a2 && a0 != a1 && (a2 - 1) < 3) { 2751 if (a2 - 1 == 0) { 2752 /* shl $1,a1,a0 -> lea (a1,a1),a0 */ 2753 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0); 2754 } else { 2755 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */ 2756 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0); 2757 } 2758 break; 2759 } 2760 c = SHIFT_SHL; 2761 vexop = OPC_SHLX; 2762 goto gen_shift_maybe_vex; 2763 OP_32_64(shr): 2764 c = SHIFT_SHR; 2765 vexop = OPC_SHRX; 2766 goto gen_shift_maybe_vex; 2767 OP_32_64(sar): 2768 c = SHIFT_SAR; 2769 vexop = OPC_SARX; 2770 goto gen_shift_maybe_vex; 2771 OP_32_64(rotl): 2772 c = SHIFT_ROL; 2773 goto gen_shift; 2774 OP_32_64(rotr): 2775 c = SHIFT_ROR; 2776 goto gen_shift; 2777 gen_shift_maybe_vex: 2778 if (have_bmi2) { 2779 if (!const_a2) { 2780 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1); 2781 break; 2782 } 2783 tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1); 2784 } 2785 /* FALLTHRU */ 2786 gen_shift: 2787 if (const_a2) { 2788 tcg_out_shifti(s, c + rexw, a0, a2); 2789 } else { 2790 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0); 2791 } 2792 break; 2793 2794 OP_32_64(ctz): 2795 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]); 2796 break; 2797 OP_32_64(clz): 2798 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]); 2799 break; 2800 OP_32_64(ctpop): 2801 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1); 2802 break; 2803 2804 OP_32_64(brcond): 2805 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1], 2806 arg_label(args[3]), 0); 2807 break; 2808 OP_32_64(setcond): 2809 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false); 2810 break; 2811 OP_32_64(negsetcond): 2812 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true); 2813 break; 2814 OP_32_64(movcond): 2815 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]); 2816 break; 2817 2818 OP_32_64(bswap16): 2819 if (a2 & TCG_BSWAP_OS) { 2820 /* Output must be sign-extended. */ 2821 if (rexw) { 2822 tcg_out_bswap64(s, a0); 2823 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48); 2824 } else { 2825 tcg_out_bswap32(s, a0); 2826 tcg_out_shifti(s, SHIFT_SAR, a0, 16); 2827 } 2828 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) { 2829 /* Output must be zero-extended, but input isn't. */ 2830 tcg_out_bswap32(s, a0); 2831 tcg_out_shifti(s, SHIFT_SHR, a0, 16); 2832 } else { 2833 tcg_out_rolw_8(s, a0); 2834 } 2835 break; 2836 OP_32_64(bswap32): 2837 tcg_out_bswap32(s, a0); 2838 if (rexw && (a2 & TCG_BSWAP_OS)) { 2839 tcg_out_ext32s(s, a0, a0); 2840 } 2841 break; 2842 2843 OP_32_64(neg): 2844 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0); 2845 break; 2846 OP_32_64(not): 2847 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); 2848 break; 2849 2850 case INDEX_op_qemu_ld_a64_i32: 2851 if (TCG_TARGET_REG_BITS == 32) { 2852 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2853 break; 2854 } 2855 /* fall through */ 2856 case INDEX_op_qemu_ld_a32_i32: 2857 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2858 break; 2859 case INDEX_op_qemu_ld_a32_i64: 2860 if (TCG_TARGET_REG_BITS == 64) { 2861 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2862 } else { 2863 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2864 } 2865 break; 2866 case INDEX_op_qemu_ld_a64_i64: 2867 if (TCG_TARGET_REG_BITS == 64) { 2868 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2869 } else { 2870 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2871 } 2872 break; 2873 case INDEX_op_qemu_ld_a32_i128: 2874 case INDEX_op_qemu_ld_a64_i128: 2875 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2876 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2877 break; 2878 2879 case INDEX_op_qemu_st_a64_i32: 2880 case INDEX_op_qemu_st8_a64_i32: 2881 if (TCG_TARGET_REG_BITS == 32) { 2882 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); 2883 break; 2884 } 2885 /* fall through */ 2886 case INDEX_op_qemu_st_a32_i32: 2887 case INDEX_op_qemu_st8_a32_i32: 2888 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); 2889 break; 2890 case INDEX_op_qemu_st_a32_i64: 2891 if (TCG_TARGET_REG_BITS == 64) { 2892 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2893 } else { 2894 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); 2895 } 2896 break; 2897 case INDEX_op_qemu_st_a64_i64: 2898 if (TCG_TARGET_REG_BITS == 64) { 2899 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); 2900 } else { 2901 
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); 2902 } 2903 break; 2904 case INDEX_op_qemu_st_a32_i128: 2905 case INDEX_op_qemu_st_a64_i128: 2906 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 2907 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); 2908 break; 2909 2910 OP_32_64(mulu2): 2911 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); 2912 break; 2913 OP_32_64(muls2): 2914 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]); 2915 break; 2916 OP_32_64(add2): 2917 if (const_args[4]) { 2918 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1); 2919 } else { 2920 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]); 2921 } 2922 if (const_args[5]) { 2923 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1); 2924 } else { 2925 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]); 2926 } 2927 break; 2928 OP_32_64(sub2): 2929 if (const_args[4]) { 2930 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1); 2931 } else { 2932 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]); 2933 } 2934 if (const_args[5]) { 2935 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1); 2936 } else { 2937 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]); 2938 } 2939 break; 2940 2941#if TCG_TARGET_REG_BITS == 32 2942 case INDEX_op_brcond2_i32: 2943 tcg_out_brcond2(s, args, const_args, 0); 2944 break; 2945 case INDEX_op_setcond2_i32: 2946 tcg_out_setcond2(s, args, const_args); 2947 break; 2948#else /* TCG_TARGET_REG_BITS == 64 */ 2949 case INDEX_op_ld32s_i64: 2950 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2); 2951 break; 2952 case INDEX_op_ld_i64: 2953 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2); 2954 break; 2955 case INDEX_op_st_i64: 2956 if (const_args[0]) { 2957 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2); 2958 tcg_out32(s, a0); 2959 } else { 2960 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2); 2961 } 2962 break; 2963 2964 case INDEX_op_bswap64_i64: 2965 tcg_out_bswap64(s, a0); 2966 break; 2967 case INDEX_op_extrh_i64_i32: 2968 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32); 2969 break; 2970#endif 2971 2972 OP_32_64(deposit): 2973 if (args[3] == 0 && args[4] == 8) { 2974 /* load bits 0..7 */ 2975 if (const_a2) { 2976 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0), 2977 0, a0, 0); 2978 tcg_out8(s, a2); 2979 } else { 2980 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0); 2981 } 2982 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) { 2983 /* load bits 8..15 */ 2984 if (const_a2) { 2985 tcg_out8(s, OPC_MOVB_Ib + a0 + 4); 2986 tcg_out8(s, a2); 2987 } else { 2988 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4); 2989 } 2990 } else if (args[3] == 0 && args[4] == 16) { 2991 /* load bits 0..15 */ 2992 if (const_a2) { 2993 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0), 2994 0, a0, 0); 2995 tcg_out16(s, a2); 2996 } else { 2997 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0); 2998 } 2999 } else { 3000 g_assert_not_reached(); 3001 } 3002 break; 3003 3004 case INDEX_op_extract_i64: 3005 if (a2 + args[3] == 32) { 3006 /* This is a 32-bit zero-extending right shift. */ 3007 tcg_out_mov(s, TCG_TYPE_I32, a0, a1); 3008 tcg_out_shifti(s, SHIFT_SHR, a0, a2); 3009 break; 3010 } 3011 /* FALLTHRU */ 3012 case INDEX_op_extract_i32: 3013 /* On the off-chance that we can use the high-byte registers. 3014 Otherwise we emit the same ext16 + shift pattern that we 3015 would have gotten from the normal tcg-op.c expansion. 
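 * e.g. (illustrative) for extract dest, src, 8, 8:
 *     src in %eax..%ebx, no REX needed:  movzbl %ah..%bh, %dest
 *     otherwise:                         movzwl %src_w, %dest ; shr $8, %dest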
*/ 3016 tcg_debug_assert(a2 == 8 && args[3] == 8); 3017 if (a1 < 4 && a0 < 8) { 3018 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4); 3019 } else { 3020 tcg_out_ext16u(s, a0, a1); 3021 tcg_out_shifti(s, SHIFT_SHR, a0, 8); 3022 } 3023 break; 3024 3025 case INDEX_op_sextract_i32: 3026 /* We don't implement sextract_i64, as we cannot sign-extend to 3027 64-bits without using the REX prefix that explicitly excludes 3028 access to the high-byte registers. */ 3029 tcg_debug_assert(a2 == 8 && args[3] == 8); 3030 if (a1 < 4 && a0 < 8) { 3031 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4); 3032 } else { 3033 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1); 3034 tcg_out_shifti(s, SHIFT_SAR, a0, 8); 3035 } 3036 break; 3037 3038 OP_32_64(extract2): 3039 /* Note that SHRD outputs to the r/m operand. */ 3040 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0); 3041 tcg_out8(s, args[3]); 3042 break; 3043 3044 case INDEX_op_mb: 3045 tcg_out_mb(s, a0); 3046 break; 3047 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ 3048 case INDEX_op_mov_i64: 3049 case INDEX_op_call: /* Always emitted via tcg_out_call. */ 3050 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */ 3051 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */ 3052 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */ 3053 case INDEX_op_ext8s_i64: 3054 case INDEX_op_ext8u_i32: 3055 case INDEX_op_ext8u_i64: 3056 case INDEX_op_ext16s_i32: 3057 case INDEX_op_ext16s_i64: 3058 case INDEX_op_ext16u_i32: 3059 case INDEX_op_ext16u_i64: 3060 case INDEX_op_ext32s_i64: 3061 case INDEX_op_ext32u_i64: 3062 case INDEX_op_ext_i32_i64: 3063 case INDEX_op_extu_i32_i64: 3064 case INDEX_op_extrl_i64_i32: 3065 default: 3066 g_assert_not_reached(); 3067 } 3068 3069#undef OP_32_64 3070} 3071 3072static int const umin_insn[4] = { 3073 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ 3074}; 3075 3076static int const umax_insn[4] = { 3077 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ 3078}; 3079 3080static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece, 3081 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3082{ 3083 static int const cmpeq_insn[4] = { 3084 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ 3085 }; 3086 static int const cmpgt_insn[4] = { 3087 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ 3088 }; 3089 3090 enum { 3091 NEED_INV = 1, 3092 NEED_SWAP = 2, 3093 NEED_UMIN = 4, 3094 NEED_UMAX = 8, 3095 INVALID = 16, 3096 }; 3097 static const uint8_t cond_fixup[16] = { 3098 [0 ... 15] = INVALID, 3099 [TCG_COND_EQ] = 0, 3100 [TCG_COND_GT] = 0, 3101 [TCG_COND_NE] = NEED_INV, 3102 [TCG_COND_LE] = NEED_INV, 3103 [TCG_COND_LT] = NEED_SWAP, 3104 [TCG_COND_GE] = NEED_SWAP | NEED_INV, 3105 [TCG_COND_LEU] = NEED_UMIN, 3106 [TCG_COND_GTU] = NEED_UMIN | NEED_INV, 3107 [TCG_COND_GEU] = NEED_UMAX, 3108 [TCG_COND_LTU] = NEED_UMAX | NEED_INV, 3109 }; 3110 int fixup = cond_fixup[cond]; 3111 3112 assert(!(fixup & INVALID)); 3113 3114 if (fixup & NEED_INV) { 3115 cond = tcg_invert_cond(cond); 3116 } 3117 3118 if (fixup & NEED_SWAP) { 3119 TCGReg swap = v1; 3120 v1 = v2; 3121 v2 = swap; 3122 cond = tcg_swap_cond(cond); 3123 } 3124 3125 if (fixup & (NEED_UMIN | NEED_UMAX)) { 3126 int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]); 3127 3128 /* avx2 does not have 64-bit min/max; adjusted during expand. 
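 * The reduction to an equality test is (illustrative):
 *     x <=u y  <=>  umin(x, y) == x        (NEED_UMIN)
 *     x >=u y  <=>  umax(x, y) == x        (NEED_UMAX)
 * with NEED_INV then negating the result to recover GTU / LTU.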
*/ 3129 assert(vece <= MO_32); 3130 3131 tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type); 3132 v2 = TCG_TMP_VEC; 3133 cond = TCG_COND_EQ; 3134 } 3135 3136 switch (cond) { 3137 case TCG_COND_EQ: 3138 tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type); 3139 break; 3140 case TCG_COND_GT: 3141 tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type); 3142 break; 3143 default: 3144 g_assert_not_reached(); 3145 } 3146 return fixup & NEED_INV; 3147} 3148 3149static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3150 TCGReg v1, TCGReg v2, TCGCond cond) 3151{ 3152 static const int cmpm_insn[2][4] = { 3153 { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ }, 3154 { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ } 3155 }; 3156 static const int testm_insn[4] = { 3157 OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ 3158 }; 3159 static const int testnm_insn[4] = { 3160 OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ 3161 }; 3162 3163 static const int cond_ext[16] = { 3164 [TCG_COND_EQ] = 0, 3165 [TCG_COND_NE] = 4, 3166 [TCG_COND_LT] = 1, 3167 [TCG_COND_LTU] = 1, 3168 [TCG_COND_LE] = 2, 3169 [TCG_COND_LEU] = 2, 3170 [TCG_COND_NEVER] = 3, 3171 [TCG_COND_GE] = 5, 3172 [TCG_COND_GEU] = 5, 3173 [TCG_COND_GT] = 6, 3174 [TCG_COND_GTU] = 6, 3175 [TCG_COND_ALWAYS] = 7, 3176 }; 3177 3178 switch (cond) { 3179 case TCG_COND_TSTNE: 3180 tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type); 3181 break; 3182 case TCG_COND_TSTEQ: 3183 tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type); 3184 break; 3185 default: 3186 tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece], 3187 /* k1 */ 1, v1, v2, type); 3188 tcg_out8(s, cond_ext[cond]); 3189 break; 3190 } 3191} 3192 3193static void tcg_out_k1_to_vec(TCGContext *s, TCGType type, 3194 unsigned vece, TCGReg dest) 3195{ 3196 static const int movm_insn[] = { 3197 OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q 3198 }; 3199 tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type); 3200} 3201 3202static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, 3203 TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond) 3204{ 3205 /* 3206 * With avx512, we have a complete set of comparisons into mask. 3207 * Unless there's a single insn expansion for the comparision, 3208 * expand via a mask in k1. 3209 */ 3210 if ((vece <= MO_16 ? have_avx512bw : have_avx512dq) 3211 && cond != TCG_COND_EQ 3212 && cond != TCG_COND_LT 3213 && cond != TCG_COND_GT) { 3214 tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond); 3215 tcg_out_k1_to_vec(s, type, vece, v0); 3216 return; 3217 } 3218 3219 if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) { 3220 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1); 3221 tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type); 3222 } 3223} 3224 3225static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, 3226 TCGReg v0, TCGReg c1, TCGReg c2, 3227 TCGReg v3, TCGReg v4, TCGCond cond) 3228{ 3229 static const int vpblendm_insn[] = { 3230 OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ 3231 }; 3232 bool z = false; 3233 3234 /* Swap to place constant in V4 to take advantage of zero-masking. 
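 * i.e. (illustrative) "cmpsel v0, c1, c2, 0, v4" becomes
 *     k1 = !cond(c1, c2);  v0 = k1 ? v4 : 0
 * via VPBLENDM with EVEX.z set, so no explicit zero vector is needed.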
*/ 3235 if (!v3) { 3236 z = true; 3237 v3 = v4; 3238 cond = tcg_invert_cond(cond); 3239 } 3240 3241 tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); 3242 tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, 3243 /* k1 */1, z, type); 3244} 3245 3246static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, 3247 TCGReg v0, TCGReg c1, TCGReg c2, 3248 TCGReg v3, TCGReg v4, TCGCond cond) 3249{ 3250 bool inv; 3251 3252 if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { 3253 tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); 3254 return; 3255 } 3256 3257 inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); 3258 3259 /* 3260 * Since XMM0 is 16, the only way we get 0 into V3 3261 * is via the constant zero constraint. 3262 */ 3263 if (!v3) { 3264 if (inv) { 3265 tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type); 3266 } else { 3267 tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type); 3268 } 3269 } else { 3270 if (inv) { 3271 TCGReg swap = v3; 3272 v3 = v4; 3273 v4 = swap; 3274 } 3275 tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type); 3276 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4); 3277 } 3278} 3279 3280static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, 3281 unsigned vecl, unsigned vece, 3282 const TCGArg args[TCG_MAX_OP_ARGS], 3283 const int const_args[TCG_MAX_OP_ARGS]) 3284{ 3285 static int const add_insn[4] = { 3286 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ 3287 }; 3288 static int const ssadd_insn[4] = { 3289 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 3290 }; 3291 static int const usadd_insn[4] = { 3292 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2 3293 }; 3294 static int const sub_insn[4] = { 3295 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ 3296 }; 3297 static int const sssub_insn[4] = { 3298 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 3299 }; 3300 static int const ussub_insn[4] = { 3301 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 3302 }; 3303 static int const mul_insn[4] = { 3304 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ 3305 }; 3306 static int const shift_imm_insn[4] = { 3307 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib 3308 }; 3309 static int const punpckl_insn[4] = { 3310 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ 3311 }; 3312 static int const punpckh_insn[4] = { 3313 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ 3314 }; 3315 static int const packss_insn[4] = { 3316 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 3317 }; 3318 static int const packus_insn[4] = { 3319 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 3320 }; 3321 static int const smin_insn[4] = { 3322 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ 3323 }; 3324 static int const smax_insn[4] = { 3325 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ 3326 }; 3327 static int const rotlv_insn[4] = { 3328 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ 3329 }; 3330 static int const rotrv_insn[4] = { 3331 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ 3332 }; 3333 static int const shlv_insn[4] = { 3334 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ 3335 }; 3336 static int const shrv_insn[4] = { 3337 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ 3338 }; 3339 static int const sarv_insn[4] = { 3340 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ 3341 }; 3342 static int const shls_insn[4] = { 3343 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ 3344 }; 3345 static int const shrs_insn[4] = { 3346 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ 3347 }; 3348 static int const sars_insn[4] = { 3349 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ 3350 
}; 3351 static int const vpshldi_insn[4] = { 3352 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ 3353 }; 3354 static int const vpshldv_insn[4] = { 3355 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ 3356 }; 3357 static int const vpshrdv_insn[4] = { 3358 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ 3359 }; 3360 static int const abs_insn[4] = { 3361 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ 3362 }; 3363 3364 TCGType type = vecl + TCG_TYPE_V64; 3365 int insn, sub; 3366 TCGArg a0, a1, a2, a3; 3367 3368 a0 = args[0]; 3369 a1 = args[1]; 3370 a2 = args[2]; 3371 3372 switch (opc) { 3373 case INDEX_op_add_vec: 3374 insn = add_insn[vece]; 3375 goto gen_simd; 3376 case INDEX_op_ssadd_vec: 3377 insn = ssadd_insn[vece]; 3378 goto gen_simd; 3379 case INDEX_op_usadd_vec: 3380 insn = usadd_insn[vece]; 3381 goto gen_simd; 3382 case INDEX_op_sub_vec: 3383 insn = sub_insn[vece]; 3384 goto gen_simd; 3385 case INDEX_op_sssub_vec: 3386 insn = sssub_insn[vece]; 3387 goto gen_simd; 3388 case INDEX_op_ussub_vec: 3389 insn = ussub_insn[vece]; 3390 goto gen_simd; 3391 case INDEX_op_mul_vec: 3392 insn = mul_insn[vece]; 3393 goto gen_simd; 3394 case INDEX_op_and_vec: 3395 insn = OPC_PAND; 3396 goto gen_simd; 3397 case INDEX_op_or_vec: 3398 insn = OPC_POR; 3399 goto gen_simd; 3400 case INDEX_op_xor_vec: 3401 insn = OPC_PXOR; 3402 goto gen_simd; 3403 case INDEX_op_smin_vec: 3404 insn = smin_insn[vece]; 3405 goto gen_simd; 3406 case INDEX_op_umin_vec: 3407 insn = umin_insn[vece]; 3408 goto gen_simd; 3409 case INDEX_op_smax_vec: 3410 insn = smax_insn[vece]; 3411 goto gen_simd; 3412 case INDEX_op_umax_vec: 3413 insn = umax_insn[vece]; 3414 goto gen_simd; 3415 case INDEX_op_shlv_vec: 3416 insn = shlv_insn[vece]; 3417 goto gen_simd; 3418 case INDEX_op_shrv_vec: 3419 insn = shrv_insn[vece]; 3420 goto gen_simd; 3421 case INDEX_op_sarv_vec: 3422 insn = sarv_insn[vece]; 3423 goto gen_simd; 3424 case INDEX_op_rotlv_vec: 3425 insn = rotlv_insn[vece]; 3426 goto gen_simd; 3427 case INDEX_op_rotrv_vec: 3428 insn = rotrv_insn[vece]; 3429 goto gen_simd; 3430 case INDEX_op_shls_vec: 3431 insn = shls_insn[vece]; 3432 goto gen_simd; 3433 case INDEX_op_shrs_vec: 3434 insn = shrs_insn[vece]; 3435 goto gen_simd; 3436 case INDEX_op_sars_vec: 3437 insn = sars_insn[vece]; 3438 goto gen_simd; 3439 case INDEX_op_x86_punpckl_vec: 3440 insn = punpckl_insn[vece]; 3441 goto gen_simd; 3442 case INDEX_op_x86_punpckh_vec: 3443 insn = punpckh_insn[vece]; 3444 goto gen_simd; 3445 case INDEX_op_x86_packss_vec: 3446 insn = packss_insn[vece]; 3447 goto gen_simd; 3448 case INDEX_op_x86_packus_vec: 3449 insn = packus_insn[vece]; 3450 goto gen_simd; 3451 case INDEX_op_x86_vpshldv_vec: 3452 insn = vpshldv_insn[vece]; 3453 a1 = a2; 3454 a2 = args[3]; 3455 goto gen_simd; 3456 case INDEX_op_x86_vpshrdv_vec: 3457 insn = vpshrdv_insn[vece]; 3458 a1 = a2; 3459 a2 = args[3]; 3460 goto gen_simd; 3461#if TCG_TARGET_REG_BITS == 32 3462 case INDEX_op_dup2_vec: 3463 /* First merge the two 32-bit inputs to a single 64-bit element. */ 3464 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2); 3465 /* Then replicate the 64-bit elements across the rest of the vector. 
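 * e.g. (illustrative) for a V128 result built from 32-bit parts {lo, hi}:
 *     vpunpckldq  -> lane0 = {lo, hi}
 *     dup at MO_64 -> {lo, hi, lo, hi}   (vpbroadcastq, or punpcklqdq)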
*/ 3466 if (type != TCG_TYPE_V64) { 3467 tcg_out_dup_vec(s, type, MO_64, a0, a0); 3468 } 3469 break; 3470#endif 3471 case INDEX_op_abs_vec: 3472 insn = abs_insn[vece]; 3473 a2 = a1; 3474 a1 = 0; 3475 goto gen_simd; 3476 gen_simd: 3477 tcg_debug_assert(insn != OPC_UD2); 3478 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3479 break; 3480 3481 case INDEX_op_cmp_vec: 3482 tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]); 3483 break; 3484 3485 case INDEX_op_cmpsel_vec: 3486 tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2, 3487 args[3], args[4], args[5]); 3488 break; 3489 3490 case INDEX_op_andc_vec: 3491 insn = OPC_PANDN; 3492 tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type); 3493 break; 3494 3495 case INDEX_op_shli_vec: 3496 insn = shift_imm_insn[vece]; 3497 sub = 6; 3498 goto gen_shift; 3499 case INDEX_op_shri_vec: 3500 insn = shift_imm_insn[vece]; 3501 sub = 2; 3502 goto gen_shift; 3503 case INDEX_op_sari_vec: 3504 if (vece == MO_64) { 3505 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; 3506 } else { 3507 insn = shift_imm_insn[vece]; 3508 } 3509 sub = 4; 3510 goto gen_shift; 3511 case INDEX_op_rotli_vec: 3512 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ 3513 if (vece == MO_64) { 3514 insn |= P_VEXW; 3515 } 3516 sub = 1; 3517 goto gen_shift; 3518 gen_shift: 3519 tcg_debug_assert(vece != MO_8); 3520 tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type); 3521 tcg_out8(s, a2); 3522 break; 3523 3524 case INDEX_op_ld_vec: 3525 tcg_out_ld(s, type, a0, a1, a2); 3526 break; 3527 case INDEX_op_st_vec: 3528 tcg_out_st(s, type, a0, a1, a2); 3529 break; 3530 case INDEX_op_dupm_vec: 3531 tcg_out_dupm_vec(s, type, vece, a0, a1, a2); 3532 break; 3533 3534 case INDEX_op_x86_shufps_vec: 3535 insn = OPC_SHUFPS; 3536 sub = args[3]; 3537 goto gen_simd_imm8; 3538 case INDEX_op_x86_blend_vec: 3539 if (vece == MO_16) { 3540 insn = OPC_PBLENDW; 3541 } else if (vece == MO_32) { 3542 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); 3543 } else { 3544 g_assert_not_reached(); 3545 } 3546 sub = args[3]; 3547 goto gen_simd_imm8; 3548 case INDEX_op_x86_vperm2i128_vec: 3549 insn = OPC_VPERM2I128; 3550 sub = args[3]; 3551 goto gen_simd_imm8; 3552 case INDEX_op_x86_vpshldi_vec: 3553 insn = vpshldi_insn[vece]; 3554 sub = args[3]; 3555 goto gen_simd_imm8; 3556 3557 case INDEX_op_not_vec: 3558 insn = OPC_VPTERNLOGQ; 3559 a2 = a1; 3560 sub = 0x33; /* !B */ 3561 goto gen_simd_imm8; 3562 case INDEX_op_nor_vec: 3563 insn = OPC_VPTERNLOGQ; 3564 sub = 0x11; /* norCB */ 3565 goto gen_simd_imm8; 3566 case INDEX_op_nand_vec: 3567 insn = OPC_VPTERNLOGQ; 3568 sub = 0x77; /* nandCB */ 3569 goto gen_simd_imm8; 3570 case INDEX_op_eqv_vec: 3571 insn = OPC_VPTERNLOGQ; 3572 sub = 0x99; /* xnorCB */ 3573 goto gen_simd_imm8; 3574 case INDEX_op_orc_vec: 3575 insn = OPC_VPTERNLOGQ; 3576 sub = 0xdd; /* orB!C */ 3577 goto gen_simd_imm8; 3578 3579 case INDEX_op_bitsel_vec: 3580 insn = OPC_VPTERNLOGQ; 3581 a3 = args[3]; 3582 if (a0 == a1) { 3583 a1 = a2; 3584 a2 = a3; 3585 sub = 0xca; /* A?B:C */ 3586 } else if (a0 == a2) { 3587 a2 = a3; 3588 sub = 0xe2; /* B?A:C */ 3589 } else { 3590 tcg_out_mov(s, type, a0, a3); 3591 sub = 0xb8; /* B?C:A */ 3592 } 3593 goto gen_simd_imm8; 3594 3595 gen_simd_imm8: 3596 tcg_debug_assert(insn != OPC_UD2); 3597 tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type); 3598 tcg_out8(s, sub); 3599 break; 3600 3601 case INDEX_op_x86_psrldq_vec: 3602 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); 3603 tcg_out8(s, a2); 3604 break; 3605 3606 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. 
*/ 3607 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ 3608 default: 3609 g_assert_not_reached(); 3610 } 3611} 3612 3613static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) 3614{ 3615 switch (op) { 3616 case INDEX_op_goto_ptr: 3617 return C_O0_I1(r); 3618 3619 case INDEX_op_ld8u_i32: 3620 case INDEX_op_ld8u_i64: 3621 case INDEX_op_ld8s_i32: 3622 case INDEX_op_ld8s_i64: 3623 case INDEX_op_ld16u_i32: 3624 case INDEX_op_ld16u_i64: 3625 case INDEX_op_ld16s_i32: 3626 case INDEX_op_ld16s_i64: 3627 case INDEX_op_ld_i32: 3628 case INDEX_op_ld32u_i64: 3629 case INDEX_op_ld32s_i64: 3630 case INDEX_op_ld_i64: 3631 return C_O1_I1(r, r); 3632 3633 case INDEX_op_st8_i32: 3634 case INDEX_op_st8_i64: 3635 return C_O0_I2(qi, r); 3636 3637 case INDEX_op_st16_i32: 3638 case INDEX_op_st16_i64: 3639 case INDEX_op_st_i32: 3640 case INDEX_op_st32_i64: 3641 return C_O0_I2(ri, r); 3642 3643 case INDEX_op_st_i64: 3644 return C_O0_I2(re, r); 3645 3646 case INDEX_op_add_i32: 3647 case INDEX_op_add_i64: 3648 return C_O1_I2(r, r, re); 3649 3650 case INDEX_op_sub_i32: 3651 case INDEX_op_sub_i64: 3652 case INDEX_op_mul_i32: 3653 case INDEX_op_mul_i64: 3654 case INDEX_op_or_i32: 3655 case INDEX_op_or_i64: 3656 case INDEX_op_xor_i32: 3657 case INDEX_op_xor_i64: 3658 return C_O1_I2(r, 0, re); 3659 3660 case INDEX_op_and_i32: 3661 case INDEX_op_and_i64: 3662 return C_O1_I2(r, 0, reZ); 3663 3664 case INDEX_op_andc_i32: 3665 case INDEX_op_andc_i64: 3666 return C_O1_I2(r, r, rI); 3667 3668 case INDEX_op_shl_i32: 3669 case INDEX_op_shl_i64: 3670 case INDEX_op_shr_i32: 3671 case INDEX_op_shr_i64: 3672 case INDEX_op_sar_i32: 3673 case INDEX_op_sar_i64: 3674 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci); 3675 3676 case INDEX_op_rotl_i32: 3677 case INDEX_op_rotl_i64: 3678 case INDEX_op_rotr_i32: 3679 case INDEX_op_rotr_i64: 3680 return C_O1_I2(r, 0, ci); 3681 3682 case INDEX_op_brcond_i32: 3683 case INDEX_op_brcond_i64: 3684 return C_O0_I2(r, reT); 3685 3686 case INDEX_op_bswap16_i32: 3687 case INDEX_op_bswap16_i64: 3688 case INDEX_op_bswap32_i32: 3689 case INDEX_op_bswap32_i64: 3690 case INDEX_op_bswap64_i64: 3691 case INDEX_op_neg_i32: 3692 case INDEX_op_neg_i64: 3693 case INDEX_op_not_i32: 3694 case INDEX_op_not_i64: 3695 case INDEX_op_extrh_i64_i32: 3696 return C_O1_I1(r, 0); 3697 3698 case INDEX_op_ext8s_i32: 3699 case INDEX_op_ext8s_i64: 3700 case INDEX_op_ext8u_i32: 3701 case INDEX_op_ext8u_i64: 3702 return C_O1_I1(r, q); 3703 3704 case INDEX_op_ext16s_i32: 3705 case INDEX_op_ext16s_i64: 3706 case INDEX_op_ext16u_i32: 3707 case INDEX_op_ext16u_i64: 3708 case INDEX_op_ext32s_i64: 3709 case INDEX_op_ext32u_i64: 3710 case INDEX_op_ext_i32_i64: 3711 case INDEX_op_extu_i32_i64: 3712 case INDEX_op_extrl_i64_i32: 3713 case INDEX_op_extract_i32: 3714 case INDEX_op_extract_i64: 3715 case INDEX_op_sextract_i32: 3716 case INDEX_op_ctpop_i32: 3717 case INDEX_op_ctpop_i64: 3718 return C_O1_I1(r, r); 3719 3720 case INDEX_op_extract2_i32: 3721 case INDEX_op_extract2_i64: 3722 return C_O1_I2(r, 0, r); 3723 3724 case INDEX_op_deposit_i32: 3725 case INDEX_op_deposit_i64: 3726 return C_O1_I2(q, 0, qi); 3727 3728 case INDEX_op_setcond_i32: 3729 case INDEX_op_setcond_i64: 3730 case INDEX_op_negsetcond_i32: 3731 case INDEX_op_negsetcond_i64: 3732 return C_O1_I2(q, r, reT); 3733 3734 case INDEX_op_movcond_i32: 3735 case INDEX_op_movcond_i64: 3736 return C_O1_I4(r, r, reT, r, 0); 3737 3738 case INDEX_op_div2_i32: 3739 case INDEX_op_div2_i64: 3740 case INDEX_op_divu2_i32: 3741 case INDEX_op_divu2_i64: 3742 return 
C_O2_I3(a, d, 0, 1, r); 3743 3744 case INDEX_op_mulu2_i32: 3745 case INDEX_op_mulu2_i64: 3746 case INDEX_op_muls2_i32: 3747 case INDEX_op_muls2_i64: 3748 return C_O2_I2(a, d, a, r); 3749 3750 case INDEX_op_add2_i32: 3751 case INDEX_op_add2_i64: 3752 case INDEX_op_sub2_i32: 3753 case INDEX_op_sub2_i64: 3754 return C_N1_O1_I4(r, r, 0, 1, re, re); 3755 3756 case INDEX_op_ctz_i32: 3757 case INDEX_op_ctz_i64: 3758 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3759 3760 case INDEX_op_clz_i32: 3761 case INDEX_op_clz_i64: 3762 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); 3763 3764 case INDEX_op_qemu_ld_a32_i32: 3765 return C_O1_I1(r, L); 3766 case INDEX_op_qemu_ld_a64_i32: 3767 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L); 3768 3769 case INDEX_op_qemu_st_a32_i32: 3770 return C_O0_I2(L, L); 3771 case INDEX_op_qemu_st_a64_i32: 3772 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3773 case INDEX_op_qemu_st8_a32_i32: 3774 return C_O0_I2(s, L); 3775 case INDEX_op_qemu_st8_a64_i32: 3776 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); 3777 3778 case INDEX_op_qemu_ld_a32_i64: 3779 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); 3780 case INDEX_op_qemu_ld_a64_i64: 3781 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); 3782 3783 case INDEX_op_qemu_st_a32_i64: 3784 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); 3785 case INDEX_op_qemu_st_a64_i64: 3786 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); 3787 3788 case INDEX_op_qemu_ld_a32_i128: 3789 case INDEX_op_qemu_ld_a64_i128: 3790 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3791 return C_O2_I1(r, r, L); 3792 case INDEX_op_qemu_st_a32_i128: 3793 case INDEX_op_qemu_st_a64_i128: 3794 tcg_debug_assert(TCG_TARGET_REG_BITS == 64); 3795 return C_O0_I3(L, L, L); 3796 3797 case INDEX_op_brcond2_i32: 3798 return C_O0_I4(r, r, ri, ri); 3799 3800 case INDEX_op_setcond2_i32: 3801 return C_O1_I4(r, r, r, ri, ri); 3802 3803 case INDEX_op_ld_vec: 3804 case INDEX_op_dupm_vec: 3805 return C_O1_I1(x, r); 3806 3807 case INDEX_op_st_vec: 3808 return C_O0_I2(x, r); 3809 3810 case INDEX_op_add_vec: 3811 case INDEX_op_sub_vec: 3812 case INDEX_op_mul_vec: 3813 case INDEX_op_and_vec: 3814 case INDEX_op_or_vec: 3815 case INDEX_op_xor_vec: 3816 case INDEX_op_andc_vec: 3817 case INDEX_op_orc_vec: 3818 case INDEX_op_nand_vec: 3819 case INDEX_op_nor_vec: 3820 case INDEX_op_eqv_vec: 3821 case INDEX_op_ssadd_vec: 3822 case INDEX_op_usadd_vec: 3823 case INDEX_op_sssub_vec: 3824 case INDEX_op_ussub_vec: 3825 case INDEX_op_smin_vec: 3826 case INDEX_op_umin_vec: 3827 case INDEX_op_smax_vec: 3828 case INDEX_op_umax_vec: 3829 case INDEX_op_shlv_vec: 3830 case INDEX_op_shrv_vec: 3831 case INDEX_op_sarv_vec: 3832 case INDEX_op_rotlv_vec: 3833 case INDEX_op_rotrv_vec: 3834 case INDEX_op_shls_vec: 3835 case INDEX_op_shrs_vec: 3836 case INDEX_op_sars_vec: 3837 case INDEX_op_cmp_vec: 3838 case INDEX_op_x86_shufps_vec: 3839 case INDEX_op_x86_blend_vec: 3840 case INDEX_op_x86_packss_vec: 3841 case INDEX_op_x86_packus_vec: 3842 case INDEX_op_x86_vperm2i128_vec: 3843 case INDEX_op_x86_punpckl_vec: 3844 case INDEX_op_x86_punpckh_vec: 3845 case INDEX_op_x86_vpshldi_vec: 3846#if TCG_TARGET_REG_BITS == 32 3847 case INDEX_op_dup2_vec: 3848#endif 3849 return C_O1_I2(x, x, x); 3850 3851 case INDEX_op_abs_vec: 3852 case INDEX_op_dup_vec: 3853 case INDEX_op_not_vec: 3854 case INDEX_op_shli_vec: 3855 case INDEX_op_shri_vec: 3856 case 
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_not_vec:
    case INDEX_op_bitsel_vec:
        return 1;
    case INDEX_op_cmp_vec:
    case INDEX_op_cmpsel_vec:
        return -1;

    case INDEX_op_rotli_vec:
        return have_avx512vl && vece >= MO_32 ? 1 : -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8. */
        return vece == MO_8 ? -1 : 1;

    case INDEX_op_sari_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            if (have_avx512vl) {
                return 1;
            }
            /*
             * We can emulate this for MO_64, but it does not pay off
             * unless we're producing at least 4 values.
             */
            return type >= TCG_TYPE_V256 ? -1 : 0;
        }
        return 0;

    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
        return vece >= MO_16;
    case INDEX_op_sars_vec:
        switch (vece) {
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotls_vec:
        return vece >= MO_16 ? -1 : 0;

    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
        case MO_64:
            return have_avx2;
        }
        return 0;
    case INDEX_op_sarv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
            return have_avx2;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512vbmi2 ? -1 : 0;
        case MO_32:
        case MO_64:
            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
        }
        return 0;
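
    /*
     * Editor's note: there is no x86 byte multiply, so the MO_8 case
     * below is expanded via paired 16-bit multiplies in expand_vec_mul().
     */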
    case INDEX_op_mul_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_64:
            return have_avx512dq;
        }
        return 1;

    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
        return vece <= MO_16;
    case INDEX_op_smin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_abs_vec:
        return vece <= MO_32 || have_avx512vl;

    default:
        return 0;
    }
}

static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    uint8_t mask;

    tcg_debug_assert(vece == MO_8);
    if (right) {
        mask = 0xff >> imm;
        tcg_gen_shri_vec(MO_16, v0, v1, imm);
    } else {
        mask = 0xff << imm;
        tcg_gen_shli_vec(MO_16, v0, v1, imm);
    }
    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}

static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to 16-bit, shift, and repack.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        t1 = tcg_temp_new_vec(type);
        if (imm <= 32) {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
        }
        tcg_temp_free_vec(t1);
        break;

    default:
        g_assert_not_reached();
    }
}
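
/*
 * Editor's note: immediate rotate-left.  With AVX-512 VBMI2 (and elements
 * wider than a byte) this is a single VPSHLDI of v1 with itself; otherwise
 * it is built as (v1 << imm) | (v1 >> (width - imm)).
 */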
static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t;

    if (vece != MO_8 && have_avx512vbmi2) {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_shli_vec(vece, t, v1, imm);
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec t;

    if (have_avx512vbmi2) {
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, t, 8 << vece);
    tcg_gen_sub_vec(vece, t, t, sh);
    if (right) {
        tcg_gen_shlv_vec(vece, t, v1, t);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shrv_vec(vece, t, v1, t);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}

static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, t, lsh);
        if (vece >= MO_32) {
            tcg_gen_rotlv_vec(vece, v0, v1, t);
        } else {
            expand_vec_rotv(type, vece, v0, v1, t, false);
        }
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
        tcg_gen_shls_vec(vece, t, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, t);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(t);
}

static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
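    /*
     * Editor's note, worked example for one lane: with x = 0x12 and
     * y = 0x34 the unpacked words are 0x0012 and 0x3400, the 16-bit
     * product is 0xa800 (the low bits of 0x12 * 0x34 = 0x03a8, shifted
     * left by 8), and the logical shift right by 8 leaves 0x00a8, which
     * the saturating pack stores as the result byte 0xa8.
     */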
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}

static TCGCond expand_vec_cond(TCGType type, unsigned vece,
                               TCGArg *a1, TCGArg *a2, TCGCond cond)
{
    /*
     * Without AVX512, there are no 64-bit unsigned comparisons.
     * We must bias the inputs so that they become signed.
     * All other swapping and inversion are handled during code generation.
     */
    if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
        TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
        TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));

        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        *a1 = tcgv_vec_arg(t1);
        *a2 = tcgv_vec_arg(t2);
        cond = tcg_signed_cond(cond);
    }
    return cond;
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
                           TCGArg a1, TCGArg a2, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
}

static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
                              TCGArg a1, TCGArg a2,
                              TCGArg a3, TCGArg a4, TCGCond cond)
{
    cond = expand_vec_cond(type, vece, &a1, &a2, cond);
    /* Expand directly; do not recurse.  */
    vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
}
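
/*
 * Editor's note: back-end expansion hook.  This receives the vector ops for
 * which tcg_can_emit_vec_op() returned -1 and rewrites them into sequences
 * the host can encode, using the expand_vec_* helpers above.
 */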
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2, a3, a4, a5;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    a1 = va_arg(va, TCGArg);
    a2 = va_arg(va, TCGArg);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(a1));

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        a3 = va_arg(va, TCGArg);
        expand_vec_cmp(type, vece, a0, a1, a2, a3);
        break;

    case INDEX_op_cmpsel_vec:
        a3 = va_arg(va, TCGArg);
        a4 = va_arg(va, TCGArg);
        a5 = va_arg(va, TCGArg);
        expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
        break;

    default:
        break;
    }

    va_end(va);
}
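
/*
 * Editor's note: registers pushed in the prologue and popped in the
 * epilogue.  The order here must match the DW_CFA_offset entries in the
 * debug_frame unwind information at the end of this file.
 */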
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit. */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb.  */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb.  */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif