tcg/i386/tcg-target.c.inc

94     /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
95        any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
189         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
198         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
235          *    TESTQ -> TESTL   (uint32_t)
236          *    TESTQ -> BT      (is_power_of_2)
506 /* Group 1 opcode extensions for 0x80-0x83.
517 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
540 #define JCC_JMP (-1)
582         /* We should never be asking for both 16 and 64-bit operation.  */
643    the 32-bit compilation paths.  This method works with all versions of gcc,
663        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
674         /* VEX.m-mmmm */
709     /* The entire 4-byte evex prefix; with R' and V' set. */
782    We handle either RM or INDEX missing with a negative value.  In 64-bit
793             /* Try for a rip-relative addressing mode.  This has replaced
794                the 32-bit-mode absolute addressing encoding.  */
795             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
796             intptr_t disp = offset - pc;
805                rip-relative addressing.  */
844            field indicates no index register.  In 64-bit mode, the REX.X
882     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
888     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
895     /* Absolute for 32-bit, pc-relative for 64-bit.  */
904     /* Absolute for 32-bit, pc-relative for 64-bit.  */
1042     if (arg == -1) {
1053         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1063             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1065             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1077     if (arg == -1) {
1085         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1087         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1096     if (arg == 0 && !s->carry_live) {
1111     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1112     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1174        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1211         /* There is no instruction that can validate 8-byte alignment.  */
1218          * and stores use a 16-byte aligned offset.  Validate that the
1226          * The gvec infrastructure only requires 16-byte alignment,
1256         /* There is no instruction that can validate 8-byte alignment.  */
1263          * and stores use a 16-byte aligned offset.  Validate that the
1278          * The gvec infrastructure only requires 16-byte alignment,
1376     /* 32-bit mov zero extends.  */
1414         rexw = c & -8;
1427             if (val == 1 || val == -1) {
1431                      * The single-byte increment encodings are re-tasked
1443                  * Facilitate using an 8-bit immediate.  Carry is inverted
1447                 val = -128;
1459                 /* AND with no high bits set can use a 32-bit operation.  */
1510     if (l->has_value) {
1511         val = tcg_pcrel_diff(s, l->u.value_ptr);
1512         val1 = val - 2;
1514             if (opc == -1) {
1522             if (opc == -1) {
1524                 tcg_out32(s, val - 5);
1527                 tcg_out32(s, val - 6);
1531         if (opc == -1) {
1536         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1537         s->code_ptr += 1;
1539         if (opc == -1) {
1544         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1545         s->code_ptr += 4;
1744         /* If arg2 is -1, convert to LTU/GEU vs 1. */
1771          * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1777         /* X - X - C = -C = (C ? -1 : 0) */
1780             /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1783             /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1786             /* -(C ? -1 : 0) = (C ? 1 : 0) */
1813      * The XOR breaks any false dependency for the low-byte write to dest,
1940     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1946         /* rip-relative addressing into the constant pool.
1949            be able to re-use the pool constant for more calls.  */
1952         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1963     if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
2019      * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
2020      * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
2031  * Even then, a scratch is only needed for l->raddr.  Rather than expose
2032  * a general-purpose scratch when we don't actually know it's available,
2041     tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
2080     MemOp opc = get_memop(l->oi);
2081     tcg_insn_unit **label_ptr = &l->label_ptr[0];
2084     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2086         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2093     tcg_out_jmp(s, l->raddr);
2102     MemOp opc = get_memop(l->oi);
2103     tcg_insn_unit **label_ptr = &l->label_ptr[0];
2106     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2108         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2114     tcg_out_jmp(s, l->raddr);
2120     .index = -1
2171         h->index = TCG_REG_L0;
2172         h->ofs = 0;
2173         h->seg = 0;
2177     h->base = addr;
2178     h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2179     a_mask = (1 << h->aa.align) - 1;
2188         unsigned s_mask = (1 << s_bits) - 1;
2193         ldst->is_ld = is_ld;
2194         ldst->oi = oi;
2195         ldst->addr_reg = addr;
2198             ttype = s->addr_type;
2209                        TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
2220          * check that we don't cross pages for the complete access.
2226                                  addr, s_mask - a_mask);
2237         ldst->label_ptr[0] = s->code_ptr;
2238         s->code_ptr += 4;
2247         ldst->is_ld = is_ld;
2248         ldst->oi = oi;
2249         ldst->addr_reg = addr;
2254         ldst->label_ptr[0] = s->code_ptr;
2255         s->code_ptr += 4;
2268     /* Do big-endian loads with movbe.  */
2286             /* There is no extending movbe; only low 16-bits are modified.  */
2356          * Without 16-byte atomicity, use integer regs.
2382          * With 16-byte atomicity, a vector load is required.
2383          * If we already have 16-byte alignment, then VMOVDQA always works.
2430     tcg_out_qemu_ld_direct(s, data, -1, h, type, get_memop(oi));
2433         ldst->type = type;
2434         ldst->datalo_reg = data;
2435         ldst->datahi_reg = -1;
2436         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2455         ldst->type = type;
2456         ldst->datalo_reg = datalo;
2457         ldst->datahi_reg = datahi;
2458         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2474      * Do big-endian stores with movbe or system-mode.
2475      * User-only without movbe will have its swapping done generically.
2519          * Without 16-byte atomicity, use integer regs.
2536          * With 16-byte atomicity, a vector store is required.
2537          * If we already have 16-byte alignment, then VMOVDQA always works.
2584     tcg_out_qemu_st_direct(s, data, -1, h, get_memop(oi));
2587         ldst->type = type;
2588         ldst->datalo_reg = data;
2589         ldst->datahi_reg = -1;
2590         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2617         ldst->type = type;
2618         ldst->datalo_reg = datalo;
2619         ldst->datahi_reg = datahi;
2620         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2646     int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2648         tcg_out_nopn(s, gap - 1);
2666     uintptr_t addr = tb->jmp_target_addr[n];
2667     qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2694         tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, -1, 0, a2);
2773         /* Implement 0 + 0 + C with -(x - x - c). */
2847         /* Since we have destroyed the flags from BSR, we have to re-test.  */
3159     /* For small constant 3-operand shift, use LEA.  */
3162             /* shl $1,a1,a0 -> lea (a1,a1),a0 */
3165             /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
3166             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
3286         /* Output must be sign-extended. */
3295         /* Output must be zero-extended, but input isn't. */
3409         /* This is a 32-bit zero-extending right shift.  */
3415          * On the off-chance that we can use the high-byte registers.
3417          * would have gotten from the normal tcg-op.c expansion.
3659         /* avx2 does not have 64-bit min/max; adjusted during expand. */
3751         tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
3765     /* Swap to place constant in V4 to take advantage of zero-masking. */
3807         tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4);
3994         /* First merge the two 32-bit inputs to a single 64-bit element. */
3996         /* Then replicate the 64-bit elements across the rest of the vector. */
4238         return -1;
4241         return have_avx512vl && vece >= MO_32 ? 1 : -1;
4246         return vece == MO_8 ? -1 : 1;
4251             return -1;
4263             return type >= TCG_TYPE_V256 ? -1 : 0;
4280         return vece >= MO_16 ? -1 : 0;
4306             return have_avx512vbmi2 ? -1 : 0;
4309             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
4316             return -1;
4362         /* Unpack to 16-bit, shift, and repack.  */
4382              * 32-bit shift and overwriting the high half of a 64-bit logical
4384              * does not, so we have to bound the smaller shift -- we get the
4394              * the sign-extend, shift and merge.
4399             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
4423     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
4472         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
4493      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
4553      * Without AVX512, there are no 64-bit unsigned comparisons.
4562         TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4686       + TCG_TARGET_STACK_ALIGN - 1) \
4687      & ~(TCG_TARGET_STACK_ALIGN - 1))
4697     stack_addend = FRAME_SIZE - PUSH_SIZE;
4717             tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4724         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4731         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4737      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4740     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4744     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4751     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4796     s->reserved_regs = 0;
4797     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4798     tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4801     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4802     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4803     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4804     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4805     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4806     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4807     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4808     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4809     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4810     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4828     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4829     .h.cie.id = -1,
4832     .h.cie.data_align = 0x78,             /* sleb128 -8 */
4836     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4844         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4846         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4847         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4848         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4849         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4850         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4851         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4857     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4858     .h.cie.id = -1,
4861     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4865     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4873         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4875         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4876         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4877         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4878         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */