Lines Matching "cross-i686-system"
94 /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
95 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
189 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
198 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
235 * TESTQ -> TESTL (uint32_t)
236 * TESTQ -> BT (is_power_of_2)
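The two TESTQ rewrites above boil down to simple mask properties, sketched below with ad hoc helpers (is_power_of_2 and test_zf are illustrative, not the emitter's code): a mask that fits in 32 bits yields the same ZF from a 32-bit TEST, and a single-bit mask can be tested with BT instead.

    /* Illustrative sketch of the narrowing conditions behind
     * "TESTQ -> TESTL (uint32_t)" and "TESTQ -> BT (is_power_of_2)". */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool is_power_of_2(uint64_t m)
    {
        return m != 0 && (m & (m - 1)) == 0;
    }

    /* ZF after "test reg, mask" is set iff (reg & mask) == 0. */
    static bool test_zf(uint64_t reg, uint64_t mask)
    {
        return (reg & mask) == 0;
    }

    int main(void)
    {
        uint64_t reg = 0x8000000000000001ull;

        /* Mask fits in 32 bits: a 32-bit TEST sees exactly the bits
           the mask selects, so ZF is unchanged. */
        uint64_t m32 = 0x00000001;
        assert(m32 == (uint32_t)m32);
        assert(test_zf(reg, m32) == test_zf((uint32_t)reg, (uint32_t)m32));

        /* Single-bit mask: BT copies that one bit into CF, so
           "bit clear" matches "TEST sets ZF". */
        uint64_t m1 = 1ull << 63;
        assert(is_power_of_2(m1));
        bool cf = (reg >> 63) & 1;          /* what BT reg, 63 would produce */
        assert((cf == false) == test_zf(reg, m1));
        return 0;
    }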
505 /* Group 1 opcode extensions for 0x80-0x83.
516 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
539 #define JCC_JMP (-1)
581 /* We should never be asking for both 16-bit and 64-bit operation.
642 the 32-bit compilation paths. This method works with all versions of gcc,
662 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
673 /* VEX.m-mmmm */
708 /* The entire 4-byte evex prefix; with R' and V' set. */
781 We handle either RM and INDEX missing with a negative value. In 64-bit
792 /* Try for a rip-relative addressing mode. This has replaced
793 the 32-bit-mode absolute addressing encoding. */
794 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
795 intptr_t disp = offset - pc;
804 rip-relative addressing. */
843 field indicates no index register. In 64-bit mode, the REX.X
881 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
887 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
894 /* Absolute for 32-bit, pc-relative for 64-bit. */
903 /* Absolute for 32-bit, pc-relative for 64-bit. */
1041 if (arg == -1) {
1052 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1062 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1064 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1076 if (arg == -1) {
1084 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1086 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1110 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
1111 diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
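The pool labels and the "7 byte pc-relative lea before the 10 byte movq" line suggest a size-ordered strategy for materializing a 64-bit constant. A hedged sketch of that decision, with a hypothetical pick_movi helper rather than the real emitter:

    /* Sketch of the constant-materialization order suggested above;
     * pick_movi and the enum are hypothetical, not QEMU code. */
    #include <stdint.h>

    enum movi_kind { MOVI_ZERO32, MOVI_SIGN32, MOVI_LEA_RIP, MOVI_MOVABS };

    /* pcrel_diff is the constant (viewed as an address) minus the
       current code pointer. */
    static enum movi_kind pick_movi(int64_t arg, intptr_t pcrel_diff)
    {
        if (arg == (uint32_t)arg) {
            return MOVI_ZERO32;          /* 5-byte movl, zero-extends */
        }
        if (arg == (int32_t)arg) {
            return MOVI_SIGN32;          /* 7-byte movq, sign-extended imm32 */
        }
        /* "Try a 7 byte pc-relative lea before the 10 byte movq":
           the rel32 is measured from the end of the 7-byte insn. */
        if (pcrel_diff - 7 == (int32_t)(pcrel_diff - 7)) {
            return MOVI_LEA_RIP;         /* lea disp(%rip), %reg */
        }
        return MOVI_MOVABS;              /* 10-byte movabsq $imm64 */
    }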
1173 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
1210 /* There is no instruction that can validate 8-byte alignment. */
1217 * and stores use a 16-byte aligned offset. Validate that the
1225 * The gvec infrastructure only requires 16-byte alignment,
1255 /* There is no instruction that can validate 8-byte alignment. */
1262 * and stores use a 16-byte aligned offset. Validate that the
1277 * The gvec infrastructure only requires 16-byte alignment,
1375 /* 32-bit mov zero extends. */
1413 rexw = c & -8;
1426 if (val == 1 || val == -1) {
1430 * The single-byte increment encodings are re-tasked
1442 * Facilitate using an 8-bit immediate. Carry is inverted
1446 val = -128;
1458 /* AND with no high bits set can use a 32-bit operation. */
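The fragments above all shrink the immediate encoding: ±1 can use inc/dec, values in signed-byte range use the imm8 form, and +128 (which just misses imm8) can be rewritten as a subtract of -128; the "Carry is inverted" line appears to record that this rewrite flips the carry, so it is presumably only done when the carry output is unused. A small check of the +128 trick, sketch only:

    /* Check of the imm8 trick hinted at by "val = -128":
     * add reg, 128 == sub reg, -128, and -128 fits in a signed byte
     * while +128 does not. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (uint64_t reg = 0; reg < 1024; reg += 7) {
            assert(reg + 128 == reg - (uint64_t)(int64_t)-128);
        }
        assert(128 > INT8_MAX);      /* +128 would need an imm32 */
        assert(-128 >= INT8_MIN);    /* -128 still fits in imm8 */
        return 0;
    }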
1509 if (l->has_value) {
1510 val = tcg_pcrel_diff(s, l->u.value_ptr);
1511 val1 = val - 2;
1513 if (opc == -1) {
1521 if (opc == -1) {
1523 tcg_out32(s, val - 5);
1526 tcg_out32(s, val - 6);
1530 if (opc == -1) {
1535 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1536 s->code_ptr += 1;
1538 if (opc == -1) {
1543 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1544 s->code_ptr += 4;
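The "val - 2", "val - 5" and "val - 6" adjustments above are the usual rel8/rel32 corrections: a branch displacement is measured from the end of the instruction, which is 2 bytes for the short forms, 5 for jmp rel32, and 6 for the two-byte-opcode jcc rel32; when the target is not yet known, the lines above fall back to emitting a relocation instead. A hedged sketch of the length choice (branch_length is hypothetical):

    /* Sketch of the short-vs-long branch choice implied by
     * "val - 2" / "val - 5" / "val - 6".  Not the QEMU code. */
    #include <stdbool.h>
    #include <stdint.h>

    /* 'dist' is target minus the address where the branch starts. */
    static int branch_length(intptr_t dist, bool is_jcc)
    {
        intptr_t rel8 = dist - 2;                 /* both short forms are 2 bytes */
        if (rel8 == (int8_t)rel8) {
            return 2;                             /* EB rel8 or 7x rel8 */
        }
        if (is_jcc) {
            /* 0F 8x rel32: displacement measured past 6 bytes. */
            return (dist - 6 == (int32_t)(dist - 6)) ? 6 : -1;
        }
        /* E9 rel32: displacement measured past 5 bytes. */
        return (dist - 5 == (int32_t)(dist - 5)) ? 5 : -1;
    }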
1707 /* If arg2 is -1, convert to LTU/GEU vs 1. */
1734 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1740 /* X - X - C = -C = (C ? -1 : 0) */
1743 /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1746 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1749 /* -(C ? -1 : 0) = (C ? 1 : 0) */
1776 * The XOR breaks any false dependency for the low-byte write to dest,
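The SBB identities quoted above are plain two's-complement facts, verified below for both carry values; the XOR remark on the last line is about zeroing the destination so the byte-wide SETcc write does not create a false dependency. Sketch only:

    /* Verify the setcond identities quoted above for C in {0, 1}:
     *   sbb r,r ->  X - X - C = -C        = (C ? -1 : 0)
     *   not     -> ~(C ? -1 : 0)          = (C ? 0 : -1)
     *   inc     ->  (C ? -1 : 0) + 1      = (C ? 0 : 1)
     *   neg     -> -(C ? -1 : 0)          = (C ? 1 : 0)
     */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (int c = 0; c <= 1; c++) {
            int64_t x = 12345;                  /* any value: it cancels */
            int64_t sbb = x - x - c;            /* what "sbb r,r" leaves */
            assert(sbb == (c ? -1 : 0));
            assert(~sbb == (c ? 0 : -1));
            assert(sbb + 1 == (c ? 0 : 1));
            assert(-sbb == (c ? 1 : 0));
        }
        return 0;
    }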
1892 /* Since we have destroyed the flags from BSR, we have to re-test. */
1900 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1906 /* rip-relative addressing into the constant pool.
1909 be able to re-use the pool constant for more calls. */
1912 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1923 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1979 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1980 * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1987 * Because i686 has no register parameters and because x86_64 has xchg
1991 * Even then, a scratch is only needed for l->raddr. Rather than expose
1992 * a general-purpose scratch when we don't actually know it's available,
2001 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
2040 MemOp opc = get_memop(l->oi);
2041 tcg_insn_unit **label_ptr = &l->label_ptr[0];
2044 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2046 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2053 tcg_out_jmp(s, l->raddr);
2062 MemOp opc = get_memop(l->oi);
2063 tcg_insn_unit **label_ptr = &l->label_ptr[0];
2066 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2068 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2074 tcg_out_jmp(s, l->raddr);
2080 .index = -1
2131 h->index = TCG_REG_L0;
2132 h->ofs = 0;
2133 h->seg = 0;
2137 h->base = addr;
2138 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2139 a_mask = (1 << h->aa.align) - 1;
2148 unsigned s_mask = (1 << s_bits) - 1;
2153 ldst->is_ld = is_ld;
2154 ldst->oi = oi;
2155 ldst->addr_reg = addr;
2158 ttype = s->addr_type;
2162 if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2171 s->page_bits - CPU_TLB_ENTRY_BITS);
2182 * check that we don't cross pages for the complete access.
2188 addr, s_mask - a_mask);
2190 tlb_mask = s->page_mask | a_mask;
2199 ldst->label_ptr[0] = s->code_ptr;
2200 s->code_ptr += 4;
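The a_mask/s_mask/tlb_mask lines above fold the alignment check and the page-crossing check into one AND-and-compare: adding s_mask - a_mask nudges any access that would spill into the next page past the page boundary, and keeping the a_mask bits in the mask catches misalignment. A standalone model of that test, assuming 4KiB pages, a_bits < s_bits (the case where the extra add is emitted), and hypothetical names:

    /* Model of the combined alignment + page-cross test suggested by
     * "addr, s_mask - a_mask" and "tlb_mask = s->page_mask | a_mask". */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_BITS 12
    #define PAGE_MASK (~((UINT64_C(1) << PAGE_BITS) - 1))

    /* True if the access is aligned to (1 << a_bits) and does not cross
     * a page; assumes a_bits < s_bits, where the add above is emitted.
     * The real code compares against the TLB comparator, which holds
     * the page address of the cached page. */
    static bool fast_path_ok(uint64_t addr, unsigned a_bits, unsigned s_bits)
    {
        uint64_t a_mask = (UINT64_C(1) << a_bits) - 1;
        uint64_t s_mask = (UINT64_C(1) << s_bits) - 1;
        uint64_t probe = addr + (s_mask - a_mask);
        uint64_t tlb_mask = PAGE_MASK | a_mask;
        return (probe & tlb_mask) == (addr & PAGE_MASK);
    }

    int main(void)
    {
        /* 8-byte access, 4-byte alignment required. */
        assert(fast_path_ok(0x0ff8, 2, 3));     /* aligned, inside the page */
        assert(!fast_path_ok(0x0ffc, 2, 3));    /* aligned, but crosses into 0x1000 */
        assert(!fast_path_ok(0x0ff9, 2, 3));    /* misaligned */
        return 0;
    }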
2209 ldst->is_ld = is_ld;
2210 ldst->oi = oi;
2211 ldst->addr_reg = addr;
2216 ldst->label_ptr[0] = s->code_ptr;
2217 s->code_ptr += 4;
2230 /* Do big-endian loads with movbe. */
2248 /* There is no extending movbe; only low 16-bits are modified. */
2318 * Without 16-byte atomicity, use integer regs.
2344 * With 16-byte atomicity, a vector load is required.
2345 * If we already have 16-byte alignment, then VMOVDQA always works.
2395 ldst->type = data_type;
2396 ldst->datalo_reg = datalo;
2397 ldst->datahi_reg = datahi;
2398 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2409 * Do big-endian stores here when movbe is available, or in system mode.
2410 * User-only without movbe will have its swapping done generically.
2454 * Without 16-byte atomicity, use integer regs.
2471 * With 16-byte atomicity, a vector store is required.
2472 * If we already have 16-byte alignment, then VMOVDQA always works.
2522 ldst->type = data_type;
2523 ldst->datalo_reg = datalo;
2524 ldst->datahi_reg = datahi;
2525 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2546 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2548 tcg_out_nopn(s, gap - 1);
2560 uintptr_t addr = tb->jmp_target_addr[n];
2561 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
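The gap computation and the qatomic_set above keep the 4-byte jump displacement (which begins one byte after the jmp opcode) 4-byte aligned so it can be retargeted with a single aligned 32-bit store, whose value is the target minus the address just past the displacement. A hedged model of both calculations:

    /* Model of the goto_tb alignment and patch arithmetic above;
     * addresses are plain integers here, not code buffers. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Padding: make code_ptr + 1 (start of the rel32) 4-byte aligned. */
        for (uintptr_t code_ptr = 0x1000; code_ptr < 0x1008; code_ptr++) {
            uintptr_t aligned = (code_ptr + 1 + 3) & ~(uintptr_t)3;
            uintptr_t gap = aligned - code_ptr;         /* 1..4 */
            uintptr_t jmp_start = code_ptr + (gap - 1); /* after gap-1 nop bytes */
            assert(((jmp_start + 1) & 3) == 0);         /* rel32 is aligned */
        }

        /* Patching: rel32 = target - (address of the byte after the rel32). */
        uintptr_t jmp_rx = 0x2004;                      /* address of the rel32 */
        uintptr_t target = 0x3000;
        int32_t rel32 = (int32_t)(target - (jmp_rx + 4));
        assert(jmp_rx + 4 + rel32 == target);
        return 0;
    }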
2597 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2604 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2646 /* For 3-operand addition, use LEA. */
2650 c3 = a2, a2 = -1;
2716 /* For small constant 3-operand shift, use LEA. */
2717 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2718 if (a2 - 1 == 0) {
2719 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2722 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2723 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
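The two rewrites quoted above work because LEA computes base + index*scale + disp without touching flags: a shift by 1 becomes lea (a1,a1), shifts by 2 or 3 use scale 4 or 8, and the earlier 3-operand add is lea (a1,a2); the "(a2 - 1) < 3" guard is an unsigned trick that accepts exactly 1..3. A quick check of the arithmetic:

    /* Check the lea-for-shift equivalences quoted above
     * ("shl $n,a1,a0 -> lea 0(,a1,n),a0") for n = 1..3. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (uint64_t a1 = 0; a1 < 1000; a1 += 13) {
            assert((a1 << 1) == a1 + a1);          /* lea (a1,a1),a0  */
            assert((a1 << 2) == a1 * 4);           /* lea 0(,a1,4),a0 */
            assert((a1 << 3) == a1 * 8);           /* lea 0(,a1,8),a0 */
        }
        /* "(a2 - 1) < 3" as an unsigned compare accepts exactly 1, 2, 3. */
        for (uint32_t a2 = 0; a2 < 10; a2++) {
            assert(((a2 - 1u) < 3u) == (a2 >= 1 && a2 <= 3));
        }
        return 0;
    }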
2787 /* Output must be sign-extended. */
2796 /* Output must be zero-extended, but input isn't. */
2818 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I32);
2822 tcg_out_qemu_ld(s, a0, -1, a1, a2, TCG_TYPE_I64);
2834 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I32);
2838 tcg_out_qemu_st(s, a0, -1, a1, a2, TCG_TYPE_I64);
2948 /* This is a 32-bit zero-extending right shift. */
2961 * On the off-chance that we can use the high-byte registers.
2963 * would have gotten from the normal tcg-op.c expansion.
3095 /* avx2 does not have 64-bit min/max; adjusted during expand. */
3187 tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
3201 /* Swap to place constant in V4 to take advantage of zero-masking. */
3243 tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4);
3430 /* First merge the two 32-bit inputs to a single 64-bit element. */
3432 /* Then replicate the 64-bit elements across the rest of the vector. */
3850 return -1;
3853 return have_avx512vl && vece >= MO_32 ? 1 : -1;
3858 return vece == MO_8 ? -1 : 1;
3863 return -1;
3875 return type >= TCG_TYPE_V256 ? -1 : 0;
3892 return vece >= MO_16 ? -1 : 0;
3918 return have_avx512vbmi2 ? -1 : 0;
3921 return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3928 return -1;
3974 /* Unpack to 16-bit, shift, and repack. */
3994 * 32-bit shift and overwriting the high half of a 64-bit logical
3996 * does not, so we have to bound the smaller shift -- we get the
4006 * the sign-extend, shift and merge.
4011 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
4035 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
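The "(8 << vece) - imm" shift count above is the complementary half of building a rotate from two shifts, rotl(x, n) == (x << n) | (x >> (w - n)); the earlier "Unpack to 16-bit, shift, and repack" line belongs to the same family of expansions, since SSE/AVX have no byte-granularity shifts. A scalar check of the rotate identity for 8-bit lanes, sketch only:

    /* Verify rotl8(x, n) == (x << n) | (x >> (8 - n)) against a
     * bit-by-bit reference. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (unsigned x = 0; x < 256; x++) {
            for (unsigned n = 1; n < 8; n++) {
                uint8_t ref = 0;
                for (unsigned i = 0; i < 8; i++) {
                    if (x & (1u << i)) {
                        ref |= (uint8_t)(1u << ((i + n) % 8));
                    }
                }
                uint8_t rot = (uint8_t)((x << n) | (x >> (8 - n)));
                assert(rot == ref);
            }
        }
        return 0;
    }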
4084 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
4105 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
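The "8 bits of right padding" remark is a modular-arithmetic fact: with one byte operand in the low half of a 16-bit lane and the other in the high half, the 16-bit product is exactly the 8-bit product shifted left by 8. A scalar check:

    /* With x in the low byte of a 16-bit lane and y in the high byte,
     * the 16-bit product is (x*y mod 256) << 8, so the byte result sits
     * in the high half with 8 bits of zero padding below it. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (unsigned x = 0; x < 256; x++) {
            for (unsigned y = 0; y < 256; y++) {
                uint16_t lane_x = (uint16_t)x;            /* 0 | x */
                uint16_t lane_y = (uint16_t)(y << 8);     /* y | 0 */
                uint16_t prod = (uint16_t)(lane_x * lane_y);
                assert((prod >> 8) == (uint8_t)(x * y));
                assert((prod & 0xff) == 0);               /* the padding */
            }
        }
        return 0;
    }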
4165 * Without AVX512, there are no 64-bit unsigned comparisons.
4174 TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
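The constant above, 1ull << ((8 << vece) - 1), is the element's sign bit; lacking AVX512's native unsigned compares, an unsigned comparison can be done by flipping that bit in both operands and comparing signed (whether the backend uses exactly this bias form or an umin/cmpeq variant is not visible in these lines). A scalar check of the identity for 8-bit lanes:

    /* Check the sign-bias trick suggested by 1 << (width - 1):
     *   x <u y   iff   (x ^ bias) <s (y ^ bias). */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint8_t bias = 1u << 7;       /* 1 << ((8 << MO_8) - 1) */
        for (unsigned x = 0; x < 256; x++) {
            for (unsigned y = 0; y < 256; y++) {
                int unsigned_lt = (uint8_t)x < (uint8_t)y;
                int signed_lt   = (int8_t)(x ^ bias) < (int8_t)(y ^ bias);
                assert(unsigned_lt == signed_lt);
            }
        }
        return 0;
    }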
4298 + TCG_TARGET_STACK_ALIGN - 1) \
4299 & ~(TCG_TARGET_STACK_ALIGN - 1))
4309 stack_addend = FRAME_SIZE - PUSH_SIZE;
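The FRAME_SIZE expression above is the standard power-of-two round-up, (x + a - 1) & ~(a - 1), the same shape as the earlier mask computations. A quick self-contained check:

    /* Round x up to a multiple of a, for power-of-two a. */
    #include <assert.h>
    #include <stddef.h>

    static size_t align_up(size_t x, size_t a)
    {
        return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        assert(align_up(0, 16) == 0);
        assert(align_up(1, 16) == 16);
        assert(align_up(16, 16) == 16);
        assert(align_up(17, 16) == 32);
        return 0;
    }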
4329 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4336 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4343 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4349 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4352 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4356 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4363 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4408 s->reserved_regs = 0;
4409 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4410 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4413 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4414 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4415 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4416 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4417 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4418 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4419 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4420 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4421 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4422 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4440 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4441 .h.cie.id = -1,
4444 .h.cie.data_align = 0x78, /* sleb128 -8 */
4448 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4456 0x90, 1, /* DW_CFA_offset, %rip, -8 */
4458 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
4459 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
4460 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
4461 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
4462 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
4463 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
4469 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4470 .h.cie.id = -1,
4473 .h.cie.data_align = 0x7c, /* sleb128 -4 */
4477 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4485 0x88, 1, /* DW_CFA_offset, %eip, -4 */
4487 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
4488 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
4489 0x86, 4, /* DW_CFA_offset, %esi, -16 */
4490 0x87, 5, /* DW_CFA_offset, %edi, -20 */
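The byte pairs above are DW_CFA_offset instructions: the opcode byte is 0x80 | register-number, the operand is a ULEB128 factored offset, and the real offset is that operand times the CIE's data_align (sleb128 0x78 = -8 for the 64-bit frame, 0x7c = -4 for the 32-bit one), which is how 0x90, 1 comes to mean "%rip at CFA-8". A minimal decoder for just this opcode, assuming single-byte LEB values:

    /* Decode the DW_CFA_offset pairs listed above: 0x80 | regno, then a
     * ULEB128 factored offset multiplied by data_align. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static int sleb128_1(uint8_t b)              /* one-byte sleb128 */
    {
        return (b & 0x40) ? (int)b - 0x80 : (int)b;
    }

    int main(void)
    {
        const int data_align = sleb128_1(0x78);  /* -8 (64-bit CIE) */
        const uint8_t fde[] = { 0x90, 1, 0x86, 2, 0x83, 3 };

        assert(data_align == -8);
        assert(sleb128_1(0x7c) == -4);           /* 32-bit CIE */

        for (unsigned i = 0; i < sizeof(fde); i += 2) {
            unsigned reg = fde[i] & 0x3f;        /* DW_CFA_offset | reg */
            int off = fde[i + 1] * data_align;   /* factored offset */
            printf("reg %u saved at CFA%d\n", reg, off);
        }
        /* Prints reg 16 (%rip) at CFA-8, reg 6 (%rbp) at CFA-16,
           reg 3 (%rbx) at CFA-24, matching the comments above. */
        return 0;
    }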