/*
 * New-style TCG opcode generator for i386 instructions
 *
 *  Copyright (c) 2022 Red Hat, Inc.
 *
 * Author: Paolo Bonzini <pbonzini@redhat.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Sometimes, knowing what the backend has can produce better code.
 * The exact opcode to check depends on 32- vs. 64-bit.
 */
#ifdef TARGET_X86_64
#define INDEX_op_extract2_tl            INDEX_op_extract2_i64
#else
#define INDEX_op_extract2_tl            INDEX_op_extract2_i32
#endif

#define MMX_OFFSET(reg)                        \
  ({ assert((reg) >= 0 && (reg) <= 7);         \
     offsetof(CPUX86State, fpregs[reg].mmx); })

#define ZMM_OFFSET(reg)                        \
  ({ assert((reg) >= 0 && (reg) <= 15);        \
     offsetof(CPUX86State, xmm_regs[reg]); })

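/*
 * Helper typedefs.  The suffix after SSEFunc_ encodes the signature: the
 * first letter is the result ('0' for void, 'i' for TCGv_i32, 'l' for
 * TCGv_i64) and the following letters are the arguments ('e' = env pointer,
 * 'p' = vector register pointer, 'i' = TCGv_i32 immediate, 't' = TCGv).
 */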
typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b);
typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv_ptr reg_c);
typedef void (*SSEFunc_0_epppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                TCGv_ptr reg_c, TCGv_ptr reg_d);
typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv_i32 val);
typedef void (*SSEFunc_0_epppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                TCGv_ptr reg_c, TCGv_i32 val);
typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val);
typedef void (*SSEFunc_0_pppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
                               TCGv_i32 val);
typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv val);
typedef void (*SSEFunc_0_epppti)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                 TCGv_ptr reg_c, TCGv a0, TCGv_i32 scale);
typedef void (*SSEFunc_0_eppppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                 TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 flags);
typedef void (*SSEFunc_0_eppppii)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                  TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 even,
                                  TCGv_i32 odd);

static void gen_JMP_m(DisasContext *s, X86DecodedInsn *decode);
static void gen_JMP(DisasContext *s, X86DecodedInsn *decode);

static inline TCGv_i32 tcg_constant8u_i32(uint8_t val)
{
    return tcg_constant_i32(val);
}

static void gen_NM_exception(DisasContext *s)
{
    gen_exception(s, EXCP07_PREX);
}

static void gen_lea_modrm(DisasContext *s, X86DecodedInsn *decode)
{
    AddressParts *mem = &decode->mem;
    TCGv ea;

    ea = gen_lea_modrm_1(s, *mem, decode->e.vex_class == 12);
    if (decode->e.special == X86_SPECIAL_BitTest) {
        MemOp ot = decode->op[1].ot;
        int poslen = 8 << ot;
        int opn = decode->op[2].n;
        TCGv ofs = tcg_temp_new();

        /* Extract memory displacement from the second operand.  */
        assert(decode->op[2].unit == X86_OP_INT && decode->op[2].ot != MO_8);
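        /*
         * For example, a dword BT with bit index 100 in the register adds
         * (100 >> 3) & -4 = 12 bytes to the address; the operation itself
         * then tests bit 100 % 32 = 4 of the dword at that address.
         */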
        tcg_gen_sextract_tl(ofs, cpu_regs[opn], 3, poslen - 3);
        tcg_gen_andi_tl(ofs, ofs, -1 << ot);
        tcg_gen_add_tl(s->A0, ea, ofs);
        ea = s->A0;
    }

    gen_lea_v_seg(s, ea, mem->def_seg, s->override);
}

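/* Offset of the low (first) element of size OT within an MMXReg.  */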
static inline int mmx_offset(MemOp ot)
{
    switch (ot) {
    case MO_8:
        return offsetof(MMXReg, MMX_B(0));
    case MO_16:
        return offsetof(MMXReg, MMX_W(0));
    case MO_32:
        return offsetof(MMXReg, MMX_L(0));
    case MO_64:
        return offsetof(MMXReg, MMX_Q(0));
    default:
        g_assert_not_reached();
    }
}

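/* Offset of the low (first) element of size OT within a ZMMReg.  */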
static inline int xmm_offset(MemOp ot)
{
    switch (ot) {
    case MO_8:
        return offsetof(ZMMReg, ZMM_B(0));
    case MO_16:
        return offsetof(ZMMReg, ZMM_W(0));
    case MO_32:
        return offsetof(ZMMReg, ZMM_L(0));
    case MO_64:
        return offsetof(ZMMReg, ZMM_Q(0));
    case MO_128:
        return offsetof(ZMMReg, ZMM_X(0));
    case MO_256:
        return offsetof(ZMMReg, ZMM_Y(0));
    default:
        g_assert_not_reached();
    }
}

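/*
 * Return the offset of the vector register itself; op->offset points to
 * its low element of size op->ot.
 */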
static int vector_reg_offset(X86DecodedOp *op)
{
    assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE);

    if (op->unit == X86_OP_MMX) {
        return op->offset - mmx_offset(op->ot);
    } else {
        return op->offset - xmm_offset(op->ot);
    }
}

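/* Return the offset of element N of size OT within OP's vector register.  */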
static int vector_elem_offset(X86DecodedOp *op, MemOp ot, int n)
{
    int base_ofs = vector_reg_offset(op);
    switch (ot) {
    case MO_8:
        if (op->unit == X86_OP_MMX) {
            return base_ofs + offsetof(MMXReg, MMX_B(n));
        } else {
            return base_ofs + offsetof(ZMMReg, ZMM_B(n));
        }
    case MO_16:
        if (op->unit == X86_OP_MMX) {
            return base_ofs + offsetof(MMXReg, MMX_W(n));
        } else {
            return base_ofs + offsetof(ZMMReg, ZMM_W(n));
        }
    case MO_32:
        if (op->unit == X86_OP_MMX) {
            return base_ofs + offsetof(MMXReg, MMX_L(n));
        } else {
            return base_ofs + offsetof(ZMMReg, ZMM_L(n));
        }
    case MO_64:
        if (op->unit == X86_OP_MMX) {
            return base_ofs;
        } else {
            return base_ofs + offsetof(ZMMReg, ZMM_Q(n));
        }
    case MO_128:
        assert(op->unit == X86_OP_SSE);
        return base_ofs + offsetof(ZMMReg, ZMM_X(n));
    case MO_256:
        assert(op->unit == X86_OP_SSE);
        return base_ofs + offsetof(ZMMReg, ZMM_Y(n));
    default:
        g_assert_not_reached();
    }
}

static void compute_mmx_offset(X86DecodedOp *op)
{
    if (!op->has_ea) {
        op->offset = MMX_OFFSET(op->n) + mmx_offset(op->ot);
    } else {
        op->offset = offsetof(CPUX86State, mmx_t0) + mmx_offset(op->ot);
    }
}

static void compute_xmm_offset(X86DecodedOp *op)
{
    if (!op->has_ea) {
        op->offset = ZMM_OFFSET(op->n) + xmm_offset(op->ot);
    } else {
        op->offset = offsetof(CPUX86State, xmm_t0) + xmm_offset(op->ot);
    }
}

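/*
 * Load a value of size OT from the memory address in s->A0 into the
 * CPUX86State field at DEST_OFS.
 */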
static void gen_load_sse(DisasContext *s, TCGv temp, MemOp ot, int dest_ofs, bool aligned)
{
    switch (ot) {
    case MO_8:
        gen_op_ld_v(s, MO_8, temp, s->A0);
        tcg_gen_st8_tl(temp, tcg_env, dest_ofs);
        break;
    case MO_16:
        gen_op_ld_v(s, MO_16, temp, s->A0);
        tcg_gen_st16_tl(temp, tcg_env, dest_ofs);
        break;
    case MO_32:
        gen_op_ld_v(s, MO_32, temp, s->A0);
        tcg_gen_st32_tl(temp, tcg_env, dest_ofs);
        break;
    case MO_64:
        gen_ldq_env_A0(s, dest_ofs);
        break;
    case MO_128:
        gen_ldo_env_A0(s, dest_ofs, aligned);
        break;
    case MO_256:
        gen_ldy_env_A0(s, dest_ofs, aligned);
        break;
    default:
        g_assert_not_reached();
    }
}

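/*
 * Return true if a 16-byte or wider memory operand must be naturally
 * aligned: always for vex_class 1, and for vex_class 2 and 4 only in
 * their legacy (non-VEX) SSE encodings, unless the instruction is
 * explicitly marked as unaligned.
 */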
static bool sse_needs_alignment(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
{
    switch (decode->e.vex_class) {
    case 2:
    case 4:
        if ((s->prefix & PREFIX_VEX) ||
            decode->e.vex_special == X86_VEX_SSEUnaligned) {
            /* MOST legacy SSE instructions require aligned memory operands, but not all.  */
            return false;
        }
        /* fall through */
    case 1:
        return ot >= MO_128;

    default:
        return false;
    }
}

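/*
 * Load operand OPN of DECODE.  Integer, segment, CR/DR and immediate
 * operands are read into V; MMX/SSE operands only have op->offset
 * computed, plus a load into the mmx_t0/xmm_t0 scratch area when the
 * operand is in memory.
 */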
static void gen_load(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
{
    X86DecodedOp *op = &decode->op[opn];

    switch (op->unit) {
    case X86_OP_SKIP:
        return;
    case X86_OP_SEG:
        tcg_gen_ld32u_tl(v, tcg_env,
                         offsetof(CPUX86State, segs[op->n].selector));
        break;
#ifndef CONFIG_USER_ONLY
    case X86_OP_CR:
        if (op->n == 8) {
            translator_io_start(&s->base);
            gen_helper_read_cr8(v, tcg_env);
        } else {
            tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, cr[op->n]));
        }
        break;
    case X86_OP_DR:
        /* CR4.DE tested in the helper.  */
        gen_helper_get_dr(v, tcg_env, tcg_constant_i32(op->n));
        break;
#endif
    case X86_OP_INT:
        if (op->has_ea) {
            if (v == s->T0 && decode->e.special == X86_SPECIAL_SExtT0) {
                gen_op_ld_v(s, op->ot | MO_SIGN, v, s->A0);
            } else {
                gen_op_ld_v(s, op->ot, v, s->A0);
            }

        } else if (op->ot < MO_TL && v == s->T0 &&
                   (decode->e.special == X86_SPECIAL_SExtT0 ||
                    decode->e.special == X86_SPECIAL_ZExtT0)) {
            if (op->ot == MO_8 && byte_reg_is_xH(s, op->n)) {
                if (decode->e.special == X86_SPECIAL_SExtT0) {
                    tcg_gen_sextract_tl(v, cpu_regs[op->n - 4], 8, 8);
                } else {
                    tcg_gen_extract_tl(v, cpu_regs[op->n - 4], 8, 8);
                }
            } else {
                if (decode->e.special == X86_SPECIAL_SExtT0) {
                    tcg_gen_ext_tl(v, cpu_regs[op->n], op->ot | MO_SIGN);
                } else {
                    tcg_gen_ext_tl(v, cpu_regs[op->n], op->ot);
                }
            }

        } else {
            gen_op_mov_v_reg(s, op->ot, v, op->n);
        }
        break;
    case X86_OP_IMM:
        tcg_gen_movi_tl(v, op->imm);
        break;

    case X86_OP_MMX:
        compute_mmx_offset(op);
        goto load_vector;

    case X86_OP_SSE:
        compute_xmm_offset(op);
    load_vector:
        if (op->has_ea) {
            bool aligned = sse_needs_alignment(s, decode, op->ot);
            gen_load_sse(s, v, op->ot, op->offset, aligned);
        }
        break;

    default:
        g_assert_not_reached();
    }
}

static TCGv_ptr op_ptr(X86DecodedInsn *decode, int opn)
{
    X86DecodedOp *op = &decode->op[opn];

    assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE);
    if (op->v_ptr) {
        return op->v_ptr;
    }
    op->v_ptr = tcg_temp_new_ptr();

    /* The temporary points to the MMXReg or ZMMReg.  */
    tcg_gen_addi_ptr(op->v_ptr, tcg_env, vector_reg_offset(op));
    return op->v_ptr;
}

#define OP_PTR0 op_ptr(decode, 0)
#define OP_PTR1 op_ptr(decode, 1)
#define OP_PTR2 op_ptr(decode, 2)

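/*
 * Write back operand OPN of DECODE from V.  MMX/SSE register results have
 * already been written in place through op->offset; a VEX-encoded write to
 * an XMM register additionally zeroes the high half of the corresponding
 * YMM register.  The operand is then marked X86_OP_SKIP so that it is not
 * written back a second time.
 */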
static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
{
    X86DecodedOp *op = &decode->op[opn];
    switch (op->unit) {
    case X86_OP_SKIP:
        break;
    case X86_OP_SEG:
        /* Note that gen_movl_seg takes care of interrupt shadow and TF.  */
        gen_movl_seg(s, op->n, s->T0);
        break;
    case X86_OP_INT:
        if (op->has_ea) {
            gen_op_st_v(s, op->ot, v, s->A0);
        } else {
            gen_op_mov_reg_v(s, op->ot, op->n, v);
        }
        break;
    case X86_OP_MMX:
        break;
    case X86_OP_SSE:
        if (!op->has_ea && (s->prefix & PREFIX_VEX) && op->ot <= MO_128) {
            tcg_gen_gvec_dup_imm(MO_64,
                                 offsetof(CPUX86State, xmm_regs[op->n].ZMM_X(1)),
                                 16, 16, 0);
        }
        break;
#ifndef CONFIG_USER_ONLY
    case X86_OP_CR:
        if (op->n == 8) {
            translator_io_start(&s->base);
        }
        gen_helper_write_crN(tcg_env, tcg_constant_i32(op->n), v);
        s->base.is_jmp = DISAS_EOB_NEXT;
        break;
    case X86_OP_DR:
        /* CR4.DE tested in the helper.  */
        gen_helper_set_dr(tcg_env, tcg_constant_i32(op->n), v);
        s->base.is_jmp = DISAS_EOB_NEXT;
        break;
#endif
    default:
        g_assert_not_reached();
    }
    op->unit = X86_OP_SKIP;
}

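/*
 * Operand width in bytes: 8 for the MMX form of an X86_SPECIAL_MMX
 * instruction, otherwise 16 or 32 according to VEX.L.
 */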
static inline int vector_len(DisasContext *s, X86DecodedInsn *decode)
{
    if (decode->e.special == X86_SPECIAL_MMX &&
        !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
        return 8;
    }
    return s->vex_l ? 32 : 16;
}

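/*
 * The prepare_update*_cc functions set up decode->cc_* so that the flags
 * are computed lazily, only when a later instruction actually reads them.
 */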
static void prepare_update1_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op)
{
    decode->cc_dst = s->T0;
    decode->cc_op = op;
}

static void prepare_update2_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op)
{
    decode->cc_src = s->T1;
    decode->cc_dst = s->T0;
    decode->cc_op = op;
}

static void prepare_update_cc_incdec(X86DecodedInsn *decode, DisasContext *s, CCOp op)
{
    gen_compute_eflags_c(s, s->T1);
    prepare_update2_cc(decode, s, op);
}

static void prepare_update3_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op, TCGv reg)
{
    decode->cc_src2 = reg;
    decode->cc_src = s->T1;
    decode->cc_dst = s->T0;
    decode->cc_op = op;
}

/* Set up decode->cc_* to modify CF while keeping other flags unchanged.  */
static void prepare_update_cf(X86DecodedInsn *decode, DisasContext *s, TCGv cf)
{
    switch (s->cc_op) {
    case CC_OP_ADOX:
    case CC_OP_ADCOX:
        decode->cc_src2 = cpu_cc_src2;
        decode->cc_src = cpu_cc_src;
        decode->cc_op = CC_OP_ADCOX;
        break;

    case CC_OP_EFLAGS:
    case CC_OP_ADCX:
        decode->cc_src = cpu_cc_src;
        decode->cc_op = CC_OP_ADCX;
        break;

    default:
        decode->cc_src = tcg_temp_new();
        gen_mov_eflags(s, decode->cc_src);
        decode->cc_op = CC_OP_ADCX;
        break;
    }
    decode->cc_dst = cf;
}

static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs)
{
    MemOp ot = decode->op[0].ot;
    int vec_len = vector_len(s, decode);
    bool aligned = sse_needs_alignment(s, decode, ot);

    if (!decode->op[0].has_ea) {
        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, vec_len, vec_len);
        return;
    }

    switch (ot) {
    case MO_64:
        gen_stq_env_A0(s, src_ofs);
        break;
    case MO_128:
        gen_sto_env_A0(s, src_ofs, aligned);
        break;
    case MO_256:
        gen_sty_env_A0(s, src_ofs, aligned);
        break;
    default:
        g_assert_not_reached();
    }
}

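/* 3DNow! PAVGUSB has the same semantics as MMX PAVGB.  */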
static void gen_helper_pavgusb(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b)
{
    gen_helper_pavgb_mmx(env, reg_a, reg_a, reg_b);
}

#define FN_3DNOW_MOVE ((SSEFunc_0_epp) (uintptr_t) 1)
static const SSEFunc_0_epp fns_3dnow[] = {
    [0x0c] = gen_helper_pi2fw,
    [0x0d] = gen_helper_pi2fd,
    [0x1c] = gen_helper_pf2iw,
    [0x1d] = gen_helper_pf2id,
    [0x8a] = gen_helper_pfnacc,
    [0x8e] = gen_helper_pfpnacc,
    [0x90] = gen_helper_pfcmpge,
    [0x94] = gen_helper_pfmin,
    [0x96] = gen_helper_pfrcp,
    [0x97] = gen_helper_pfrsqrt,
    [0x9a] = gen_helper_pfsub,
    [0x9e] = gen_helper_pfadd,
    [0xa0] = gen_helper_pfcmpgt,
    [0xa4] = gen_helper_pfmax,
    [0xa6] = FN_3DNOW_MOVE, /* PFRCPIT1; no need to actually increase precision */
    [0xa7] = FN_3DNOW_MOVE, /* PFRSQIT1 */
    [0xb6] = FN_3DNOW_MOVE, /* PFRCPIT2 */
    [0xaa] = gen_helper_pfsubr,
    [0xae] = gen_helper_pfacc,
    [0xb0] = gen_helper_pfcmpeq,
    [0xb4] = gen_helper_pfmul,
    [0xb7] = gen_helper_pmulhrw_mmx,
    [0xbb] = gen_helper_pswapd,
    [0xbf] = gen_helper_pavgusb,
};

static void gen_3dnow(DisasContext *s, X86DecodedInsn *decode)
{
    uint8_t b = decode->immediate;
    SSEFunc_0_epp fn = b < ARRAY_SIZE(fns_3dnow) ? fns_3dnow[b] : NULL;

    if (!fn) {
        gen_illegal_opcode(s);
        return;
    }
    if (s->flags & HF_TS_MASK) {
        gen_NM_exception(s);
        return;
    }
    if (s->flags & HF_EM_MASK) {
        gen_illegal_opcode(s);
        return;
    }

    gen_helper_enter_mmx(tcg_env);
    if (fn == FN_3DNOW_MOVE) {
        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset);
        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset);
    } else {
        fn(tcg_env, OP_PTR0, OP_PTR1);
    }
}

/*
 * 00 = v*ps Vps, Hps, Wps
 * 66 = v*pd Vpd, Hpd, Wpd
 * f3 = v*ss Vss, Hss, Wss
 * f2 = v*sd Vsd, Hsd, Wsd
 */
static inline void gen_unary_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                              SSEFunc_0_epp pd_xmm, SSEFunc_0_epp ps_xmm,
                              SSEFunc_0_epp pd_ymm, SSEFunc_0_epp ps_ymm,
                              SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
{
    if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
        SSEFunc_0_eppp fn = s->prefix & PREFIX_REPZ ? ss : sd;
        if (!fn) {
            gen_illegal_opcode(s);
            return;
        }
        fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
    } else {
        SSEFunc_0_epp ps, pd, fn;
        ps = s->vex_l ? ps_ymm : ps_xmm;
        pd = s->vex_l ? pd_ymm : pd_xmm;
        fn = s->prefix & PREFIX_DATA ? pd : ps;
        if (!fn) {
            gen_illegal_opcode(s);
            return;
        }
        fn(tcg_env, OP_PTR0, OP_PTR2);
    }
}
#define UNARY_FP_SSE(uname, lname)                                                 \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_unary_fp_sse(s, decode,                                                    \
                     gen_helper_##lname##pd_xmm,                                   \
                     gen_helper_##lname##ps_xmm,                                   \
                     gen_helper_##lname##pd_ymm,                                   \
                     gen_helper_##lname##ps_ymm,                                   \
                     gen_helper_##lname##sd,                                       \
                     gen_helper_##lname##ss);                                      \
}
UNARY_FP_SSE(VSQRT, sqrt)

/*
 * 00 = v*ps Vps, Hps, Wps
 * 66 = v*pd Vpd, Hpd, Wpd
 * f3 = v*ss Vss, Hss, Wss
 * f2 = v*sd Vsd, Hsd, Wsd
 */
static inline void gen_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                              SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
                              SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm,
                              SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
{
    SSEFunc_0_eppp ps, pd, fn;
    if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
        fn = s->prefix & PREFIX_REPZ ? ss : sd;
    } else {
        ps = s->vex_l ? ps_ymm : ps_xmm;
        pd = s->vex_l ? pd_ymm : pd_xmm;
        fn = s->prefix & PREFIX_DATA ? pd : ps;
    }
    if (fn) {
        fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
    } else {
        gen_illegal_opcode(s);
    }
}

#define FP_SSE(uname, lname)                                                       \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_fp_sse(s, decode,                                                          \
               gen_helper_##lname##pd_xmm,                                         \
               gen_helper_##lname##ps_xmm,                                         \
               gen_helper_##lname##pd_ymm,                                         \
               gen_helper_##lname##ps_ymm,                                         \
               gen_helper_##lname##sd,                                             \
               gen_helper_##lname##ss);                                            \
}
FP_SSE(VADD, add)
FP_SSE(VMUL, mul)
FP_SSE(VSUB, sub)
FP_SSE(VMIN, min)
FP_SSE(VDIV, div)
FP_SSE(VMAX, max)

#define FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, even, odd)                         \
static void gen_##uname##Px(DisasContext *s, X86DecodedInsn *decode)               \
{                                                                                  \
    SSEFunc_0_eppppii xmm = s->vex_w ? gen_helper_fma4pd_xmm : gen_helper_fma4ps_xmm; \
    SSEFunc_0_eppppii ymm = s->vex_w ? gen_helper_fma4pd_ymm : gen_helper_fma4ps_ymm; \
    SSEFunc_0_eppppii fn = s->vex_l ? ymm : xmm;                                   \
                                                                                   \
    fn(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                         \
       tcg_constant_i32(even),                                                     \
       tcg_constant_i32((even) ^ (odd)));                                          \
}

#define FMA_SSE(uname, ptr0, ptr1, ptr2, flags)                                    \
FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, flags, flags)                              \
static void gen_##uname##Sx(DisasContext *s, X86DecodedInsn *decode)               \
{                                                                                  \
    SSEFunc_0_eppppi fn = s->vex_w ? gen_helper_fma4sd : gen_helper_fma4ss;        \
                                                                                   \
    fn(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                         \
       tcg_constant_i32(flags));                                                   \
}                                                                                  \

FMA_SSE(VFMADD231,  OP_PTR1, OP_PTR2, OP_PTR0, 0)
FMA_SSE(VFMADD213,  OP_PTR1, OP_PTR0, OP_PTR2, 0)
FMA_SSE(VFMADD132,  OP_PTR0, OP_PTR2, OP_PTR1, 0)

FMA_SSE(VFNMADD231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_product)
FMA_SSE(VFNMADD213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_product)
FMA_SSE(VFNMADD132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_product)

FMA_SSE(VFMSUB231,  OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c)
FMA_SSE(VFMSUB213,  OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c)
FMA_SSE(VFMSUB132,  OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c)

FMA_SSE(VFNMSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c|float_muladd_negate_product)

FMA_SSE_PACKED(VFMADDSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c, 0)

FMA_SSE_PACKED(VFMSUBADD231, OP_PTR1, OP_PTR2, OP_PTR0, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD213, OP_PTR1, OP_PTR0, OP_PTR2, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0, float_muladd_negate_c)

#define FP_UNPACK_SSE(uname, lname)                                                \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    /* PS maps to the DQ integer instruction, PD maps to QDQ.  */                  \
    gen_fp_sse(s, decode,                                                          \
               gen_helper_##lname##qdq_xmm,                                        \
               gen_helper_##lname##dq_xmm,                                         \
               gen_helper_##lname##qdq_ymm,                                        \
               gen_helper_##lname##dq_ymm,                                         \
               NULL, NULL);                                                        \
}
FP_UNPACK_SSE(VUNPCKLPx, punpckl)
FP_UNPACK_SSE(VUNPCKHPx, punpckh)

/*
 * 00 = v*ps Vps, Wps
 * f3 = v*ss Vss, Hss, Wss
 */
static inline void gen_unary_fp32_sse(DisasContext *s, X86DecodedInsn *decode,
                                      SSEFunc_0_epp ps_xmm,
                                      SSEFunc_0_epp ps_ymm,
                                      SSEFunc_0_eppp ss)
{
    if ((s->prefix & (PREFIX_DATA | PREFIX_REPNZ)) != 0) {
        goto illegal_op;
    } else if (s->prefix & PREFIX_REPZ) {
        if (!ss) {
            goto illegal_op;
        }
        ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
    } else {
        SSEFunc_0_epp fn = s->vex_l ? ps_ymm : ps_xmm;
        if (!fn) {
            goto illegal_op;
        }
        fn(tcg_env, OP_PTR0, OP_PTR2);
    }
    return;

illegal_op:
    gen_illegal_opcode(s);
}
#define UNARY_FP32_SSE(uname, lname)                                               \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_unary_fp32_sse(s, decode,                                                  \
                       gen_helper_##lname##ps_xmm,                                 \
                       gen_helper_##lname##ps_ymm,                                 \
                       gen_helper_##lname##ss);                                    \
}
UNARY_FP32_SSE(VRSQRT, rsqrt)
UNARY_FP32_SSE(VRCP, rcp)

/*
 * 66 = v*pd Vpd, Hpd, Wpd
 * f2 = v*ps Vps, Hps, Wps
 */
static inline void gen_horizontal_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                                         SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
                                         SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm)
{
    SSEFunc_0_eppp ps, pd, fn;
    ps = s->vex_l ? ps_ymm : ps_xmm;
    pd = s->vex_l ? pd_ymm : pd_xmm;
    fn = s->prefix & PREFIX_DATA ? pd : ps;
    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
}
#define HORIZONTAL_FP_SSE(uname, lname)                                            \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_horizontal_fp_sse(s, decode,                                               \
                          gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm,  \
                          gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \
}
HORIZONTAL_FP_SSE(VHADD, hadd)
HORIZONTAL_FP_SSE(VHSUB, hsub)
HORIZONTAL_FP_SSE(VADDSUB, addsub)

static inline void gen_ternary_sse(DisasContext *s, X86DecodedInsn *decode,
                                   int op3, SSEFunc_0_epppp xmm, SSEFunc_0_epppp ymm)
{
    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;
    TCGv_ptr ptr3 = tcg_temp_new_ptr();

    /* The format of the fourth input is Lx */
    tcg_gen_addi_ptr(ptr3, tcg_env, ZMM_OFFSET(op3));
    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, ptr3);
}
#define TERNARY_SSE(uname, uvname, lname)                                          \
static void gen_##uvname(DisasContext *s, X86DecodedInsn *decode)                  \
{                                                                                  \
    gen_ternary_sse(s, decode, (uint8_t)decode->immediate >> 4,                    \
                    gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);           \
}                                                                                  \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_ternary_sse(s, decode, 0,                                                  \
                    gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);           \
}
TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps)
TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd)
TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb)

static inline void gen_binary_imm_sse(DisasContext *s, X86DecodedInsn *decode,
                                      SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm)
{
    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
    if (!s->vex_l) {
        xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
    } else {
        ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
    }
}

#define BINARY_IMM_SSE(uname, lname)                                               \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_binary_imm_sse(s, decode,                                                  \
                       gen_helper_##lname##_xmm,                                   \
                       gen_helper_##lname##_ymm);                                  \
}

BINARY_IMM_SSE(VBLENDPD,   blendpd)
BINARY_IMM_SSE(VBLENDPS,   blendps)
BINARY_IMM_SSE(VPBLENDW,   pblendw)
BINARY_IMM_SSE(VDDPS,      dpps)
#define gen_helper_dppd_ymm NULL
BINARY_IMM_SSE(VDDPD,      dppd)
BINARY_IMM_SSE(VMPSADBW,   mpsadbw)
BINARY_IMM_SSE(PCLMULQDQ,  pclmulqdq)


#define UNARY_INT_GVEC(uname, func, ...)                                           \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    int vec_len = vector_len(s, decode);                                           \
                                                                                   \
    func(__VA_ARGS__, decode->op[0].offset,                                        \
         decode->op[2].offset, vec_len, vec_len);                                  \
}
UNARY_INT_GVEC(PABSB,          tcg_gen_gvec_abs, MO_8)
UNARY_INT_GVEC(PABSW,          tcg_gen_gvec_abs, MO_16)
UNARY_INT_GVEC(PABSD,          tcg_gen_gvec_abs, MO_32)
UNARY_INT_GVEC(VBROADCASTx128, tcg_gen_gvec_dup_mem, MO_128)
UNARY_INT_GVEC(VPBROADCASTB,   tcg_gen_gvec_dup_mem, MO_8)
UNARY_INT_GVEC(VPBROADCASTW,   tcg_gen_gvec_dup_mem, MO_16)
UNARY_INT_GVEC(VPBROADCASTD,   tcg_gen_gvec_dup_mem, MO_32)
UNARY_INT_GVEC(VPBROADCASTQ,   tcg_gen_gvec_dup_mem, MO_64)


#define BINARY_INT_GVEC(uname, func, ...)                                          \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    int vec_len = vector_len(s, decode);                                           \
                                                                                   \
    func(__VA_ARGS__,                                                              \
         decode->op[0].offset, decode->op[1].offset,                               \
         decode->op[2].offset, vec_len, vec_len);                                  \
}

BINARY_INT_GVEC(PADDB,   tcg_gen_gvec_add, MO_8)
BINARY_INT_GVEC(PADDW,   tcg_gen_gvec_add, MO_16)
BINARY_INT_GVEC(PADDD,   tcg_gen_gvec_add, MO_32)
BINARY_INT_GVEC(PADDQ,   tcg_gen_gvec_add, MO_64)
BINARY_INT_GVEC(PADDSB,  tcg_gen_gvec_ssadd, MO_8)
BINARY_INT_GVEC(PADDSW,  tcg_gen_gvec_ssadd, MO_16)
BINARY_INT_GVEC(PADDUSB, tcg_gen_gvec_usadd, MO_8)
BINARY_INT_GVEC(PADDUSW, tcg_gen_gvec_usadd, MO_16)
BINARY_INT_GVEC(PAND,    tcg_gen_gvec_and, MO_64)
BINARY_INT_GVEC(PCMPEQB, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_8)
BINARY_INT_GVEC(PCMPEQD, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_32)
BINARY_INT_GVEC(PCMPEQW, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_16)
BINARY_INT_GVEC(PCMPEQQ, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_64)
BINARY_INT_GVEC(PCMPGTB, tcg_gen_gvec_cmp, TCG_COND_GT, MO_8)
BINARY_INT_GVEC(PCMPGTW, tcg_gen_gvec_cmp, TCG_COND_GT, MO_16)
BINARY_INT_GVEC(PCMPGTD, tcg_gen_gvec_cmp, TCG_COND_GT, MO_32)
BINARY_INT_GVEC(PCMPGTQ, tcg_gen_gvec_cmp, TCG_COND_GT, MO_64)
BINARY_INT_GVEC(PMAXSB,  tcg_gen_gvec_smax, MO_8)
BINARY_INT_GVEC(PMAXSW,  tcg_gen_gvec_smax, MO_16)
BINARY_INT_GVEC(PMAXSD,  tcg_gen_gvec_smax, MO_32)
BINARY_INT_GVEC(PMAXUB,  tcg_gen_gvec_umax, MO_8)
BINARY_INT_GVEC(PMAXUW,  tcg_gen_gvec_umax, MO_16)
BINARY_INT_GVEC(PMAXUD,  tcg_gen_gvec_umax, MO_32)
BINARY_INT_GVEC(PMINSB,  tcg_gen_gvec_smin, MO_8)
BINARY_INT_GVEC(PMINSW,  tcg_gen_gvec_smin, MO_16)
BINARY_INT_GVEC(PMINSD,  tcg_gen_gvec_smin, MO_32)
BINARY_INT_GVEC(PMINUB,  tcg_gen_gvec_umin, MO_8)
BINARY_INT_GVEC(PMINUW,  tcg_gen_gvec_umin, MO_16)
BINARY_INT_GVEC(PMINUD,  tcg_gen_gvec_umin, MO_32)
BINARY_INT_GVEC(PMULLW,  tcg_gen_gvec_mul, MO_16)
BINARY_INT_GVEC(PMULLD,  tcg_gen_gvec_mul, MO_32)
BINARY_INT_GVEC(POR,     tcg_gen_gvec_or, MO_64)
BINARY_INT_GVEC(PSUBB,   tcg_gen_gvec_sub, MO_8)
BINARY_INT_GVEC(PSUBW,   tcg_gen_gvec_sub, MO_16)
BINARY_INT_GVEC(PSUBD,   tcg_gen_gvec_sub, MO_32)
BINARY_INT_GVEC(PSUBQ,   tcg_gen_gvec_sub, MO_64)
BINARY_INT_GVEC(PSUBSB,  tcg_gen_gvec_sssub, MO_8)
BINARY_INT_GVEC(PSUBSW,  tcg_gen_gvec_sssub, MO_16)
BINARY_INT_GVEC(PSUBUSB, tcg_gen_gvec_ussub, MO_8)
BINARY_INT_GVEC(PSUBUSW, tcg_gen_gvec_ussub, MO_16)
BINARY_INT_GVEC(PXOR,    tcg_gen_gvec_xor, MO_64)


/*
 * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
 * 66 = vp* Vx, Hx, Wx
 *
 * These are really the same encoding, because (1) V is the same as P when
 * VEX.V is not present, and (2) P and Q are the same as H and W apart from
 * MM/XMM.
 */
static inline void gen_binary_int_sse(DisasContext *s, X86DecodedInsn *decode,
                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
{
    assert(!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));

    if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
        /* VEX encoding is not applicable to MMX instructions.  */
        gen_illegal_opcode(s);
        return;
    }
    if (!(s->prefix & PREFIX_DATA)) {
        mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
    } else if (!s->vex_l) {
        xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
    } else {
        ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
    }
}


#define BINARY_INT_MMX(uname, lname)                                               \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_binary_int_sse(s, decode,                                                  \
                          gen_helper_##lname##_mmx,                                \
                          gen_helper_##lname##_xmm,                                \
                          gen_helper_##lname##_ymm);                               \
}
BINARY_INT_MMX(PUNPCKLBW,  punpcklbw)
BINARY_INT_MMX(PUNPCKLWD,  punpcklwd)
BINARY_INT_MMX(PUNPCKLDQ,  punpckldq)
BINARY_INT_MMX(PACKSSWB,   packsswb)
BINARY_INT_MMX(PACKUSWB,   packuswb)
BINARY_INT_MMX(PUNPCKHBW,  punpckhbw)
BINARY_INT_MMX(PUNPCKHWD,  punpckhwd)
BINARY_INT_MMX(PUNPCKHDQ,  punpckhdq)
BINARY_INT_MMX(PACKSSDW,   packssdw)

BINARY_INT_MMX(PAVGB,   pavgb)
BINARY_INT_MMX(PAVGW,   pavgw)
BINARY_INT_MMX(PMADDWD, pmaddwd)
BINARY_INT_MMX(PMULHUW, pmulhuw)
BINARY_INT_MMX(PMULHW,  pmulhw)
BINARY_INT_MMX(PMULUDQ, pmuludq)
BINARY_INT_MMX(PSADBW,  psadbw)

BINARY_INT_MMX(PSLLW_r, psllw)
BINARY_INT_MMX(PSLLD_r, pslld)
BINARY_INT_MMX(PSLLQ_r, psllq)
BINARY_INT_MMX(PSRLW_r, psrlw)
BINARY_INT_MMX(PSRLD_r, psrld)
BINARY_INT_MMX(PSRLQ_r, psrlq)
BINARY_INT_MMX(PSRAW_r, psraw)
BINARY_INT_MMX(PSRAD_r, psrad)

BINARY_INT_MMX(PHADDW,    phaddw)
BINARY_INT_MMX(PHADDSW,   phaddsw)
BINARY_INT_MMX(PHADDD,    phaddd)
BINARY_INT_MMX(PHSUBW,    phsubw)
BINARY_INT_MMX(PHSUBSW,   phsubsw)
BINARY_INT_MMX(PHSUBD,    phsubd)
BINARY_INT_MMX(PMADDUBSW, pmaddubsw)
BINARY_INT_MMX(PSHUFB,    pshufb)
BINARY_INT_MMX(PSIGNB,    psignb)
BINARY_INT_MMX(PSIGNW,    psignw)
BINARY_INT_MMX(PSIGND,    psignd)
BINARY_INT_MMX(PMULHRSW,  pmulhrsw)

/* Instructions with no MMX equivalent.  */
#define BINARY_INT_SSE(uname, lname)                                               \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_binary_int_sse(s, decode,                                                  \
                          NULL,                                                    \
                          gen_helper_##lname##_xmm,                                \
                          gen_helper_##lname##_ymm);                               \
}

BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
BINARY_INT_SSE(VPACKUSDW,  packusdw)
BINARY_INT_SSE(VPERMILPS,  vpermilps)
BINARY_INT_SSE(VPERMILPD,  vpermilpd)
BINARY_INT_SSE(VMASKMOVPS, vpmaskmovd)
BINARY_INT_SSE(VMASKMOVPD, vpmaskmovq)

BINARY_INT_SSE(PMULDQ,    pmuldq)

BINARY_INT_SSE(VAESDEC, aesdec)
BINARY_INT_SSE(VAESDECLAST, aesdeclast)
BINARY_INT_SSE(VAESENC, aesenc)
BINARY_INT_SSE(VAESENCLAST, aesenclast)

#define UNARY_CMP_SSE(uname, lname)                                                \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    if (!s->vex_l) {                                                               \
        gen_helper_##lname##_xmm(tcg_env, OP_PTR1, OP_PTR2);                       \
    } else {                                                                       \
        gen_helper_##lname##_ymm(tcg_env, OP_PTR1, OP_PTR2);                       \
    }                                                                              \
    assume_cc_op(s, CC_OP_EFLAGS);                                                 \
}
UNARY_CMP_SSE(VPTEST,     ptest)
UNARY_CMP_SSE(VTESTPS,    vtestps)
UNARY_CMP_SSE(VTESTPD,    vtestpd)

static inline void gen_unary_int_sse(DisasContext *s, X86DecodedInsn *decode,
                                     SSEFunc_0_epp xmm, SSEFunc_0_epp ymm)
{
    if (!s->vex_l) {
        xmm(tcg_env, OP_PTR0, OP_PTR2);
    } else {
        ymm(tcg_env, OP_PTR0, OP_PTR2);
    }
}

#define UNARY_INT_SSE(uname, lname)                                                \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_unary_int_sse(s, decode,                                                   \
                      gen_helper_##lname##_xmm,                                    \
                      gen_helper_##lname##_ymm);                                   \
}

UNARY_INT_SSE(VPMOVSXBW,    pmovsxbw)
UNARY_INT_SSE(VPMOVSXBD,    pmovsxbd)
UNARY_INT_SSE(VPMOVSXBQ,    pmovsxbq)
UNARY_INT_SSE(VPMOVSXWD,    pmovsxwd)
UNARY_INT_SSE(VPMOVSXWQ,    pmovsxwq)
UNARY_INT_SSE(VPMOVSXDQ,    pmovsxdq)

UNARY_INT_SSE(VPMOVZXBW,    pmovzxbw)
UNARY_INT_SSE(VPMOVZXBD,    pmovzxbd)
UNARY_INT_SSE(VPMOVZXBQ,    pmovzxbq)
UNARY_INT_SSE(VPMOVZXWD,    pmovzxwd)
UNARY_INT_SSE(VPMOVZXWQ,    pmovzxwq)
UNARY_INT_SSE(VPMOVZXDQ,    pmovzxdq)

UNARY_INT_SSE(VMOVSLDUP,    pmovsldup)
UNARY_INT_SSE(VMOVSHDUP,    pmovshdup)
UNARY_INT_SSE(VMOVDDUP,     pmovdldup)

UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
UNARY_INT_SSE(VCVTDQ2PS, cvtdq2ps)
UNARY_INT_SSE(VCVTPS2DQ, cvtps2dq)
UNARY_INT_SSE(VCVTTPS2DQ, cvttps2dq)
UNARY_INT_SSE(VCVTPH2PS, cvtph2ps)


static inline void gen_unary_imm_sse(DisasContext *s, X86DecodedInsn *decode,
                                     SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
{
    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
    if (!s->vex_l) {
        xmm(OP_PTR0, OP_PTR1, imm);
    } else {
        ymm(OP_PTR0, OP_PTR1, imm);
    }
}

#define UNARY_IMM_SSE(uname, lname)                                                \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_unary_imm_sse(s, decode,                                                   \
                      gen_helper_##lname##_xmm,                                    \
                      gen_helper_##lname##_ymm);                                   \
}

UNARY_IMM_SSE(PSHUFD,     pshufd)
UNARY_IMM_SSE(PSHUFHW,    pshufhw)
UNARY_IMM_SSE(PSHUFLW,    pshuflw)
#define gen_helper_vpermq_xmm NULL
UNARY_IMM_SSE(VPERMQ,      vpermq)
UNARY_IMM_SSE(VPERMILPS_i, vpermilps_imm)
UNARY_IMM_SSE(VPERMILPD_i, vpermilpd_imm)

static inline void gen_unary_imm_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                                        SSEFunc_0_eppi xmm, SSEFunc_0_eppi ymm)
{
    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
    if (!s->vex_l) {
        xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
    } else {
        ymm(tcg_env, OP_PTR0, OP_PTR1, imm);
    }
}

#define UNARY_IMM_FP_SSE(uname, lname)                                             \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_unary_imm_fp_sse(s, decode,                                                \
                         gen_helper_##lname##_xmm,                                 \
                         gen_helper_##lname##_ymm);                                \
}

UNARY_IMM_FP_SSE(VROUNDPS,    roundps)
UNARY_IMM_FP_SSE(VROUNDPD,    roundpd)

static inline void gen_vexw_avx(DisasContext *s, X86DecodedInsn *decode,
                                SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm,
                                SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm)
{
    SSEFunc_0_eppp d = s->vex_l ? d_ymm : d_xmm;
    SSEFunc_0_eppp q = s->vex_l ? q_ymm : q_xmm;
    SSEFunc_0_eppp fn = s->vex_w ? q : d;
    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
}

/* VEX.W affects whether to operate on 32- or 64-bit elements.  */
#define VEXW_AVX(uname, lname)                                                     \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_vexw_avx(s, decode,                                                        \
                 gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
                 gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
}
VEXW_AVX(VPSLLV,    vpsllv)
VEXW_AVX(VPSRLV,    vpsrlv)
VEXW_AVX(VPSRAV,    vpsrav)
VEXW_AVX(VPMASKMOV, vpmaskmov)

/* Same as above, but with extra arguments to the helper.  */
static inline void gen_vsib_avx(DisasContext *s, X86DecodedInsn *decode,
                                SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
                                SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
{
    SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm;
    SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm;
    SSEFunc_0_epppti fn = s->vex_w ? q : d;
    TCGv_i32 scale = tcg_constant_i32(decode->mem.scale);
    TCGv_ptr index = tcg_temp_new_ptr();

    /* Pass third input as (index, base, scale) */
    tcg_gen_addi_ptr(index, tcg_env, ZMM_OFFSET(decode->mem.index));
    fn(tcg_env, OP_PTR0, OP_PTR1, index, s->A0, scale);

    /*
     * There are two output operands, so zero OP1's high 128 bits
     * in the VEX.128 case.
     */
    if (!s->vex_l) {
        int ymmh_ofs = vector_elem_offset(&decode->op[1], MO_128, 1);
        tcg_gen_gvec_dup_imm(MO_64, ymmh_ofs, 16, 16, 0);
    }
}
#define VSIB_AVX(uname, lname)                                                     \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_vsib_avx(s, decode,                                                        \
                 gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
                 gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
}
VSIB_AVX(VPGATHERD, vpgatherd)
VSIB_AVX(VPGATHERQ, vpgatherq)

static void gen_AAA(DisasContext *s, X86DecodedInsn *decode)
{
    gen_update_cc_op(s);
    gen_helper_aaa(tcg_env);
    assume_cc_op(s, CC_OP_EFLAGS);
}

static void gen_AAD(DisasContext *s, X86DecodedInsn *decode)
{
    gen_helper_aad(s->T0, s->T0, s->T1);
    prepare_update1_cc(decode, s, CC_OP_LOGICB);
}

static void gen_AAM(DisasContext *s, X86DecodedInsn *decode)
{
    if (decode->immediate == 0) {
        gen_exception(s, EXCP00_DIVZ);
    } else {
        gen_helper_aam(s->T0, s->T0, s->T1);
        prepare_update1_cc(decode, s, CC_OP_LOGICB);
    }
}

static void gen_AAS(DisasContext *s, X86DecodedInsn *decode)
{
    gen_update_cc_op(s);
    gen_helper_aas(tcg_env);
    assume_cc_op(s, CC_OP_EFLAGS);
}

static void gen_ADD(DisasContext *s, X86DecodedInsn *decode);
static void gen_ADC(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[1].ot;
    TCGv c_in;

    /*
     * Try to avoid CC_OP_ADC by transforming as follows:
     * CC_ADC: src1 = dst + c_in, src2 = 0, src3 = c_in
     * CC_ADD: src1 = dst + c_in, src2 = c_in (no src3)
     *
     * In general src2 vs. src3 matters when computing AF and OF, but not here:
     * - AF is bit 4 of dst^src1^src2, which is bit 4 of dst^src1 in both cases
     * - OF is a function of the two MSBs, and in both cases they are zero for src2
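     *
     * For example, "adc $0, %eax" is thus compiled as an ADD of the carry
     * bit, using CC_OP_ADDL rather than CC_OP_ADCL.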
     */
    if (decode->e.op2 == X86_TYPE_I && decode->immediate == 0) {
        gen_compute_eflags_c(s, s->T1);
        gen_ADD(s, decode);
        return;
    }

    c_in = tcg_temp_new();
    gen_compute_eflags_c(s, c_in);
    if (s->prefix & PREFIX_LOCK) {
        tcg_gen_add_tl(s->T0, c_in, s->T1);
        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T0,
                                    s->mem_index, ot | MO_LE);
    } else {
        tcg_gen_add_tl(s->T0, s->T0, s->T1);
        tcg_gen_add_tl(s->T0, s->T0, c_in);
    }
    prepare_update3_cc(decode, s, CC_OP_ADCB + ot, c_in);
}

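/*
 * Common code for ADCX and ADOX, which differ only in which carry bit
 * (CF or OF) they consume and produce, leaving all other flags intact.
 */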
static void gen_ADCOX(DisasContext *s, X86DecodedInsn *decode, int cc_op)
{
    MemOp ot = decode->op[0].ot;
    TCGv carry_in = NULL;
    TCGv *carry_out = (cc_op == CC_OP_ADCX ? &decode->cc_dst : &decode->cc_src2);
    TCGv zero;

    decode->cc_op = cc_op;
    *carry_out = tcg_temp_new();
    if (CC_OP_HAS_EFLAGS(s->cc_op)) {
        decode->cc_src = cpu_cc_src;

        /* Re-use the carry-out from a previous round?  */
        if (s->cc_op == cc_op || s->cc_op == CC_OP_ADCOX) {
            carry_in = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2);
        }

        /* Preserve the opposite carry from previous rounds?  */
        if (s->cc_op != cc_op && s->cc_op != CC_OP_EFLAGS) {
            decode->cc_op = CC_OP_ADCOX;
            if (carry_out == &decode->cc_dst) {
                decode->cc_src2 = cpu_cc_src2;
            } else {
                decode->cc_dst = cpu_cc_dst;
            }
        }
    } else {
        decode->cc_src = tcg_temp_new();
        gen_mov_eflags(s, decode->cc_src);
    }

    if (!carry_in) {
        /* Get carry_in out of EFLAGS.  */
        carry_in = tcg_temp_new();
        tcg_gen_extract_tl(carry_in, decode->cc_src,
            ctz32(cc_op == CC_OP_ADCX ? CC_C : CC_O), 1);
    }

    switch (ot) {
#ifdef TARGET_X86_64
    case MO_32:
        /* If TL is 64-bit just do everything in 64-bit arithmetic.  */
        tcg_gen_ext32u_tl(s->T0, s->T0);
        tcg_gen_ext32u_tl(s->T1, s->T1);
        tcg_gen_add_i64(s->T0, s->T0, s->T1);
        tcg_gen_add_i64(s->T0, s->T0, carry_in);
        tcg_gen_shri_i64(*carry_out, s->T0, 32);
        break;
#endif
    default:
        zero = tcg_constant_tl(0);
        tcg_gen_add2_tl(s->T0, *carry_out, s->T0, zero, carry_in, zero);
        tcg_gen_add2_tl(s->T0, *carry_out, s->T0, *carry_out, s->T1, zero);
        break;
    }
}

static void gen_ADCX(DisasContext *s, X86DecodedInsn *decode)
{
    gen_ADCOX(s, decode, CC_OP_ADCX);
}

static void gen_ADD(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[1].ot;

    if (s->prefix & PREFIX_LOCK) {
        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T1,
                                    s->mem_index, ot | MO_LE);
    } else {
        tcg_gen_add_tl(s->T0, s->T0, s->T1);
    }
    prepare_update2_cc(decode, s, CC_OP_ADDB + ot);
}

static void gen_ADOX(DisasContext *s, X86DecodedInsn *decode)
{
    gen_ADCOX(s, decode, CC_OP_ADOX);
}

static void gen_AND(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[1].ot;

    if (s->prefix & PREFIX_LOCK) {
        tcg_gen_atomic_and_fetch_tl(s->T0, s->A0, s->T1,
                                    s->mem_index, ot | MO_LE);
    } else {
        tcg_gen_and_tl(s->T0, s->T0, s->T1);
    }
    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
}

static void gen_ANDN(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    tcg_gen_andc_tl(s->T0, s->T1, s->T0);
    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
}

static void gen_ARPL(DisasContext *s, X86DecodedInsn *decode)
{
    TCGv zf = tcg_temp_new();
    TCGv flags = tcg_temp_new();

    gen_mov_eflags(s, flags);

    /* Compute adjusted DST in T1, merging in SRC[RPL].  */
    tcg_gen_deposit_tl(s->T1, s->T0, s->T1, 0, 2);

    /* Z flag set if DST[RPL] < SRC[RPL] */
    tcg_gen_setcond_tl(TCG_COND_LTU, zf, s->T0, s->T1);
    tcg_gen_deposit_tl(flags, flags, zf, ctz32(CC_Z), 1);

    /* Place maximum RPL in DST */
    tcg_gen_umax_tl(s->T0, s->T0, s->T1);

    decode->cc_src = flags;
    decode->cc_op = CC_OP_EFLAGS;
}

static void gen_BEXTR(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;
    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
    TCGv zero = tcg_constant_tl(0);
    TCGv mone = tcg_constant_tl(-1);

    /*
     * Extract START, and shift the operand.
     * Shifts larger than operand size get zeros.
     */
    tcg_gen_ext8u_tl(s->A0, s->T1);
    tcg_gen_shr_tl(s->T0, s->T0, s->A0);

    tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound, s->T0, zero);

    /*
     * Extract the LEN into an inverse mask.  Lengths larger than
     * operand size get all zeros, length 0 gets all ones.
     */
    tcg_gen_extract_tl(s->A0, s->T1, 8, 8);
    tcg_gen_shl_tl(s->T1, mone, s->A0);
    tcg_gen_movcond_tl(TCG_COND_LEU, s->T1, s->A0, bound, s->T1, zero);
    tcg_gen_andc_tl(s->T0, s->T0, s->T1);

    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
}

static void gen_BLSI(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    /* input in T1, which is ready for prepare_update2_cc  */
    tcg_gen_neg_tl(s->T0, s->T1);
    tcg_gen_and_tl(s->T0, s->T0, s->T1);
    prepare_update2_cc(decode, s, CC_OP_BLSIB + ot);
}

static void gen_BLSMSK(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    /* input in T1, which is ready for prepare_update2_cc  */
    tcg_gen_subi_tl(s->T0, s->T1, 1);
    tcg_gen_xor_tl(s->T0, s->T0, s->T1);
    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
}

static void gen_BLSR(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    /* input in T1, which is ready for prepare_update2_cc  */
    tcg_gen_subi_tl(s->T0, s->T1, 1);
    tcg_gen_and_tl(s->T0, s->T0, s->T1);
    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
}

static void gen_BOUND(DisasContext *s, X86DecodedInsn *decode)
{
    TCGv_i32 op = tcg_temp_new_i32();
    tcg_gen_trunc_tl_i32(op, s->T0);
    if (decode->op[1].ot == MO_16) {
        gen_helper_boundw(tcg_env, s->A0, op);
    } else {
        gen_helper_boundl(tcg_env, s->A0, op);
    }
}

/* Non-standard convention: on entry T0 is the zero-extended input, T1 is the output.  */
static void gen_BSF(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    /* Only the Z bit is defined and it is related to the input.  */
    decode->cc_dst = tcg_temp_new();
    decode->cc_op = CC_OP_LOGICB + ot;
    tcg_gen_mov_tl(decode->cc_dst, s->T0);

    /*
     * The manual says that the output is undefined when the
     * input is zero, but real hardware leaves it unchanged, and
     * real programs appear to depend on that.  Accomplish this
     * by passing the output as the value to return upon zero.
     */
    tcg_gen_ctz_tl(s->T0, s->T0, s->T1);
}

/* Non-standard convention: on entry T0 is the zero-extended input, T1 is the output.  */
1418static void gen_BSR(DisasContext *s, X86DecodedInsn *decode)
1419{
1420    MemOp ot = decode->op[0].ot;
1421
1422    /* Only the Z bit is defined and it is related to the input.  */
1423    decode->cc_dst = tcg_temp_new();
1424    decode->cc_op = CC_OP_LOGICB + ot;
1425    tcg_gen_mov_tl(decode->cc_dst, s->T0);
1426
1427    /*
1428     * The manual says that the output is undefined when the
1429     * input is zero, but real hardware leaves it unchanged, and
1430     * real programs appear to depend on that.  Accomplish this
1431     * by passing the output as the value to return upon zero.
1432     * Plus, return the bit index of the first 1 bit.
1433     */
1434    tcg_gen_xori_tl(s->T1, s->T1, TARGET_LONG_BITS - 1);
1435    tcg_gen_clz_tl(s->T0, s->T0, s->T1);
1436    tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
1437}
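
/*
 * clz counts from the top, so a highest set bit at index k yields
 * (TARGET_LONG_BITS - 1) - k; the final XOR maps that back to k.  The
 * XOR applied to T1 on entry is cancelled by the final XOR in the
 * zero-input case, so the old destination is again returned unchanged.
 * E.g. on a 64-bit target, T0 = 0x80 gives clz = 56 and 56 ^ 63 = 7.
 */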
1438
1439static void gen_BSWAP(DisasContext *s, X86DecodedInsn *decode)
1440{
1441#ifdef TARGET_X86_64
1442    if (s->dflag == MO_64) {
1443        tcg_gen_bswap64_i64(s->T0, s->T0);
1444        return;
1445    }
1446#endif
1447    tcg_gen_bswap32_tl(s->T0, s->T0, TCG_BSWAP_OZ);
1448}
1449
1450static TCGv gen_bt_mask(DisasContext *s, X86DecodedInsn *decode)
1451{
1452    MemOp ot = decode->op[1].ot;
1453    TCGv mask = tcg_temp_new();
1454
1455    tcg_gen_andi_tl(s->T1, s->T1, (8 << ot) - 1);
1456    tcg_gen_shl_tl(mask, tcg_constant_tl(1), s->T1);
1457    return mask;
1458}
1459
1460/* Expects truncated bit index in COUNT, 1 << COUNT in MASK.  */
1461static void gen_bt_flags(DisasContext *s, X86DecodedInsn *decode, TCGv src,
1462                         TCGv count, TCGv mask)
1463{
1464    TCGv cf;
1465
1466    /*
1467     * C is the result of the test, Z is unchanged, and the others
1468     * are all undefined.
1469     */
1470    if (s->cc_op == CC_OP_DYNAMIC || CC_OP_HAS_EFLAGS(s->cc_op)) {
1471        /* Generate EFLAGS and replace the C bit.  */
1472        cf = tcg_temp_new();
1473        tcg_gen_setcond_tl(TCG_COND_TSTNE, cf, src, mask);
1474        prepare_update_cf(decode, s, cf);
1475    } else {
1476        /*
1477         * Z was going to be computed from the non-zero status of CC_DST.
1478         * We can get that same Z value (and the new C value) by leaving
1479         * CC_DST alone, setting CC_SRC, and using a CC_OP_SAR of the
1480         * same width.
1481         */
1482        decode->cc_src = tcg_temp_new();
1483        decode->cc_dst = cpu_cc_dst;
1484        decode->cc_op = CC_OP_SARB + cc_op_size(s->cc_op);
1485        tcg_gen_shr_tl(decode->cc_src, src, count);
1486    }
1487}
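
/*
 * Sketch of the CC_OP_SAR reuse: for the SAR family, CF is bit 0 of
 * CC_SRC while ZF is still derived from CC_DST.  Storing src >> count
 * into cc_src therefore makes CF the tested bit at the cost of a single
 * shift, without forcing a full EFLAGS computation.
 */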
1488
1489static void gen_BT(DisasContext *s, X86DecodedInsn *decode)
1490{
1491    TCGv count = s->T1;
1492    TCGv mask;
1493
1494    /*
1495     * Try to ensure that the rhs of the TSTNE condition is a constant (and a
1496     * power of two), as that is more readily available on most TCG backends.
1497     *
1498     * For an immediate bit number, gen_bt_mask()'s output is already a constant;
1499     * for register bit number, shift the source right and check bit 0.
1500     */
1501    if (decode->e.op2 == X86_TYPE_I) {
1502        mask = gen_bt_mask(s, decode);
1503    } else {
1504        MemOp ot = decode->op[1].ot;
1505
1506        tcg_gen_andi_tl(s->T1, s->T1, (8 << ot) - 1);
1507        tcg_gen_shr_tl(s->T0, s->T0, s->T1);
1508
1509        count = tcg_constant_tl(0);
1510        mask = tcg_constant_tl(1);
1511    }
1512    gen_bt_flags(s, decode, s->T0, count, mask);
1513}
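
/*
 * For example, "bt $3, %eax" tests with the constant mask 1 << 3 = 8,
 * while "bt %cl, %eax" shifts EAX right by CL & 31 and tests bit 0; in
 * both cases the rhs of the TSTNE comparison stays a constant.
 */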
1514
1515static void gen_BTC(DisasContext *s, X86DecodedInsn *decode)
1516{
1517    MemOp ot = decode->op[0].ot;
1518    TCGv old = tcg_temp_new();
1519    TCGv mask = gen_bt_mask(s, decode);
1520
1521    if (s->prefix & PREFIX_LOCK) {
1522        tcg_gen_atomic_fetch_xor_tl(old, s->A0, mask, s->mem_index, ot | MO_LE);
1523    } else {
1524        tcg_gen_mov_tl(old, s->T0);
1525        tcg_gen_xor_tl(s->T0, s->T0, mask);
1526    }
1527
1528    gen_bt_flags(s, decode, old, s->T1, mask);
1529}
1530
1531static void gen_BTR(DisasContext *s, X86DecodedInsn *decode)
1532{
1533    MemOp ot = decode->op[0].ot;
1534    TCGv old = tcg_temp_new();
1535    TCGv mask = gen_bt_mask(s, decode);
1536
1537    if (s->prefix & PREFIX_LOCK) {
1538        TCGv maskc = tcg_temp_new();
1539        tcg_gen_not_tl(maskc, mask);
1540        tcg_gen_atomic_fetch_and_tl(old, s->A0, maskc, s->mem_index, ot | MO_LE);
1541    } else {
1542        tcg_gen_mov_tl(old, s->T0);
1543        tcg_gen_andc_tl(s->T0, s->T0, mask);
1544    }
1545
1546    gen_bt_flags(s, decode, old, s->T1, mask);
1547}
1548
1549static void gen_BTS(DisasContext *s, X86DecodedInsn *decode)
1550{
1551    MemOp ot = decode->op[0].ot;
1552    TCGv old = tcg_temp_new();
1553    TCGv mask = gen_bt_mask(s, decode);
1554
1555    if (s->prefix & PREFIX_LOCK) {
1556        tcg_gen_atomic_fetch_or_tl(old, s->A0, mask, s->mem_index, ot | MO_LE);
1557    } else {
1558        tcg_gen_mov_tl(old, s->T0);
1559        tcg_gen_or_tl(s->T0, s->T0, mask);
1560    }
1561
1562    gen_bt_flags(s, decode, old, s->T1, mask);
1563}
1564
1565static void gen_BZHI(DisasContext *s, X86DecodedInsn *decode)
1566{
1567    MemOp ot = decode->op[0].ot;
1568    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
1569    TCGv zero = tcg_constant_tl(0);
1570    TCGv mone = tcg_constant_tl(-1);
1571
1572    tcg_gen_ext8u_tl(s->T1, s->T1);
1573
1574    tcg_gen_shl_tl(s->A0, mone, s->T1);
1575    tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->T1, bound, s->A0, zero);
1576    tcg_gen_andc_tl(s->T0, s->T0, s->A0);
1577    /*
1578     * Note that since we're using BMILG (in order to get O
1579     * cleared) we need to store the inverse into C.
1580     */
1581    tcg_gen_setcond_tl(TCG_COND_LEU, s->T1, s->T1, bound);
1582    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
1583}
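
/*
 * Illustration for a 32-bit BZHI with index 8: A0 = -1 << 8 = 0xFFFFFF00,
 * and the andc keeps only the low 8 bits of T0.  An index above the
 * operand size selects the zero mask instead, keeping every bit; the LEU
 * setcond against the same bound then yields the inverted carry that
 * CC_OP_BMILG expects (CF = 1 exactly when the index is out of range).
 */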
1584
1585static void gen_CALL(DisasContext *s, X86DecodedInsn *decode)
1586{
1587    gen_push_v(s, eip_next_tl(s));
1588    gen_JMP(s, decode);
1589}
1590
1591static void gen_CALL_m(DisasContext *s, X86DecodedInsn *decode)
1592{
1593    gen_push_v(s, eip_next_tl(s));
1594    gen_JMP_m(s, decode);
1595}
1596
1597static void gen_CALLF(DisasContext *s, X86DecodedInsn *decode)
1598{
1599    gen_far_call(s);
1600}
1601
1602static void gen_CALLF_m(DisasContext *s, X86DecodedInsn *decode)
1603{
1604    MemOp ot = decode->op[1].ot;
1605
1606    gen_op_ld_v(s, ot, s->T0, s->A0);
1607    gen_add_A0_im(s, 1 << ot);
1608    gen_op_ld_v(s, MO_16, s->T1, s->A0);
1609    gen_far_call(s);
1610}
1611
1612static void gen_CBW(DisasContext *s, X86DecodedInsn *decode)
1613{
1614    MemOp src_ot = decode->op[0].ot - 1;
1615
1616    tcg_gen_ext_tl(s->T0, s->T0, src_ot | MO_SIGN);
1617}
1618
1619static void gen_CLC(DisasContext *s, X86DecodedInsn *decode)
1620{
1621    gen_compute_eflags(s);
1622    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_C);
1623}
1624
1625static void gen_CLD(DisasContext *s, X86DecodedInsn *decode)
1626{
1627    tcg_gen_st_i32(tcg_constant_i32(1), tcg_env, offsetof(CPUX86State, df));
1628}
1629
1630static void gen_CLI(DisasContext *s, X86DecodedInsn *decode)
1631{
1632    gen_reset_eflags(s, IF_MASK);
1633}
1634
1635static void gen_CLTS(DisasContext *s, X86DecodedInsn *decode)
1636{
1637    gen_helper_clts(tcg_env);
1638    /* abort block because static cpu state changed */
1639    s->base.is_jmp = DISAS_EOB_NEXT;
1640}
1641
1642static void gen_CMC(DisasContext *s, X86DecodedInsn *decode)
1643{
1644    gen_compute_eflags(s);
1645    tcg_gen_xori_tl(cpu_cc_src, cpu_cc_src, CC_C);
1646}
1647
1648static void gen_CMOVcc(DisasContext *s, X86DecodedInsn *decode)
1649{
1650    gen_cmovcc(s, decode->b & 0xf, s->T0, s->T1);
1651}
1652
1653static void gen_CMPccXADD(DisasContext *s, X86DecodedInsn *decode)
1654{
1655    TCGLabel *label_top = gen_new_label();
1656    TCGLabel *label_bottom = gen_new_label();
1657    TCGv oldv = tcg_temp_new();
1658    TCGv newv = tcg_temp_new();
1659    TCGv cmpv = tcg_temp_new();
1660    TCGCond cond;
1661
1662    TCGv cmp_lhs, cmp_rhs;
1663    MemOp ot, ot_full;
1664
1665    int jcc_op = (decode->b >> 1) & 7;
1666    static const TCGCond cond_table[8] = {
1667        [JCC_O] = TCG_COND_LT,  /* test sign bit by comparing against 0 */
1668        [JCC_B] = TCG_COND_LTU,
1669        [JCC_Z] = TCG_COND_EQ,
1670        [JCC_BE] = TCG_COND_LEU,
1671        [JCC_S] = TCG_COND_LT,  /* test sign bit by comparing against 0 */
1672        [JCC_P] = TCG_COND_TSTEQ,  /* even parity - tests low bit of popcount */
1673        [JCC_L] = TCG_COND_LT,
1674        [JCC_LE] = TCG_COND_LE,
1675    };
1676
1677    cond = cond_table[jcc_op];
1678    if (decode->b & 1) {
1679        cond = tcg_invert_cond(cond);
1680    }
1681
1682    ot = decode->op[0].ot;
1683    ot_full = ot | MO_LE;
1684    if (jcc_op >= JCC_S) {
1685        /*
1686         * Sign-extend values before subtracting for S, P (zero/sign extension
1687         * does not matter there), L, LE and their inverses.
1688         */
1689        ot_full |= MO_SIGN;
1690    }
1691
1692    /*
1693     * cmpv will be moved to cc_src *after* cpu_regs[] is written back, so use
1694     * tcg_gen_ext_tl instead of gen_ext_tl.
1695     */
1696    tcg_gen_ext_tl(cmpv, cpu_regs[decode->op[1].n], ot_full);
1697
1698    /*
1699     * Cmpxchg loop starts here.
1700     * - s->T1: addition operand (from decoder)
1701     * - s->A0: dest address (from decoder)
1702     * - s->cc_srcT: memory operand (lhs for comparison)
1703     * - cmpv: rhs for comparison
1704     */
1705    gen_set_label(label_top);
1706    gen_op_ld_v(s, ot_full, s->cc_srcT, s->A0);
1707    tcg_gen_sub_tl(s->T0, s->cc_srcT, cmpv);
1708
1709    /* Compute the comparison result by hand, to avoid clobbering cc_*.  */
1710    switch (jcc_op) {
1711    case JCC_O:
1712        /* (src1 ^ src2) & (src1 ^ dst). newv is only used here for a moment */
1713        cmp_lhs = tcg_temp_new(), cmp_rhs = tcg_constant_tl(0);
1714        tcg_gen_xor_tl(newv, s->cc_srcT, s->T0);
1715        tcg_gen_xor_tl(cmp_lhs, s->cc_srcT, cmpv);
1716        tcg_gen_and_tl(cmp_lhs, cmp_lhs, newv);
1717        tcg_gen_sextract_tl(cmp_lhs, cmp_lhs, 0, 8 << ot);
1718        break;
1719
1720    case JCC_P:
1721        cmp_lhs = tcg_temp_new(), cmp_rhs = tcg_constant_tl(1);
1722        tcg_gen_ext8u_tl(cmp_lhs, s->T0);
1723        tcg_gen_ctpop_tl(cmp_lhs, cmp_lhs);
1724        break;
1725
1726    case JCC_S:
1727        cmp_lhs = tcg_temp_new(), cmp_rhs = tcg_constant_tl(0);
1728        tcg_gen_sextract_tl(cmp_lhs, s->T0, 0, 8 << ot);
1729        break;
1730
1731    default:
1732        cmp_lhs = s->cc_srcT, cmp_rhs = cmpv;
1733        break;
1734    }
1735
1736    /* Compute new value: if condition does not hold, just store back s->cc_srcT */
1737    tcg_gen_add_tl(newv, s->cc_srcT, s->T1);
1738    tcg_gen_movcond_tl(cond, newv, cmp_lhs, cmp_rhs, newv, s->cc_srcT);
1739    tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, s->cc_srcT, newv, s->mem_index, ot_full);
1740
1741    /* Exit unconditionally if cmpxchg succeeded.  */
1742    tcg_gen_brcond_tl(TCG_COND_EQ, oldv, s->cc_srcT, label_bottom);
1743
1744    /* Try again if there was actually a store to make.  */
1745    tcg_gen_brcond_tl(cond, cmp_lhs, cmp_rhs, label_top);
1746    gen_set_label(label_bottom);
1747
1748    /* Store old value to registers only after a successful store.  */
1749    gen_writeback(s, decode, 1, s->cc_srcT);
1750
1751    decode->cc_dst = s->T0;
1752    decode->cc_src = cmpv;
1753    decode->cc_op = CC_OP_SUBB + ot;
1754}
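
/*
 * Rough pseudocode of the operation built above, with the condition
 * evaluated on the comparison result (a sketch, ignoring atomicity):
 *     tmp = *mem;
 *     flags = flags_of(tmp - reg1);
 *     if (cc(flags))
 *         *mem = tmp + reg2;
 *     reg1 = tmp;
 * The cmpxchg loop only serves to make the read-test-write atomic; a
 * concurrent store between the load and the cmpxchg restarts it.
 */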
1755
1756static void gen_CMPS(DisasContext *s, X86DecodedInsn *decode)
1757{
1758    MemOp ot = decode->op[2].ot;
1759    gen_repz_nz(s, ot, gen_cmps);
1760}
1761
1762static void gen_CMPXCHG(DisasContext *s, X86DecodedInsn *decode)
1763{
1764    MemOp ot = decode->op[2].ot;
1765    TCGv cmpv = tcg_temp_new();
1766    TCGv oldv = tcg_temp_new();
1767    TCGv newv = tcg_temp_new();
1768    TCGv dest;
1769
1770    tcg_gen_ext_tl(cmpv, cpu_regs[R_EAX], ot);
1771    tcg_gen_ext_tl(newv, s->T1, ot);
1772    if (s->prefix & PREFIX_LOCK) {
1773        tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, cmpv, newv,
1774                                  s->mem_index, ot | MO_LE);
1775    } else {
1776        tcg_gen_ext_tl(oldv, s->T0, ot);
1777        if (decode->op[0].has_ea) {
1778            /*
1779             * Perform an unconditional store cycle like physical cpu;
1780             * must be before changing accumulator to ensure
1781             * idempotency if the store faults and the instruction
1782             * is restarted
1783             */
1784            tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv);
1785            gen_op_st_v(s, ot, newv, s->A0);
1786        } else {
1787            /*
1788             * Unlike the memory case, where "the destination operand receives
1789             * a write cycle without regard to the result of the comparison",
1790             * rm must not be touched altogether if the write fails, including
1791             * not zero-extending it on 64-bit processors.  So, precompute
1792             * the result of a successful writeback and perform the movcond
1793             * directly on cpu_regs.  In case rm is part of RAX, note that this
1794             * movcond and the one below are mutually exclusive; only one executes.
1795             */
1796            dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, newv, newv);
1797            tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, newv, dest);
1798        }
1799        decode->op[0].unit = X86_OP_SKIP;
1800    }
1801
1802    /* Write RAX only if the cmpxchg fails.  */
1803    dest = gen_op_deposit_reg_v(s, ot, R_EAX, s->T0, oldv);
1804    tcg_gen_movcond_tl(TCG_COND_NE, dest, oldv, cmpv, s->T0, dest);
1805
1806    tcg_gen_mov_tl(s->cc_srcT, cmpv);
1807    tcg_gen_sub_tl(cmpv, cmpv, oldv);
1808    decode->cc_dst = cmpv;
1809    decode->cc_src = oldv;
1810    decode->cc_op = CC_OP_SUBB + ot;
1811}
1812
1813static void gen_CMPXCHG16B(DisasContext *s, X86DecodedInsn *decode)
1814{
1815#ifdef TARGET_X86_64
1816    MemOp mop = MO_LE | MO_128 | MO_ALIGN;
1817    TCGv_i64 t0, t1;
1818    TCGv_i128 cmp, val;
1819
1820    cmp = tcg_temp_new_i128();
1821    val = tcg_temp_new_i128();
1822    tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
1823    tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
1824
1825    /* Only require atomic with LOCK; non-parallel handled in generator. */
1826    if (s->prefix & PREFIX_LOCK) {
1827        tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
1828    } else {
1829        tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
1830    }
1831
1832    tcg_gen_extr_i128_i64(s->T0, s->T1, val);
1833
1834    /* Determine success after the fact. */
1835    t0 = tcg_temp_new_i64();
1836    t1 = tcg_temp_new_i64();
1837    tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]);
1838    tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]);
1839    tcg_gen_or_i64(t0, t0, t1);
1840
1841    /* Update Z. */
1842    gen_compute_eflags(s);
1843    tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0);
1844    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1);
1845
1846    /*
1847     * Extract the result values for the register pair.  We may do this
1848     * unconditionally, because on success (Z=1), the old value matches
1849     * the previous value in RDX:RAX.
1850     */
1851    tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0);
1852    tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1);
1853#else
1854    abort();
1855#endif
1856}
1857
1858static void gen_CMPXCHG8B(DisasContext *s, X86DecodedInsn *decode)
1859{
1860    TCGv_i64 cmp, val, old;
1861    TCGv Z;
1862
1863    cmp = tcg_temp_new_i64();
1864    val = tcg_temp_new_i64();
1865    old = tcg_temp_new_i64();
1866
1867    /* Construct the comparison values from the register pair. */
1868    tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
1869    tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
1870
1871    /* Only require atomic with LOCK; non-parallel handled in generator. */
1872    if (s->prefix & PREFIX_LOCK) {
1873        tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_LEUQ);
1874    } else {
1875        tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val,
1876                                      s->mem_index, MO_LEUQ);
1877    }
1878
1879    /* Compute the required value of Z. */
1880    tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp);
1881    Z = tcg_temp_new();
1882    tcg_gen_trunc_i64_tl(Z, cmp);
1883
1884    /*
1885     * Extract the result values for the register pair.
1886     * For 32-bit, we may do this unconditionally, because on success (Z=1),
1887     * the old value matches the previous value in EDX:EAX.  For x86_64,
1888     * the store must be conditional, because we must leave the source
1889     * registers unchanged on success, and zero-extend the writeback
1890     * on failure (Z=0).
1891     */
1892    if (TARGET_LONG_BITS == 32) {
1893        tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old);
1894    } else {
1895        TCGv zero = tcg_constant_tl(0);
1896
1897        tcg_gen_extr_i64_tl(s->T0, s->T1, old);
1898        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero,
1899                           s->T0, cpu_regs[R_EAX]);
1900        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero,
1901                           s->T1, cpu_regs[R_EDX]);
1902    }
1903
1904    /* Update Z. */
1905    gen_compute_eflags(s);
1906    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1);
1907}
1908
1909static void gen_CPUID(DisasContext *s, X86DecodedInsn *decode)
1910{
1911    gen_update_cc_op(s);
1912    gen_update_eip_cur(s);
1913    gen_helper_cpuid(tcg_env);
1914}
1915
1916static void gen_CRC32(DisasContext *s, X86DecodedInsn *decode)
1917{
1918    MemOp ot = decode->op[2].ot;
1919    TCGv_i32 tmp = tcg_temp_new_i32();
1920
1921    tcg_gen_trunc_tl_i32(tmp, s->T0);
1922    gen_helper_crc32(s->T0, tmp, s->T1, tcg_constant_i32(8 << ot));
1923}
1924
1925static void gen_CVTPI2Px(DisasContext *s, X86DecodedInsn *decode)
1926{
1927    gen_helper_enter_mmx(tcg_env);
1928    if (s->prefix & PREFIX_DATA) {
1929        gen_helper_cvtpi2pd(tcg_env, OP_PTR0, OP_PTR2);
1930    } else {
1931        gen_helper_cvtpi2ps(tcg_env, OP_PTR0, OP_PTR2);
1932    }
1933}
1934
1935static void gen_CVTPx2PI(DisasContext *s, X86DecodedInsn *decode)
1936{
1937    gen_helper_enter_mmx(tcg_env);
1938    if (s->prefix & PREFIX_DATA) {
1939        gen_helper_cvtpd2pi(tcg_env, OP_PTR0, OP_PTR2);
1940    } else {
1941        gen_helper_cvtps2pi(tcg_env, OP_PTR0, OP_PTR2);
1942    }
1943}
1944
1945static void gen_CVTTPx2PI(DisasContext *s, X86DecodedInsn *decode)
1946{
1947    gen_helper_enter_mmx(tcg_env);
1948    if (s->prefix & PREFIX_DATA) {
1949        gen_helper_cvttpd2pi(tcg_env, OP_PTR0, OP_PTR2);
1950    } else {
1951        gen_helper_cvttps2pi(tcg_env, OP_PTR0, OP_PTR2);
1952    }
1953}
1954
1955static void gen_CWD(DisasContext *s, X86DecodedInsn *decode)
1956{
1957    int shift = 8 << decode->op[0].ot;
1958
1959    tcg_gen_sextract_tl(s->T0, s->T0, shift - 1, 1);
1960}
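
/*
 * sextract of the top bit replicates the accumulator's sign across T0,
 * which writeback then places into DX/EDX/RDX.  E.g. for 16-bit CWD,
 * AX = 0x8000 gives T0 = -1 and hence DX = 0xFFFF.
 */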
1961
1962static void gen_DAA(DisasContext *s, X86DecodedInsn *decode)
1963{
1964    gen_update_cc_op(s);
1965    gen_helper_daa(tcg_env);
1966    assume_cc_op(s, CC_OP_EFLAGS);
1967}
1968
1969static void gen_DAS(DisasContext *s, X86DecodedInsn *decode)
1970{
1971    gen_update_cc_op(s);
1972    gen_helper_das(tcg_env);
1973    assume_cc_op(s, CC_OP_EFLAGS);
1974}
1975
1976static void gen_DEC(DisasContext *s, X86DecodedInsn *decode)
1977{
1978    MemOp ot = decode->op[1].ot;
1979
1980    tcg_gen_movi_tl(s->T1, -1);
1981    if (s->prefix & PREFIX_LOCK) {
1982        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T1,
1983                                    s->mem_index, ot | MO_LE);
1984    } else {
1985        tcg_gen_add_tl(s->T0, s->T0, s->T1);
1986    }
1987    prepare_update_cc_incdec(decode, s, CC_OP_DECB + ot);
1988}
1989
1990static void gen_DIV(DisasContext *s, X86DecodedInsn *decode)
1991{
1992    MemOp ot = decode->op[1].ot;
1993
1994    switch(ot) {
1995    case MO_8:
1996        gen_helper_divb_AL(tcg_env, s->T0);
1997        break;
1998    case MO_16:
1999        gen_helper_divw_AX(tcg_env, s->T0);
2000        break;
2001    default:
2002    case MO_32:
2003        gen_helper_divl_EAX(tcg_env, s->T0);
2004        break;
2005#ifdef TARGET_X86_64
2006    case MO_64:
2007        gen_helper_divq_EAX(tcg_env, s->T0);
2008        break;
2009#endif
2010    }
2011}
2012
2013static void gen_EMMS(DisasContext *s, X86DecodedInsn *decode)
2014{
2015    gen_helper_emms(tcg_env);
2016}
2017
2018static void gen_ENTER(DisasContext *s, X86DecodedInsn *decode)
2019{
2020    gen_enter(s, decode->op[1].imm, decode->op[2].imm);
2021}
2022
2023static void gen_EXTRQ_i(DisasContext *s, X86DecodedInsn *decode)
2024{
2025    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
2026    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
2027
2028    gen_helper_extrq_i(tcg_env, OP_PTR0, index, length);
2029}
2030
2031static void gen_EXTRQ_r(DisasContext *s, X86DecodedInsn *decode)
2032{
2033    gen_helper_extrq_r(tcg_env, OP_PTR0, OP_PTR2);
2034}
2035
2036static void gen_FXRSTOR(DisasContext *s, X86DecodedInsn *decode)
2037{
2038    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
2039        gen_NM_exception(s);
2040    } else {
2041        gen_helper_fxrstor(tcg_env, s->A0);
2042    }
2043}
2044
2045static void gen_FXSAVE(DisasContext *s, X86DecodedInsn *decode)
2046{
2047    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
2048        gen_NM_exception(s);
2049    } else {
2050        gen_helper_fxsave(tcg_env, s->A0);
2051    }
2052}
2053
2054static void gen_HLT(DisasContext *s, X86DecodedInsn *decode)
2055{
2056#ifdef CONFIG_SYSTEM_ONLY
2057    gen_update_cc_op(s);
2058    gen_update_eip_next(s);
2059    gen_helper_hlt(tcg_env);
2060    s->base.is_jmp = DISAS_NORETURN;
2061#endif
2062}
2063
2064static void gen_IDIV(DisasContext *s, X86DecodedInsn *decode)
2065{
2066    MemOp ot = decode->op[1].ot;
2067
2068    switch(ot) {
2069    case MO_8:
2070        gen_helper_idivb_AL(tcg_env, s->T0);
2071        break;
2072    case MO_16:
2073        gen_helper_idivw_AX(tcg_env, s->T0);
2074        break;
2075    default:
2076    case MO_32:
2077        gen_helper_idivl_EAX(tcg_env, s->T0);
2078        break;
2079#ifdef TARGET_X86_64
2080    case MO_64:
2081        gen_helper_idivq_EAX(tcg_env, s->T0);
2082        break;
2083#endif
2084    }
2085}
2086
2087static void gen_IMUL3(DisasContext *s, X86DecodedInsn *decode)
2088{
2089    MemOp ot = decode->op[0].ot;
2090    TCGv cc_src_rhs;
2091
2092    switch (ot) {
2093    case MO_16:
2094        /* s->T0 already sign-extended */
2095        tcg_gen_ext16s_tl(s->T1, s->T1);
2096        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2097        /* Compare the full result to the extension of the truncated result.  */
2098        tcg_gen_ext16s_tl(s->T1, s->T0);
2099        cc_src_rhs = s->T0;
2100        break;
2101
2102    case MO_32:
2103#ifdef TARGET_X86_64
2104        if (TCG_TARGET_REG_BITS == 64) {
2105            /*
2106             * This produces fewer TCG ops, and better code if flags are needed,
2107             * but it requires a 64-bit multiply even if they are not.  Use it
2108             * only if the target has 64-bits registers.
2109             *
2110             * s->T0 is already sign-extended.
2111             */
2112            tcg_gen_ext32s_tl(s->T1, s->T1);
2113            tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2114            /* Compare the full result to the extension of the truncated result.  */
2115            tcg_gen_ext32s_tl(s->T1, s->T0);
2116            cc_src_rhs = s->T0;
2117        } else {
2118            /* Variant that only needs a 32-bit widening multiply.  */
2119            TCGv_i32 hi = tcg_temp_new_i32();
2120            TCGv_i32 lo = tcg_temp_new_i32();
2121            tcg_gen_trunc_tl_i32(lo, s->T0);
2122            tcg_gen_trunc_tl_i32(hi, s->T1);
2123            tcg_gen_muls2_i32(lo, hi, lo, hi);
2124            tcg_gen_extu_i32_tl(s->T0, lo);
2125
2126            cc_src_rhs = tcg_temp_new();
2127            tcg_gen_extu_i32_tl(cc_src_rhs, hi);
2128            /* Compare the high part to the sign bit of the truncated result */
2129            tcg_gen_sari_i32(lo, lo, 31);
2130            tcg_gen_extu_i32_tl(s->T1, lo);
2131        }
2132        break;
2133
2134    case MO_64:
2135#endif
2136        cc_src_rhs = tcg_temp_new();
2137        tcg_gen_muls2_tl(s->T0, cc_src_rhs, s->T0, s->T1);
2138        /* Compare the high part to the sign bit of the truncated result */
2139        tcg_gen_sari_tl(s->T1, s->T0, TARGET_LONG_BITS - 1);
2140        break;
2141
2142    default:
2143        g_assert_not_reached();
2144    }
2145
2146    tcg_gen_sub_tl(s->T1, s->T1, cc_src_rhs);
2147    prepare_update2_cc(decode, s, CC_OP_MULB + ot);
2148}
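
/*
 * The overflow test compares the full product against the sign-extension
 * of its truncation.  E.g. for a 16-bit IMUL, 0x4000 * 2 = 0x8000 and
 * ext16s(0x8000) = 0xFFFF8000 != 0x8000, so cc_src becomes nonzero and
 * CC_OP_MUL reports CF = OF = 1.
 */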
2149
2150static void gen_IMUL(DisasContext *s, X86DecodedInsn *decode)
2151{
2152    MemOp ot = decode->op[1].ot;
2153    TCGv cc_src_rhs;
2154
2155    switch (ot) {
2156    case MO_8:
2157        /* s->T0 already sign-extended */
2158        tcg_gen_ext8s_tl(s->T1, s->T1);
2159        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2160        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
2161        /* Compare the full result to the extension of the truncated result.  */
2162        tcg_gen_ext8s_tl(s->T1, s->T0);
2163        cc_src_rhs = s->T0;
2164        break;
2165
2166    case MO_16:
2167        /* s->T0 already sign-extended */
2168        tcg_gen_ext16s_tl(s->T1, s->T1);
2169        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2170        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
2171        tcg_gen_shri_tl(s->T1, s->T0, 16);
2172        gen_op_mov_reg_v(s, MO_16, R_EDX, s->T1);
2173        /* Compare the full result to the extension of the truncated result.  */
2174        tcg_gen_ext16s_tl(s->T1, s->T0);
2175        cc_src_rhs = s->T0;
2176        break;
2177
2178    case MO_32:
2179#ifdef TARGET_X86_64
2180        /* s->T0 already sign-extended */
2181        tcg_gen_ext32s_tl(s->T1, s->T1);
2182        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2183        tcg_gen_ext32u_tl(cpu_regs[R_EAX], s->T0);
2184        tcg_gen_shri_tl(cpu_regs[R_EDX], s->T0, 32);
2185        /* Compare the full result to the extension of the truncated result.  */
2186        tcg_gen_ext32s_tl(s->T1, s->T0);
2187        cc_src_rhs = s->T0;
2188        break;
2189
2190    case MO_64:
2191#endif
2192        tcg_gen_muls2_tl(s->T0, cpu_regs[R_EDX], s->T0, s->T1);
2193        tcg_gen_mov_tl(cpu_regs[R_EAX], s->T0);
2194
2195        /* Compare the high part to the sign bit of the truncated result */
2196        tcg_gen_negsetcondi_tl(TCG_COND_LT, s->T1, s->T0, 0);
2197        cc_src_rhs = cpu_regs[R_EDX];
2198        break;
2199
2200    default:
2201        g_assert_not_reached();
2202    }
2203
2204    tcg_gen_sub_tl(s->T1, s->T1, cc_src_rhs);
2205    prepare_update2_cc(decode, s, CC_OP_MULB + ot);
2206}
2207
2208static void gen_IN(DisasContext *s, X86DecodedInsn *decode)
2209{
2210    MemOp ot = decode->op[0].ot;
2211    TCGv_i32 port = tcg_temp_new_i32();
2212
2213    tcg_gen_trunc_tl_i32(port, s->T0);
2214    tcg_gen_ext16u_i32(port, port);
2215    if (!gen_check_io(s, ot, port, SVM_IOIO_TYPE_MASK)) {
2216        return;
2217    }
2218    translator_io_start(&s->base);
2219    gen_helper_in_func(ot, s->T0, port);
2220    gen_writeback(s, decode, 0, s->T0);
2221    gen_bpt_io(s, port, ot);
2222}
2223
2224static void gen_INC(DisasContext *s, X86DecodedInsn *decode)
2225{
2226    MemOp ot = decode->op[1].ot;
2227
2228    tcg_gen_movi_tl(s->T1, 1);
2229    if (s->prefix & PREFIX_LOCK) {
2230        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T1,
2231                                    s->mem_index, ot | MO_LE);
2232    } else {
2233        tcg_gen_add_tl(s->T0, s->T0, s->T1);
2234    }
2235    prepare_update_cc_incdec(decode, s, CC_OP_INCB + ot);
2236}
2237
2238static void gen_INS(DisasContext *s, X86DecodedInsn *decode)
2239{
2240    MemOp ot = decode->op[1].ot;
2241    TCGv_i32 port = tcg_temp_new_i32();
2242
2243    tcg_gen_trunc_tl_i32(port, s->T1);
2244    tcg_gen_ext16u_i32(port, port);
2245    if (!gen_check_io(s, ot, port,
2246                      SVM_IOIO_TYPE_MASK | SVM_IOIO_STR_MASK)) {
2247        return;
2248    }
2249
2250    translator_io_start(&s->base);
2251    gen_repz(s, ot, gen_ins);
2252}
2253
2254static void gen_INSERTQ_i(DisasContext *s, X86DecodedInsn *decode)
2255{
2256    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
2257    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
2258
2259    gen_helper_insertq_i(tcg_env, OP_PTR0, OP_PTR1, index, length);
2260}
2261
2262static void gen_INSERTQ_r(DisasContext *s, X86DecodedInsn *decode)
2263{
2264    gen_helper_insertq_r(tcg_env, OP_PTR0, OP_PTR2);
2265}
2266
2267static void gen_INT(DisasContext *s, X86DecodedInsn *decode)
2268{
2269    gen_interrupt(s, decode->immediate);
2270}
2271
2272static void gen_INT1(DisasContext *s, X86DecodedInsn *decode)
2273{
2274    gen_update_cc_op(s);
2275    gen_update_eip_next(s);
2276    gen_helper_icebp(tcg_env);
2277    s->base.is_jmp = DISAS_NORETURN;
2278}
2279
2280static void gen_INT3(DisasContext *s, X86DecodedInsn *decode)
2281{
2282    gen_interrupt(s, EXCP03_INT3);
2283}
2284
2285static void gen_INTO(DisasContext *s, X86DecodedInsn *decode)
2286{
2287    gen_update_cc_op(s);
2288    gen_update_eip_cur(s);
2289    gen_helper_into(tcg_env, cur_insn_len_i32(s));
2290}
2291
2292static void gen_IRET(DisasContext *s, X86DecodedInsn *decode)
2293{
2294    if (!PE(s) || VM86(s)) {
2295        gen_helper_iret_real(tcg_env, tcg_constant_i32(s->dflag - 1));
2296    } else {
2297        gen_helper_iret_protected(tcg_env, tcg_constant_i32(s->dflag - 1),
2298                                  eip_next_i32(s));
2299    }
2300    assume_cc_op(s, CC_OP_EFLAGS);
2301    s->base.is_jmp = DISAS_EOB_ONLY;
2302}
2303
2304static void gen_Jcc(DisasContext *s, X86DecodedInsn *decode)
2305{
2306    TCGLabel *taken = gen_new_label();
2307
2308    gen_bnd_jmp(s);
2309    gen_jcc(s, decode->b & 0xf, taken);
2310    gen_conditional_jump_labels(s, decode->immediate, NULL, taken);
2311}
2312
2313static void gen_JCXZ(DisasContext *s, X86DecodedInsn *decode)
2314{
2315    TCGLabel *taken = gen_new_label();
2316
2317    gen_update_cc_op(s);
2318    gen_op_jz_ecx(s, taken);
2319    gen_conditional_jump_labels(s, decode->immediate, NULL, taken);
2320}
2321
2322static void gen_JMP(DisasContext *s, X86DecodedInsn *decode)
2323{
2324    gen_update_cc_op(s);
2325    gen_jmp_rel(s, s->dflag, decode->immediate, 0);
2326}
2327
2328static void gen_JMP_m(DisasContext *s, X86DecodedInsn *decode)
2329{
2330    gen_op_jmp_v(s, s->T0);
2331    gen_bnd_jmp(s);
2332    s->base.is_jmp = DISAS_JUMP;
2333}
2334
2335static void gen_JMPF(DisasContext *s, X86DecodedInsn *decode)
2336{
2337    gen_far_jmp(s);
2338}
2339
2340static void gen_JMPF_m(DisasContext *s, X86DecodedInsn *decode)
2341{
2342    MemOp ot = decode->op[1].ot;
2343
2344    gen_op_ld_v(s, ot, s->T0, s->A0);
2345    gen_add_A0_im(s, 1 << ot);
2346    gen_op_ld_v(s, MO_16, s->T1, s->A0);
2347    gen_far_jmp(s);
2348}
2349
2350static void gen_LAHF(DisasContext *s, X86DecodedInsn *decode)
2351{
2352    if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
2353        return gen_illegal_opcode(s);
2354    }
2355    gen_compute_eflags(s);
2356    /* Note: gen_compute_eflags() only gives the condition codes */
2357    tcg_gen_ori_tl(s->T0, cpu_cc_src, 0x02);
2358    tcg_gen_deposit_tl(cpu_regs[R_EAX], cpu_regs[R_EAX], s->T0, 8, 8);
2359}
2360
2361static void gen_LAR(DisasContext *s, X86DecodedInsn *decode)
2362{
2363    MemOp ot = decode->op[0].ot;
2364    TCGv result = tcg_temp_new();
2365    TCGv dest;
2366
2367    gen_compute_eflags(s);
2368    gen_update_cc_op(s);
2369    gen_helper_lar(result, tcg_env, s->T0);
2370
2371    /* Perform writeback here to skip it if ZF=0.  */
2372    decode->op[0].unit = X86_OP_SKIP;
2373    dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, result, result);
2374    tcg_gen_movcond_tl(TCG_COND_TSTNE, dest, cpu_cc_src, tcg_constant_tl(CC_Z),
2375                       result, dest);
2376}
2377
2378static void gen_LDMXCSR(DisasContext *s, X86DecodedInsn *decode)
2379{
2380    TCGv_i32 tmp = tcg_temp_new_i32();
2381
2382    tcg_gen_trunc_tl_i32(tmp, s->T0);
2383    gen_helper_ldmxcsr(tcg_env, tmp);
2384}
2385
2386static void gen_lxx_seg(DisasContext *s, X86DecodedInsn *decode, int seg)
2387{
2388    MemOp ot = decode->op[0].ot;
2389
2390    /* Offset already in s->T0.  */
2391    gen_add_A0_im(s, 1 << ot);
2392    gen_op_ld_v(s, MO_16, s->T1, s->A0);
2393
2394    /* load the segment here to handle exceptions properly */
2395    gen_movl_seg(s, seg, s->T1);
2396}
2397
2398static void gen_LDS(DisasContext *s, X86DecodedInsn *decode)
2399{
2400    gen_lxx_seg(s, decode, R_DS);
2401}
2402
2403static void gen_LEA(DisasContext *s, X86DecodedInsn *decode)
2404{
2405    TCGv ea = gen_lea_modrm_1(s, decode->mem, false);
2406    gen_lea_v_seg_dest(s, s->aflag, s->T0, ea, -1, -1);
2407}
2408
2409static void gen_LEAVE(DisasContext *s, X86DecodedInsn *decode)
2410{
2411    gen_leave(s);
2412}
2413
2414static void gen_LES(DisasContext *s, X86DecodedInsn *decode)
2415{
2416    gen_lxx_seg(s, decode, R_ES);
2417}
2418
2419static void gen_LFENCE(DisasContext *s, X86DecodedInsn *decode)
2420{
2421    tcg_gen_mb(TCG_MO_LD_LD | TCG_BAR_SC);
2422}
2423
2424static void gen_LFS(DisasContext *s, X86DecodedInsn *decode)
2425{
2426    gen_lxx_seg(s, decode, R_FS);
2427}
2428
2429static void gen_LGS(DisasContext *s, X86DecodedInsn *decode)
2430{
2431    gen_lxx_seg(s, decode, R_GS);
2432}
2433
2434static void gen_LODS(DisasContext *s, X86DecodedInsn *decode)
2435{
2436    MemOp ot = decode->op[1].ot;
2437    gen_repz(s, ot, gen_lods);
2438}
2439
2440static void gen_LOOP(DisasContext *s, X86DecodedInsn *decode)
2441{
2442    TCGLabel *taken = gen_new_label();
2443
2444    gen_update_cc_op(s);
2445    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
2446    gen_op_jnz_ecx(s, taken);
2447    gen_conditional_jump_labels(s, decode->immediate, NULL, taken);
2448}
2449
2450static void gen_LOOPE(DisasContext *s, X86DecodedInsn *decode)
2451{
2452    TCGLabel *taken = gen_new_label();
2453    TCGLabel *not_taken = gen_new_label();
2454
2455    gen_update_cc_op(s);
2456    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
2457    gen_op_jz_ecx(s, not_taken);
2458    gen_jcc(s, (JCC_Z << 1), taken); /* jz taken */
2459    gen_conditional_jump_labels(s, decode->immediate, not_taken, taken);
2460}
2461
2462static void gen_LOOPNE(DisasContext *s, X86DecodedInsn *decode)
2463{
2464    TCGLabel *taken = gen_new_label();
2465    TCGLabel *not_taken = gen_new_label();
2466
2467    gen_update_cc_op(s);
2468    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
2469    gen_op_jz_ecx(s, not_taken);
2470    gen_jcc(s, (JCC_Z << 1) | 1, taken); /* jnz taken */
2471    gen_conditional_jump_labels(s, decode->immediate, not_taken, taken);
2472}
2473
2474static void gen_LSL(DisasContext *s, X86DecodedInsn *decode)
2475{
2476    MemOp ot = decode->op[0].ot;
2477    TCGv result = tcg_temp_new();
2478    TCGv dest;
2479
2480    gen_compute_eflags(s);
2481    gen_update_cc_op(s);
2482    gen_helper_lsl(result, tcg_env, s->T0);
2483
2484    /* Perform writeback here to skip it if ZF=0.  */
2485    decode->op[0].unit = X86_OP_SKIP;
2486    dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, result, result);
2487    tcg_gen_movcond_tl(TCG_COND_TSTNE, dest, cpu_cc_src, tcg_constant_tl(CC_Z),
2488                       result, dest);
2489}
2490
2491static void gen_LSS(DisasContext *s, X86DecodedInsn *decode)
2492{
2493    gen_lxx_seg(s, decode, R_SS);
2494}
2495
2496static void gen_LZCNT(DisasContext *s, X86DecodedInsn *decode)
2497{
2498    MemOp ot = decode->op[0].ot;
2499
2500    /* C bit (cc_src) is defined related to the input.  */
2501    decode->cc_src = tcg_temp_new();
2502    decode->cc_dst = s->T0;
2503    decode->cc_op = CC_OP_BMILGB + ot;
2504    tcg_gen_mov_tl(decode->cc_src, s->T0);
2505
2506    /*
2507     * Reduce the target_ulong result by the number of zeros that
2508     * we expect to find at the top.
2509     */
2510    tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
2511    tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - (8 << ot));
2512}
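
/*
 * Example of the reduction, assuming a 64-bit target_ulong: a 16-bit
 * LZCNT of 0x0008 computes clz64(0x0008) = 60, subtracts 64 - 16 = 48,
 * and leaves the architectural result 12.  CC_OP_BMILG then sets CF
 * from the saved input being zero.
 */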
2513
2514static void gen_MFENCE(DisasContext *s, X86DecodedInsn *decode)
2515{
2516    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
2517}
2518
2519static void gen_MOV(DisasContext *s, X86DecodedInsn *decode)
2520{
2521    /* nothing to do! */
2522}
2523#define gen_NOP gen_MOV
2524
2525static void gen_MASKMOV(DisasContext *s, X86DecodedInsn *decode)
2526{
2527    gen_lea_v_seg(s, cpu_regs[R_EDI], R_DS, s->override);
2528
2529    if (s->prefix & PREFIX_DATA) {
2530        gen_helper_maskmov_xmm(tcg_env, OP_PTR1, OP_PTR2, s->A0);
2531    } else {
2532        gen_helper_maskmov_mmx(tcg_env, OP_PTR1, OP_PTR2, s->A0);
2533    }
2534}
2535
2536static void gen_MOVBE(DisasContext *s, X86DecodedInsn *decode)
2537{
2538    MemOp ot = decode->op[0].ot;
2539
2540    /* M operand type does not load/store */
2541    if (decode->e.op0 == X86_TYPE_M) {
2542        tcg_gen_qemu_st_tl(s->T0, s->A0, s->mem_index, ot | MO_BE);
2543    } else {
2544        tcg_gen_qemu_ld_tl(s->T0, s->A0, s->mem_index, ot | MO_BE);
2545    }
2546}
2547
2548static void gen_MOVD_from(DisasContext *s, X86DecodedInsn *decode)
2549{
2550    MemOp ot = decode->op[2].ot;
2551
2552    switch (ot) {
2553    case MO_32:
2554#ifdef TARGET_X86_64
2555        tcg_gen_ld32u_tl(s->T0, tcg_env, decode->op[2].offset);
2556        break;
2557    case MO_64:
2558#endif
2559        tcg_gen_ld_tl(s->T0, tcg_env, decode->op[2].offset);
2560        break;
2561    default:
2562        abort();
2563    }
2564}
2565
2566static void gen_MOVD_to(DisasContext *s, X86DecodedInsn *decode)
2567{
2568    MemOp ot = decode->op[2].ot;
2569    int vec_len = vector_len(s, decode);
2570    int lo_ofs = vector_elem_offset(&decode->op[0], ot, 0);
2571
2572    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2573
2574    switch (ot) {
2575    case MO_32:
2576#ifdef TARGET_X86_64
2577        tcg_gen_st32_tl(s->T1, tcg_env, lo_ofs);
2578        break;
2579    case MO_64:
2580#endif
2581        tcg_gen_st_tl(s->T1, tcg_env, lo_ofs);
2582        break;
2583    default:
2584        g_assert_not_reached();
2585    }
2586}
2587
2588static void gen_MOVDQ(DisasContext *s, X86DecodedInsn *decode)
2589{
2590    gen_store_sse(s, decode, decode->op[2].offset);
2591}
2592
2593static void gen_MOVMSK(DisasContext *s, X86DecodedInsn *decode)
2594{
2595    typeof(gen_helper_movmskps_ymm) *ps, *pd, *fn;
2596    TCGv_i32 tmp = tcg_temp_new_i32();
2597
2598    ps = s->vex_l ? gen_helper_movmskps_ymm : gen_helper_movmskps_xmm;
2599    pd = s->vex_l ? gen_helper_movmskpd_ymm : gen_helper_movmskpd_xmm;
2600    fn = s->prefix & PREFIX_DATA ? pd : ps;
2601    fn(tmp, tcg_env, OP_PTR2);
2602    tcg_gen_extu_i32_tl(s->T0, tmp);
2603}
2604
2605static void gen_MOVQ(DisasContext *s, X86DecodedInsn *decode)
2606{
2607    int vec_len = vector_len(s, decode);
2608    int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);
2609
2610    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);
2611    if (decode->op[0].has_ea) {
2612        tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
2613    } else {
2614        /*
2615         * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_64) would
2616         * seem to work, but it does not on big-endian platforms; the cleared parts
2617         * are always at higher addresses, but cross-endian emulation inverts the
2618         * byte order so that the cleared parts need to be at *lower* addresses.
2619         * Because oprsz is 8, we see this here even for SSE; but more generally,
2620         * it disqualifies using oprsz < maxsz to emulate VEX128.
2621         */
2622        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2623        tcg_gen_st_i64(s->tmp1_i64, tcg_env, lo_ofs);
2624    }
2625}
2626
2627static void gen_MOVq_dq(DisasContext *s, X86DecodedInsn *decode)
2628{
2629    gen_helper_enter_mmx(tcg_env);
2630    /* Otherwise the same as any other movq.  */
2631    return gen_MOVQ(s, decode);
2632}
2633
2634static void gen_MOVS(DisasContext *s, X86DecodedInsn *decode)
2635{
2636    MemOp ot = decode->op[2].ot;
2637    gen_repz(s, ot, gen_movs);
2638}
2639
2640static void gen_MUL(DisasContext *s, X86DecodedInsn *decode)
2641{
2642    MemOp ot = decode->op[1].ot;
2643
2644    switch (ot) {
2645    case MO_8:
2646        /* s->T0 already zero-extended */
2647        tcg_gen_ext8u_tl(s->T1, s->T1);
2648        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2649        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
2650        tcg_gen_andi_tl(s->T1, s->T0, 0xff00);
2651        decode->cc_dst = s->T0;
2652        decode->cc_src = s->T1;
2653        break;
2654
2655    case MO_16:
2656        /* s->T0 already zero-extended */
2657        tcg_gen_ext16u_tl(s->T1, s->T1);
2658        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2659        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
2660        tcg_gen_shri_tl(s->T1, s->T0, 16);
2661        gen_op_mov_reg_v(s, MO_16, R_EDX, s->T1);
2662        decode->cc_dst = s->T0;
2663        decode->cc_src = s->T1;
2664        break;
2665
2666    case MO_32:
2667#ifdef TARGET_X86_64
2668        /* s->T0 already zero-extended */
2669        tcg_gen_ext32u_tl(s->T1, s->T1);
2670        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2671        tcg_gen_ext32u_tl(cpu_regs[R_EAX], s->T0);
2672        tcg_gen_shri_tl(cpu_regs[R_EDX], s->T0, 32);
2673        decode->cc_dst = cpu_regs[R_EAX];
2674        decode->cc_src = cpu_regs[R_EDX];
2675        break;
2676
2677    case MO_64:
2678#endif
2679        tcg_gen_mulu2_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], s->T0, s->T1);
2680        decode->cc_dst = cpu_regs[R_EAX];
2681        decode->cc_src = cpu_regs[R_EDX];
2682        break;
2683
2684    default:
2685        g_assert_not_reached();
2686    }
2687
2688    decode->cc_op = CC_OP_MULB + ot;
2689}
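
/*
 * CC_OP_MUL sets CF = OF = (cc_src != 0), i.e. whenever the upper half
 * of the product is nonzero.  E.g. 8-bit MUL with AL = 0x80 and a source
 * of 2 gives AX = 0x0100; T1 = AX & 0xff00 = 0x0100 is nonzero, so
 * CF = OF = 1.
 */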
2690
2691static void gen_MULX(DisasContext *s, X86DecodedInsn *decode)
2692{
2693    MemOp ot = decode->op[0].ot;
2694
2695    /* low part of result in VEX.vvvv, high in MODRM */
2696    switch (ot) {
2697    case MO_32:
2698#ifdef TARGET_X86_64
2699        {
2700            TCGv_i32 t0 = tcg_temp_new_i32();
2701            TCGv_i32 t1 = tcg_temp_new_i32();
2702
2703            tcg_gen_trunc_tl_i32(t0, s->T0);
2704            tcg_gen_trunc_tl_i32(t1, s->T1);
2705            tcg_gen_mulu2_i32(t0, t1, t0, t1);
2706            tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], t0);
2707            tcg_gen_extu_i32_tl(s->T0, t1);
2708            break;
2709        }
2710
2711    case MO_64:
2712#endif
2713        tcg_gen_mulu2_tl(cpu_regs[s->vex_v], s->T0, s->T0, s->T1);
2714        break;
2715
2716    default:
2717        g_assert_not_reached();
2718    }
2719}
2720
2721static void gen_NEG(DisasContext *s, X86DecodedInsn *decode)
2722{
2723    MemOp ot = decode->op[0].ot;
2724    TCGv oldv = tcg_temp_new();
2725
2726    if (s->prefix & PREFIX_LOCK) {
2727        TCGv newv = tcg_temp_new();
2728        TCGv cmpv = tcg_temp_new();
2729        TCGLabel *label1 = gen_new_label();
2730
2731        gen_set_label(label1);
2732        gen_op_ld_v(s, ot, oldv, s->A0);
2733        tcg_gen_neg_tl(newv, oldv);
2734        tcg_gen_atomic_cmpxchg_tl(cmpv, s->A0, oldv, newv,
2735                                  s->mem_index, ot | MO_LE);
2736        tcg_gen_brcond_tl(TCG_COND_NE, oldv, cmpv, label1);
2737    } else {
2738        tcg_gen_mov_tl(oldv, s->T0);
2739    }
2740    tcg_gen_neg_tl(s->T0, oldv);
2741
2742    decode->cc_dst = s->T0;
2743    decode->cc_src = oldv;
2744    tcg_gen_movi_tl(s->cc_srcT, 0);
2745    decode->cc_op = CC_OP_SUBB + ot;
2746}
2747
2748static void gen_NOT(DisasContext *s, X86DecodedInsn *decode)
2749{
2750    MemOp ot = decode->op[0].ot;
2751
2752    if (s->prefix & PREFIX_LOCK) {
2753        tcg_gen_movi_tl(s->T0, ~0);
2754        tcg_gen_atomic_xor_fetch_tl(s->T0, s->A0, s->T0,
2755                                    s->mem_index, ot | MO_LE);
2756    } else {
2757        tcg_gen_not_tl(s->T0, s->T0);
2758    }
2759}
2760
2761static void gen_OR(DisasContext *s, X86DecodedInsn *decode)
2762{
2763    MemOp ot = decode->op[1].ot;
2764
2765    if (s->prefix & PREFIX_LOCK) {
2766        tcg_gen_atomic_or_fetch_tl(s->T0, s->A0, s->T1,
2767                                   s->mem_index, ot | MO_LE);
2768    } else {
2769        tcg_gen_or_tl(s->T0, s->T0, s->T1);
2770    }
2771    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
2772}
2773
2774static void gen_OUT(DisasContext *s, X86DecodedInsn *decode)
2775{
2776    MemOp ot = decode->op[1].ot;
2777    TCGv_i32 port = tcg_temp_new_i32();
2778    TCGv_i32 value = tcg_temp_new_i32();
2779
2780    tcg_gen_trunc_tl_i32(port, s->T1);
2781    tcg_gen_ext16u_i32(port, port);
2782    if (!gen_check_io(s, ot, port, 0)) {
2783        return;
2784    }
2785    tcg_gen_trunc_tl_i32(value, s->T0);
2786    translator_io_start(&s->base);
2787    gen_helper_out_func(ot, port, value);
2788    gen_bpt_io(s, port, ot);
2789}
2790
2791static void gen_OUTS(DisasContext *s, X86DecodedInsn *decode)
2792{
2793    MemOp ot = decode->op[1].ot;
2794    TCGv_i32 port = tcg_temp_new_i32();
2795
2796    tcg_gen_trunc_tl_i32(port, s->T1);
2797    tcg_gen_ext16u_i32(port, port);
2798    if (!gen_check_io(s, ot, port, SVM_IOIO_STR_MASK)) {
2799        return;
2800    }
2801
2802    translator_io_start(&s->base);
2803    gen_repz(s, ot, gen_outs);
2804}
2805
2806static void gen_PALIGNR(DisasContext *s, X86DecodedInsn *decode)
2807{
2808    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2809    if (!(s->prefix & PREFIX_DATA)) {
2810        gen_helper_palignr_mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2811    } else if (!s->vex_l) {
2812        gen_helper_palignr_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2813    } else {
2814        gen_helper_palignr_ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2815    }
2816}
2817
2818static void gen_PANDN(DisasContext *s, X86DecodedInsn *decode)
2819{
2820    int vec_len = vector_len(s, decode);
2821
2822    /* Careful, operand order is reversed!  */
2823    tcg_gen_gvec_andc(MO_64,
2824                      decode->op[0].offset, decode->op[2].offset,
2825                      decode->op[1].offset, vec_len, vec_len);
2826}
2827
2828static void gen_PAUSE(DisasContext *s, X86DecodedInsn *decode)
2829{
2830    gen_update_cc_op(s);
2831    gen_update_eip_next(s);
2832    gen_helper_pause(tcg_env);
2833    s->base.is_jmp = DISAS_NORETURN;
2834}
2835
2836static void gen_PCMPESTRI(DisasContext *s, X86DecodedInsn *decode)
2837{
2838    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2839    gen_helper_pcmpestri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2840    assume_cc_op(s, CC_OP_EFLAGS);
2841}
2842
2843static void gen_PCMPESTRM(DisasContext *s, X86DecodedInsn *decode)
2844{
2845    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2846    gen_helper_pcmpestrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2847    assume_cc_op(s, CC_OP_EFLAGS);
2848    if ((s->prefix & PREFIX_VEX) && !s->vex_l) {
2849        tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
2850                             16, 16, 0);
2851    }
2852}
2853
2854static void gen_PCMPISTRI(DisasContext *s, X86DecodedInsn *decode)
2855{
2856    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2857    gen_helper_pcmpistri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2858    assume_cc_op(s, CC_OP_EFLAGS);
2859}
2860
2861static void gen_PCMPISTRM(DisasContext *s, X86DecodedInsn *decode)
2862{
2863    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2864    gen_helper_pcmpistrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2865    assume_cc_op(s, CC_OP_EFLAGS);
2866    if ((s->prefix & PREFIX_VEX) && !s->vex_l) {
2867        tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
2868                             16, 16, 0);
2869    }
2870}
2871
2872static void gen_PDEP(DisasContext *s, X86DecodedInsn *decode)
2873{
2874    gen_helper_pdep(s->T0, s->T0, s->T1);
2875}
2876
2877static void gen_PEXT(DisasContext *s, X86DecodedInsn *decode)
2878{
2879    gen_helper_pext(s->T0, s->T0, s->T1);
2880}
2881
2882static inline void gen_pextr(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
2883{
2884    int vec_len = vector_len(s, decode);
2885    int mask = (vec_len >> ot) - 1;
2886    int val = decode->immediate & mask;
2887
2888    switch (ot) {
2889    case MO_8:
2890        tcg_gen_ld8u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2891        break;
2892    case MO_16:
2893        tcg_gen_ld16u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2894        break;
2895    case MO_32:
2896#ifdef TARGET_X86_64
2897        tcg_gen_ld32u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2898        break;
2899    case MO_64:
2900#endif
2901        tcg_gen_ld_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2902        break;
2903    default:
2904        abort();
2905    }
2906}
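
/*
 * The immediate is reduced modulo the element count, matching hardware.
 * E.g. for a 16-byte vector of MO_16 elements, mask = (16 >> 1) - 1 = 7,
 * so PEXTRW with an immediate of 9 reads element 1.
 */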
2907
2908static void gen_PEXTRB(DisasContext *s, X86DecodedInsn *decode)
2909{
2910    gen_pextr(s, decode, MO_8);
2911}
2912
2913static void gen_PEXTRW(DisasContext *s, X86DecodedInsn *decode)
2914{
2915    gen_pextr(s, decode, MO_16);
2916}
2917
2918static void gen_PEXTR(DisasContext *s, X86DecodedInsn *decode)
2919{
2920    MemOp ot = decode->op[0].ot;
2921    gen_pextr(s, decode, ot);
2922}
2923
2924static inline void gen_pinsr(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
2925{
2926    int vec_len = vector_len(s, decode);
2927    int mask = (vec_len >> ot) - 1;
2928    int val = decode->immediate & mask;
2929
2930    if (decode->op[1].offset != decode->op[0].offset) {
2931        assert(vec_len == 16);
2932        gen_store_sse(s, decode, decode->op[1].offset);
2933    }
2934
2935    switch (ot) {
2936    case MO_8:
2937        tcg_gen_st8_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2938        break;
2939    case MO_16:
2940        tcg_gen_st16_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2941        break;
2942    case MO_32:
2943#ifdef TARGET_X86_64
2944        tcg_gen_st32_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2945        break;
2946    case MO_64:
2947#endif
2948        tcg_gen_st_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2949        break;
2950    default:
2951        abort();
2952    }
2953}
2954
2955static void gen_PINSRB(DisasContext *s, X86DecodedInsn *decode)
2956{
2957    gen_pinsr(s, decode, MO_8);
2958}
2959
2960static void gen_PINSRW(DisasContext *s, X86DecodedInsn *decode)
2961{
2962    gen_pinsr(s, decode, MO_16);
2963}
2964
2965static void gen_PINSR(DisasContext *s, X86DecodedInsn *decode)
2966{
2967    gen_pinsr(s, decode, decode->op[2].ot);
2968}
2969
2970static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s)
2971{
2972    TCGv_i64 t = tcg_temp_new_i64();
2973
2974    tcg_gen_andi_i64(d, s, 0x8080808080808080ull);
2975
2976    /*
2977     * After each shift+or pair:
2978     * 0:  a.......b.......c.......d.......e.......f.......g.......h.......
2979     * 7:  ab......bc......cd......de......ef......fg......gh......h.......
2980     * 14: abcd....bcde....cdef....defg....efgh....fgh.....gh......h.......
2981     * 28: abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h.......
2982     * The result is left in the high bits of the word.
2983     */
2984    tcg_gen_shli_i64(t, d, 7);
2985    tcg_gen_or_i64(d, d, t);
2986    tcg_gen_shli_i64(t, d, 14);
2987    tcg_gen_or_i64(d, d, t);
2988    tcg_gen_shli_i64(t, d, 28);
2989    tcg_gen_or_i64(d, d, t);
2990}
2991
2992static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s)
2993{
2994    TCGv_vec t = tcg_temp_new_vec_matching(d);
2995    TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80);
2996
2997    /* See above */
2998    tcg_gen_and_vec(vece, d, s, m);
2999    tcg_gen_shli_vec(vece, t, d, 7);
3000    tcg_gen_or_vec(vece, d, d, t);
3001    tcg_gen_shli_vec(vece, t, d, 14);
3002    tcg_gen_or_vec(vece, d, d, t);
3003    tcg_gen_shli_vec(vece, t, d, 28);
3004    tcg_gen_or_vec(vece, d, d, t);
3005}
3006
3007static void gen_PMOVMSKB(DisasContext *s, X86DecodedInsn *decode)
3008{
3009    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
3010    static const GVecGen2 g = {
3011        .fni8 = gen_pmovmskb_i64,
3012        .fniv = gen_pmovmskb_vec,
3013        .opt_opc = vecop_list,
3014        .vece = MO_64,
3015        .prefer_i64 = TCG_TARGET_REG_BITS == 64
3016    };
3017    MemOp ot = decode->op[2].ot;
3018    int vec_len = vector_len(s, decode);
3019    TCGv t = tcg_temp_new();
3020
3021    tcg_gen_gvec_2(offsetof(CPUX86State, xmm_t0) + xmm_offset(ot), decode->op[2].offset,
3022                   vec_len, vec_len, &g);
3023    tcg_gen_ld8u_tl(s->T0, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
3024    while (vec_len > 8) {
3025        vec_len -= 8;
3026        if (tcg_op_supported(INDEX_op_extract2_tl, TCG_TYPE_TL, 0)) {
3027            /*
3028             * Load the next byte of the result into the high byte of T.
3029             * TCG does a similar expansion of deposit to shl+extract2; by
3030             * loading the whole word, the shift left is avoided.
3031             */
3032#ifdef TARGET_X86_64
3033            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_Q((vec_len - 1) / 8)));
3034#else
3035            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L((vec_len - 1) / 4)));
3036#endif
3037
3038            tcg_gen_extract2_tl(s->T0, t, s->T0, TARGET_LONG_BITS - 8);
3039        } else {
3040            /*
3041             * The _previous_ value is deposited into bits 8 and higher of t.  Because
3042             * those bits are known to be zero after ld8u, this becomes a shift+or
3043             * if deposit is not available.
3044             */
3045            tcg_gen_ld8u_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
3046            tcg_gen_deposit_tl(s->T0, t, s->T0, 8, TARGET_LONG_BITS - 8);
3047        }
3048    }
3049}
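
/*
 * After the gvec expansion, each 8-byte group holds its eight sign bits
 * gathered in its top byte.  The loop then walks the groups from highest
 * to lowest, each step shifting the accumulated mask left by 8 and
 * inserting the next group's byte at the bottom, either via extract2 on
 * a whole-word load or via the ld8u+deposit fallback.
 */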
3050
3051static void gen_POP(DisasContext *s, X86DecodedInsn *decode)
3052{
3053    X86DecodedOp *op = &decode->op[0];
3054    MemOp ot = gen_pop_T0(s);
3055
3056    assert(ot >= op->ot);
3057    if (op->has_ea || op->unit == X86_OP_SEG) {
3058        /* NOTE: order is important for MMU exceptions */
3059        gen_writeback(s, decode, 0, s->T0);
3060    }
3061
3062    /* NOTE: writing back registers after update is important for pop %sp */
3063    gen_pop_update(s, ot);
3064}
3065
3066static void gen_POPA(DisasContext *s, X86DecodedInsn *decode)
3067{
3068    gen_popa(s);
3069}
3070
3071static void gen_POPCNT(DisasContext *s, X86DecodedInsn *decode)
3072{
3073    decode->cc_dst = tcg_temp_new();
3074    decode->cc_op = CC_OP_POPCNT;
3075
3076    tcg_gen_mov_tl(decode->cc_dst, s->T0);
3077    tcg_gen_ctpop_tl(s->T0, s->T0);
3078}
3079
3080static void gen_POPF(DisasContext *s, X86DecodedInsn *decode)
3081{
3082    MemOp ot;
3083    int mask = TF_MASK | AC_MASK | ID_MASK | NT_MASK;
3084
3085    if (CPL(s) == 0) {
3086        mask |= IF_MASK | IOPL_MASK;
3087    } else if (CPL(s) <= IOPL(s)) {
3088        mask |= IF_MASK;
3089    }
3090    if (s->dflag == MO_16) {
3091        mask &= 0xffff;
3092    }
3093
3094    ot = gen_pop_T0(s);
3095    gen_helper_write_eflags(tcg_env, s->T0, tcg_constant_i32(mask));
3096    gen_pop_update(s, ot);
3097    set_cc_op(s, CC_OP_EFLAGS);
3098    /* abort translation because TF/AC flag may change */
3099    s->base.is_jmp = DISAS_EOB_NEXT;
3100}
3101
3102static void gen_PSHUFW(DisasContext *s, X86DecodedInsn *decode)
3103{
3104    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
3105    gen_helper_pshufw_mmx(OP_PTR0, OP_PTR1, imm);
3106}
3107
3108static void gen_PSRLW_i(DisasContext *s, X86DecodedInsn *decode)
3109{
3110    int vec_len = vector_len(s, decode);
3111
3112    if (decode->immediate >= 16) {
3113        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
3114    } else {
3115        tcg_gen_gvec_shri(MO_16,
3116                          decode->op[0].offset, decode->op[1].offset,
3117                          decode->immediate, vec_len, vec_len);
3118    }
3119}
3120
3121static void gen_PSLLW_i(DisasContext *s, X86DecodedInsn *decode)
3122{
3123    int vec_len = vector_len(s, decode);
3124
3125    if (decode->immediate >= 16) {
3126        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
3127    } else {
3128        tcg_gen_gvec_shli(MO_16,
3129                          decode->op[0].offset, decode->op[1].offset,
3130                          decode->immediate, vec_len, vec_len);
3131    }
3132}
3133
3134static void gen_PSRAW_i(DisasContext *s, X86DecodedInsn *decode)
3135{
3136    int vec_len = vector_len(s, decode);
3137
3138    if (decode->immediate >= 16) {
3139        decode->immediate = 15;
3140    }
3141    tcg_gen_gvec_sari(MO_16,
3142                      decode->op[0].offset, decode->op[1].offset,
3143                      decode->immediate, vec_len, vec_len);
3144}
3145
3146static void gen_PSRLD_i(DisasContext *s, X86DecodedInsn *decode)
3147{
3148    int vec_len = vector_len(s, decode);
3149
3150    if (decode->immediate >= 32) {
3151        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
3152    } else {
3153        tcg_gen_gvec_shri(MO_32,
3154                          decode->op[0].offset, decode->op[1].offset,
3155                          decode->immediate, vec_len, vec_len);
3156    }
3157}
3158
3159static void gen_PSLLD_i(DisasContext *s, X86DecodedInsn *decode)
3160{
3161    int vec_len = vector_len(s, decode);
3162
3163    if (decode->immediate >= 32) {
3164        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
3165    } else {
3166        tcg_gen_gvec_shli(MO_32,
3167                          decode->op[0].offset, decode->op[1].offset,
3168                          decode->immediate, vec_len, vec_len);
3169    }
3170}
3171
3172static void gen_PSRAD_i(DisasContext *s, X86DecodedInsn *decode)
3173{
3174    int vec_len = vector_len(s, decode);
3175
3176    if (decode->immediate >= 32) {
3177        decode->immediate = 31;
3178    }
3179    tcg_gen_gvec_sari(MO_32,
3180                      decode->op[0].offset, decode->op[1].offset,
3181                      decode->immediate, vec_len, vec_len);
3182}
3183
3184static void gen_PSRLQ_i(DisasContext *s, X86DecodedInsn *decode)
3185{
3186    int vec_len = vector_len(s, decode);
3187
3188    if (decode->immediate >= 64) {
3189        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
3190    } else {
3191        tcg_gen_gvec_shri(MO_64,
3192                          decode->op[0].offset, decode->op[1].offset,
3193                          decode->immediate, vec_len, vec_len);
3194    }
3195}
3196
3197static void gen_PSLLQ_i(DisasContext *s, X86DecodedInsn *decode)
3198{
3199    int vec_len = vector_len(s, decode);
3200
3201    if (decode->immediate >= 64) {
3202        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
3203    } else {
3204        tcg_gen_gvec_shli(MO_64,
3205                          decode->op[0].offset, decode->op[1].offset,
3206                          decode->immediate, vec_len, vec_len);
3207    }
3208}
3209
3210static TCGv_ptr make_imm8u_xmm_vec(uint8_t imm, int vec_len)
3211{
3212    MemOp ot = vec_len == 16 ? MO_128 : MO_256;
3213    TCGv_i32 imm_v = tcg_constant8u_i32(imm);
3214    TCGv_ptr ptr = tcg_temp_new_ptr();
3215
3216    tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_t0) + xmm_offset(ot),
3217                         vec_len, vec_len, 0);
3218
3219    tcg_gen_addi_ptr(ptr, tcg_env, offsetof(CPUX86State, xmm_t0));
3220    tcg_gen_st_i32(imm_v, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
3221    return ptr;
3222}
3223
3224static void gen_PSRLDQ_i(DisasContext *s, X86DecodedInsn *decode)
3225{
3226    int vec_len = vector_len(s, decode);
3227    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
3228
3229    if (s->vex_l) {
3230        gen_helper_psrldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
3231    } else {
3232        gen_helper_psrldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
3233    }
3234}
3235
3236static void gen_PSLLDQ_i(DisasContext *s, X86DecodedInsn *decode)
3237{
3238    int vec_len = vector_len(s, decode);
3239    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
3240
3241    if (s->vex_l) {
3242        gen_helper_pslldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
3243    } else {
3244        gen_helper_pslldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
3245    }
3246}
3247
3248static void gen_PUSH(DisasContext *s, X86DecodedInsn *decode)
3249{
3250    gen_push_v(s, s->T0);
3251}
3252
3253static void gen_PUSHA(DisasContext *s, X86DecodedInsn *decode)
3254{
3255    gen_pusha(s);
3256}
3257
3258static void gen_PUSHF(DisasContext *s, X86DecodedInsn *decode)
3259{
3260    gen_update_cc_op(s);
3261    gen_helper_read_eflags(s->T0, tcg_env);
3262    gen_push_v(s, s->T0);
3263}
3264
3265static MemOp gen_shift_count(DisasContext *s, X86DecodedInsn *decode,
3266                             bool *can_be_zero, TCGv *count, int unit)
3267{
3268    MemOp ot = decode->op[0].ot;
3269    int mask = (ot <= MO_32 ? 0x1f : 0x3f);
3270
3271    *can_be_zero = false;
3272    switch (unit) {
3273    case X86_OP_INT:
3274        *count = tcg_temp_new();
3275        tcg_gen_andi_tl(*count, cpu_regs[R_ECX], mask);
3276        *can_be_zero = true;
3277        break;
3278
3279    case X86_OP_IMM:
3280        if ((decode->immediate & mask) == 0) {
3281            *count = NULL;
3282            break;
3283        }
3284        *count = tcg_temp_new();
3285        tcg_gen_movi_tl(*count, decode->immediate & mask);
3286        break;
3287
3288    case X86_OP_SKIP:
3289        *count = tcg_temp_new();
3290        tcg_gen_movi_tl(*count, 1);
3291        break;
3292
3293    default:
3294        g_assert_not_reached();
3295    }
3296
3297    return ot;
3298}
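
/*
 * Note on the masking in gen_shift_count: as on hardware, only the low
 * 5 bits of the count matter for 8/16/32-bit operands and the low 6 bits
 * for 64-bit ones, so e.g. "shl $33, %ax" shifts by 1.  X86_OP_SKIP is
 * used for the one-bit shift forms, whose count is implicitly 1.
 */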
3299
3300/*
3301 * Compute existing flags in decode->cc_src, for gen_* functions that want
3302 * to set cc_op to CC_OP_ADCOX.  In particular, this allows rotate
3303 * operations to compute the carry in decode->cc_dst and the overflow in
3304 * decode->cc_src2.
3305 *
3306 * If need_flags is true, decode->cc_dst and decode->cc_src2 are preloaded
3307 * with the value of CF and OF before the instruction, so that it is possible
3308 * to keep the flags unmodified.
3309 *
3310 * Return true if carry could be made available cheaply as a 1-bit value in
3311 * decode->cc_dst (trying a bit harder if want_carry is true).  If false is
3312 * returned, decode->cc_dst is uninitialized and the carry is only available
3313 * as bit 0 of decode->cc_src.
3314 */
3315static bool gen_eflags_adcox(DisasContext *s, X86DecodedInsn *decode, bool want_carry, bool need_flags)
3316{
3317    bool got_cf = false;
3318    bool got_of = false;
3319
3320    decode->cc_dst = tcg_temp_new();
3321    decode->cc_src = tcg_temp_new();
3322    decode->cc_src2 = tcg_temp_new();
3323    decode->cc_op = CC_OP_ADCOX;
3324
3325    /* A lot more cc_ops could be "optimized" to avoid the extracts at
3326     * the end (INC/DEC, BMILG, MUL), but they are all really unlikely
3327     * to be followed by rotations within the same basic block.
3328     */
3329    switch (s->cc_op) {
3330    case CC_OP_ADCOX:
3331        /* No need to compute the full EFLAGS, CF/OF are already isolated.  */
3332        tcg_gen_mov_tl(decode->cc_src, cpu_cc_src);
3333        if (need_flags) {
3334            tcg_gen_mov_tl(decode->cc_src2, cpu_cc_src2);
3335            got_of = true;
3336        }
3337        if (want_carry || need_flags) {
3338            tcg_gen_mov_tl(decode->cc_dst, cpu_cc_dst);
3339            got_cf = true;
3340        }
3341        break;
3342
3343    case CC_OP_LOGICB ... CC_OP_LOGICQ:
3344        /* CF and OF are zero, do it just because it's easy.  */
3345        gen_mov_eflags(s, decode->cc_src);
3346        if (need_flags) {
3347            tcg_gen_movi_tl(decode->cc_src2, 0);
3348            got_of = true;
3349        }
3350        if (want_carry || need_flags) {
3351            tcg_gen_movi_tl(decode->cc_dst, 0);
3352            got_cf = true;
3353        }
3354        break;
3355
3356    case CC_OP_SARB ... CC_OP_SARQ:
3357        /*
3358         * A sequence like SHR/RCR/SHR/RCR/... is a relatively common use of RCR.
3359         * By computing CF without using eflags, the calls to cc_compute_all
3360         * can be eliminated as dead code (except for the last RCR).
3361         */
3362        if (want_carry || need_flags) {
3363            tcg_gen_andi_tl(decode->cc_dst, cpu_cc_src, 1);
3364            got_cf = true;
3365        }
3366        gen_mov_eflags(s, decode->cc_src);
3367        break;
3368
3369    case CC_OP_SHLB ... CC_OP_SHLQ:
3370        /*
3371         * Likewise for SHL/RCL/SHL/RCL/... but, if CF is not in the sign
3372         * bit, we might as well fish CF out of EFLAGS and save a shift.
3373         */
3374        if (want_carry && (!need_flags || s->cc_op == CC_OP_SHLB + MO_TL)) {
3375            MemOp size = cc_op_size(s->cc_op);
3376            tcg_gen_shri_tl(decode->cc_dst, cpu_cc_src, (8 << size) - 1);
3377            got_cf = true;
3378        }
3379        gen_mov_eflags(s, decode->cc_src);
3380        break;
3381
3382    default:
3383        gen_mov_eflags(s, decode->cc_src);
3384        break;
3385    }
3386
3387    if (need_flags) {
3388        /* If the flags could be left unmodified, always load them.  */
3389        if (!got_of) {
3390            tcg_gen_extract_tl(decode->cc_src2, decode->cc_src, ctz32(CC_O), 1);
3391            got_of = true;
3392        }
3393        if (!got_cf) {
3394            tcg_gen_extract_tl(decode->cc_dst, decode->cc_src, ctz32(CC_C), 1);
3395            got_cf = true;
3396        }
3397    }
3398    return got_cf;
3399}
3400
3401static void gen_rot_overflow(X86DecodedInsn *decode, TCGv result, TCGv old,
3402                             bool can_be_zero, TCGv count)
3403{
3404    MemOp ot = decode->op[0].ot;
3405    TCGv temp = can_be_zero ? tcg_temp_new() : decode->cc_src2;
3406
3407    tcg_gen_xor_tl(temp, old, result);
3408    tcg_gen_extract_tl(temp, temp, (8 << ot) - 1, 1);
3409    if (can_be_zero) {
3410        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_src2, count, tcg_constant_tl(0),
3411                           decode->cc_src2, temp);
3412    }
3413}
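
/*
 * Worked example for the computation above: an 8-bit ROL of 0x80 by 1
 * gives result = 0x01, so old ^ result = 0x81 and extracting bit 7 sets
 * OF, matching the architectural OF = CF ^ MSB(result) = 1 ^ 0 for a
 * rotate by 1 (for other counts OF is undefined anyway).
 */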
3414
3415/*
3416 * RCx operations are invariant modulo 8*operand_size+1.  For 8- and 16-bit operands
3417 * this is below the 0x1f mask applied by gen_shift_count, so the count is reduced further.
3418 */
3419static void gen_rotc_mod(MemOp ot, TCGv count)
3420{
3421    TCGv temp;
3422
3423    switch (ot) {
3424    case MO_8:
3425        temp = tcg_temp_new();
3426        tcg_gen_subi_tl(temp, count, 18);
3427        tcg_gen_movcond_tl(TCG_COND_GE, count, temp, tcg_constant_tl(0), temp, count);
3428        tcg_gen_subi_tl(temp, count, 9);
3429        tcg_gen_movcond_tl(TCG_COND_GE, count, temp, tcg_constant_tl(0), temp, count);
3430        break;
3431
3432    case MO_16:
3433        temp = tcg_temp_new();
3434        tcg_gen_subi_tl(temp, count, 17);
3435        tcg_gen_movcond_tl(TCG_COND_GE, count, temp, tcg_constant_tl(0), temp, count);
3436        break;
3437
3438    default:
3439        break;
3440    }
3441}
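
/*
 * Reference sketch of the MO_8 case above (hypothetical helper, for
 * illustration only, not used by the translator): the two conditional
 * subtractions compute count % 9 for any count that already fits
 * gen_shift_count's 0x1f mask.
 */
static inline unsigned gen_rotc_mod_example(unsigned count)
{
    if (count >= 18) {      /* first subi+movcond pair */
        count -= 18;
    }
    if (count >= 9) {       /* second subi+movcond pair */
        count -= 9;
    }
    return count;           /* == original count % 9 for count <= 31 */
}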
3442
3443/*
3444 * The idea here is that the bit to the right of the new bit 0 is the
3445 * new carry, and the bit to the right of the old bit 0 is the old carry.
3446 * Just like a regular rotation, the result of the rotation is composed
3447 * from a right shifted part and a left shifted part of s->T0.  The new carry
3448 * is extracted from the right-shifted portion, and the old carry is
3449 * inserted at the end of the left-shifted portion.
3450 *
3451 * Because of the separate shifts involving the carry, gen_RCL and gen_RCR
3452 * mostly operate on count-1.  This also comes in handy when computing
3453 * length - count, because (length-1) - (count-1) can be computed with
3454 * a XOR, and that is commutative unlike subtraction.
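 *
 * For example, an 8-bit RCL of T0 = 0xb1 with CF = 1 and count = 3:
 * high = ((T0 << 1) | 1) << 2 = 0x58c and low = T0 >> (7 ^ 2) = 0x05, so
 * the new CF is low & 1 = 1 and (low >> 1) | high has low byte 0x8e; the
 * same as rotating the 9-bit value CF:T0 = 1_10110001 left by 3.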
3455 */
3456static void gen_RCL(DisasContext *s, X86DecodedInsn *decode)
3457{
3458    bool have_1bit_cin, can_be_zero;
3459    TCGv count;
3460    TCGLabel *zero_label = NULL;
3461    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3462    TCGv low, high, low_count;
3463
3464    if (!count) {
3465        return;
3466    }
3467
3468    low = tcg_temp_new();
3469    high = tcg_temp_new();
3470    low_count = tcg_temp_new();
3471
3472    gen_rotc_mod(ot, count);
3473    have_1bit_cin = gen_eflags_adcox(s, decode, true, can_be_zero);
3474    if (can_be_zero) {
3475        zero_label = gen_new_label();
3476        tcg_gen_brcondi_tl(TCG_COND_EQ, count, 0, zero_label);
3477    }
3478
3479    /* Compute high part, including incoming carry.  */
3480    if (!have_1bit_cin || tcg_op_deposit_valid(TCG_TYPE_TL, 1, TARGET_LONG_BITS - 1)) {
3481        /* high = (T0 << 1) | cin */
3482        TCGv cin = have_1bit_cin ? decode->cc_dst : decode->cc_src;
3483        tcg_gen_deposit_tl(high, cin, s->T0, 1, TARGET_LONG_BITS - 1);
3484    } else {
3485        /* Same as above but without deposit; cin in cc_dst.  */
3486        tcg_gen_add_tl(high, s->T0, decode->cc_dst);
3487        tcg_gen_add_tl(high, high, s->T0);
3488    }
3489    tcg_gen_subi_tl(count, count, 1);
3490    tcg_gen_shl_tl(high, high, count);
3491
3492    /* Compute low part and outgoing carry; incoming s->T0 is zero-extended */
3493    tcg_gen_xori_tl(low_count, count, (8 << ot) - 1); /* LENGTH - 1 - (count - 1) */
3494    tcg_gen_shr_tl(low, s->T0, low_count);
3495    tcg_gen_andi_tl(decode->cc_dst, low, 1);
3496    tcg_gen_shri_tl(low, low, 1);
3497
3498    /* Compute result and outgoing overflow */
3499    tcg_gen_mov_tl(decode->cc_src2, s->T0);
3500    tcg_gen_or_tl(s->T0, low, high);
3501    gen_rot_overflow(decode, s->T0, decode->cc_src2, false, NULL);
3502
3503    if (zero_label) {
3504        gen_set_label(zero_label);
3505    }
3506}
3507
3508static void gen_RCR(DisasContext *s, X86DecodedInsn *decode)
3509{
3510    bool have_1bit_cin, can_be_zero;
3511    TCGv count;
3512    TCGLabel *zero_label = NULL;
3513    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3514    TCGv low, high, high_count;
3515
3516    if (!count) {
3517        return;
3518    }
3519
3520    low = tcg_temp_new();
3521    high = tcg_temp_new();
3522    high_count = tcg_temp_new();
3523
3524    gen_rotc_mod(ot, count);
3525    have_1bit_cin = gen_eflags_adcox(s, decode, true, can_be_zero);
3526    if (can_be_zero) {
3527        zero_label = gen_new_label();
3528        tcg_gen_brcondi_tl(TCG_COND_EQ, count, 0, zero_label);
3529    }
3530
3531    /* Save incoming carry into high, it will be shifted later.  */
3532    if (!have_1bit_cin || tcg_op_deposit_valid(TCG_TYPE_TL, 1, TARGET_LONG_BITS - 1)) {
3533        TCGv cin = have_1bit_cin ? decode->cc_dst : decode->cc_src;
3534        tcg_gen_deposit_tl(high, cin, s->T0, 1, TARGET_LONG_BITS - 1);
3535    } else {
3536        /* Same as above but without deposit; cin in cc_dst.  */
3537        tcg_gen_add_tl(high, s->T0, decode->cc_dst);
3538        tcg_gen_add_tl(high, high, s->T0);
3539    }
3540
3541    /* Compute low part and outgoing carry; incoming s->T0 is zero-extended */
3542    tcg_gen_subi_tl(count, count, 1);
3543    tcg_gen_shr_tl(low, s->T0, count);
3544    tcg_gen_andi_tl(decode->cc_dst, low, 1);
3545    tcg_gen_shri_tl(low, low, 1);
3546
3547    /* Move high part to the right position */
3548    tcg_gen_xori_tl(high_count, count, (8 << ot) - 1); /* LENGTH - 1 - (count - 1) */
3549    tcg_gen_shl_tl(high, high, high_count);
3550
3551    /* Compute result and outgoing overflow */
3552    tcg_gen_mov_tl(decode->cc_src2, s->T0);
3553    tcg_gen_or_tl(s->T0, low, high);
3554    gen_rot_overflow(decode, s->T0, decode->cc_src2, false, NULL);
3555
3556    if (zero_label) {
3557        gen_set_label(zero_label);
3558    }
3559}
3560
3561#ifdef CONFIG_USER_ONLY
3562static void gen_unreachable(DisasContext *s, X86DecodedInsn *decode)
3563{
3564    g_assert_not_reached();
3565}
3566#endif
3567
3568#ifndef CONFIG_USER_ONLY
3569static void gen_RDMSR(DisasContext *s, X86DecodedInsn *decode)
3570{
3571    gen_update_cc_op(s);
3572    gen_update_eip_cur(s);
3573    gen_helper_rdmsr(tcg_env);
3574}
3575#else
3576#define gen_RDMSR gen_unreachable
3577#endif
3578
3579static void gen_RDPMC(DisasContext *s, X86DecodedInsn *decode)
3580{
3581    gen_update_cc_op(s);
3582    gen_update_eip_cur(s);
3583    translator_io_start(&s->base);
3584    gen_helper_rdpmc(tcg_env);
3585    s->base.is_jmp = DISAS_NORETURN;
3586}
3587
3588static void gen_RDTSC(DisasContext *s, X86DecodedInsn *decode)
3589{
3590    gen_update_cc_op(s);
3591    gen_update_eip_cur(s);
3592    translator_io_start(&s->base);
3593    gen_helper_rdtsc(tcg_env);
3594}
3595
3596static void gen_RDxxBASE(DisasContext *s, X86DecodedInsn *decode)
3597{
3598    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
3599
3600    /* Preserve hflags bits by testing CR4 at runtime.  */
3601    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
3602    tcg_gen_mov_tl(s->T0, base);
3603}
3604
3605static void gen_RET(DisasContext *s, X86DecodedInsn *decode)
3606{
3607    int16_t adjust = decode->e.op1 == X86_TYPE_I ? decode->immediate : 0;
3608
3609    MemOp ot = gen_pop_T0(s);
3610    gen_stack_update(s, adjust + (1 << ot));
3611    gen_op_jmp_v(s, s->T0);
3612    gen_bnd_jmp(s);
3613    s->base.is_jmp = DISAS_JUMP;
3614}
3615
3616static void gen_RETF(DisasContext *s, X86DecodedInsn *decode)
3617{
3618    int16_t adjust = decode->e.op1 == X86_TYPE_I ? decode->immediate : 0;
3619
3620    if (!PE(s) || VM86(s)) {
3621        gen_lea_ss_ofs(s, s->A0, cpu_regs[R_ESP], 0);
3622        /* pop offset */
3623        gen_op_ld_v(s, s->dflag, s->T0, s->A0);
3624        /* NOTE: keeping EIP updated is not a problem in case of
3625           exception */
3626        gen_op_jmp_v(s, s->T0);
3627        /* pop selector */
3628        gen_add_A0_im(s, 1 << s->dflag);
3629        gen_op_ld_v(s, s->dflag, s->T0, s->A0);
3630        gen_op_movl_seg_real(s, R_CS, s->T0);
3631        /* add stack offset */
3632        gen_stack_update(s, adjust + (2 << s->dflag));
3633    } else {
3634        gen_update_cc_op(s);
3635        gen_update_eip_cur(s);
3636        gen_helper_lret_protected(tcg_env, tcg_constant_i32(s->dflag - 1),
3637                                  tcg_constant_i32(adjust));
3638    }
3639    s->base.is_jmp = DISAS_EOB_ONLY;
3640}
3641
3642/*
3643 * Return non-NULL if a 32-bit rotate works, after possibly replicating the input.
3644 * The input has already been zero-extended upon operand decode.
3645 */
3646static TCGv_i32 gen_rot_replicate(MemOp ot, TCGv in)
3647{
3648    TCGv_i32 temp;
3649    switch (ot) {
3650    case MO_8:
3651        temp = tcg_temp_new_i32();
3652        tcg_gen_trunc_tl_i32(temp, in);
3653        tcg_gen_muli_i32(temp, temp, 0x01010101);
3654        return temp;
3655
3656    case MO_16:
3657        temp = tcg_temp_new_i32();
3658        tcg_gen_trunc_tl_i32(temp, in);
3659        tcg_gen_deposit_i32(temp, temp, temp, 16, 16);
3660        return temp;
3661
3662#ifdef TARGET_X86_64
3663    case MO_32:
3664        temp = tcg_temp_new_i32();
3665        tcg_gen_trunc_tl_i32(temp, in);
3666        return temp;
3667#endif
3668
3669    default:
3670        return NULL;
3671    }
3672}
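
/*
 * Sketch of the MO_8 trick above (hypothetical helper, illustration
 * only): once the byte is replicated into all four lanes the pattern has
 * period 8, so a 32-bit rotate by any count leaves the correct 8-bit
 * rotate in the low byte.
 */
static inline uint8_t gen_rot_replicate_example(uint8_t x, unsigned count)
{
    uint32_t r = x * 0x01010101u;                  /* replicate the byte */

    count &= 31;                                   /* x86 count masking */
    r = (r << count) | (r >> ((32 - count) & 31)); /* 32-bit rotate */
    return (uint8_t)r;                             /* == rol8(x, count % 8) */
}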
3673
3674static void gen_rot_carry(X86DecodedInsn *decode, TCGv result,
3675                          bool can_be_zero, TCGv count, int bit)
3676{
3677    if (!can_be_zero) {
3678        tcg_gen_extract_tl(decode->cc_dst, result, bit, 1);
3679    } else {
3680        TCGv temp = tcg_temp_new();
3681        tcg_gen_extract_tl(temp, result, bit, 1);
3682        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_dst, count, tcg_constant_tl(0),
3683                           decode->cc_dst, temp);
3684    }
3685}
3686
3687static void gen_ROL(DisasContext *s, X86DecodedInsn *decode)
3688{
3689    bool can_be_zero;
3690    TCGv count;
3691    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3692    TCGv_i32 temp32, count32;
3693    TCGv old = tcg_temp_new();
3694
3695    if (!count) {
3696        return;
3697    }
3698
3699    gen_eflags_adcox(s, decode, false, can_be_zero);
3700    tcg_gen_mov_tl(old, s->T0);
3701    temp32 = gen_rot_replicate(ot, s->T0);
3702    if (temp32) {
3703        count32 = tcg_temp_new_i32();
3704        tcg_gen_trunc_tl_i32(count32, count);
3705        tcg_gen_rotl_i32(temp32, temp32, count32);
3706        /* Zero extend to facilitate later optimization.  */
3707        tcg_gen_extu_i32_tl(s->T0, temp32);
3708    } else {
3709        tcg_gen_rotl_tl(s->T0, s->T0, count);
3710    }
3711    gen_rot_carry(decode, s->T0, can_be_zero, count, 0);
3712    gen_rot_overflow(decode, s->T0, old, can_be_zero, count);
3713}
3714
3715static void gen_ROR(DisasContext *s, X86DecodedInsn *decode)
3716{
3717    bool can_be_zero;
3718    TCGv count;
3719    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3720    TCGv_i32 temp32, count32;
3721    TCGv old = tcg_temp_new();
3722
3723    if (!count) {
3724        return;
3725    }
3726
3727    gen_eflags_adcox(s, decode, false, can_be_zero);
3728    tcg_gen_mov_tl(old, s->T0);
3729    temp32 = gen_rot_replicate(ot, s->T0);
3730    if (temp32) {
3731        count32 = tcg_temp_new_i32();
3732        tcg_gen_trunc_tl_i32(count32, count);
3733        tcg_gen_rotr_i32(temp32, temp32, count32);
3734        /* Zero extend to facilitate later optimization.  */
3735        tcg_gen_extu_i32_tl(s->T0, temp32);
3736        gen_rot_carry(decode, s->T0, can_be_zero, count, 31);
3737    } else {
3738        tcg_gen_rotr_tl(s->T0, s->T0, count);
3739        gen_rot_carry(decode, s->T0, can_be_zero, count, TARGET_LONG_BITS - 1);
3740    }
3741    gen_rot_overflow(decode, s->T0, old, can_be_zero, count);
3742}
3743
3744static void gen_RORX(DisasContext *s, X86DecodedInsn *decode)
3745{
3746    MemOp ot = decode->op[0].ot;
3747    int mask = ot == MO_64 ? 63 : 31;
3748    int b = decode->immediate & mask;
3749
3750    switch (ot) {
3751    case MO_32:
3752#ifdef TARGET_X86_64
3753        {
3754            TCGv_i32 tmp = tcg_temp_new_i32();
3755
3756            tcg_gen_trunc_tl_i32(tmp, s->T0);
3757            tcg_gen_rotri_i32(tmp, tmp, b);
3758            tcg_gen_extu_i32_tl(s->T0, tmp);
3759            break;
3760        }
3761
3762    case MO_64:
3763#endif
3764        tcg_gen_rotri_tl(s->T0, s->T0, b);
3765        break;
3766
3767    default:
3768        g_assert_not_reached();
3769    }
3770}
3771
3772#ifndef CONFIG_USER_ONLY
3773static void gen_RSM(DisasContext *s, X86DecodedInsn *decode)
3774{
3775    gen_helper_rsm(tcg_env);
3776    assume_cc_op(s, CC_OP_EFLAGS);
3777    s->base.is_jmp = DISAS_EOB_ONLY;
3778}
3779#else
3780#define gen_RSM gen_UD
3781#endif
3782
3783static void gen_SAHF(DisasContext *s, X86DecodedInsn *decode)
3784{
3785    if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
3786        return gen_illegal_opcode(s);
3787    }
3788    tcg_gen_shri_tl(s->T0, cpu_regs[R_EAX], 8);
3789    gen_compute_eflags(s);
3790    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O);
3791    tcg_gen_andi_tl(s->T0, s->T0, CC_S | CC_Z | CC_A | CC_P | CC_C);
3792    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, s->T0);
3793}
3794
3795static void gen_SALC(DisasContext *s, X86DecodedInsn *decode)
3796{
3797    gen_compute_eflags_c(s, s->T0);
3798    tcg_gen_neg_tl(s->T0, s->T0);
3799}
3800
3801static void gen_shift_dynamic_flags(DisasContext *s, X86DecodedInsn *decode, TCGv count, CCOp cc_op)
3802{
3803    TCGv_i32 count32 = tcg_temp_new_i32();
3804    TCGv_i32 old_cc_op;
3805
3806    decode->cc_op = CC_OP_DYNAMIC;
3807    decode->cc_op_dynamic = tcg_temp_new_i32();
3808
3809    assert(decode->cc_dst == s->T0);
3810    if (cc_op_live(s->cc_op) & USES_CC_DST) {
3811        decode->cc_dst = tcg_temp_new();
3812        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_dst, count, tcg_constant_tl(0),
3813                           cpu_cc_dst, s->T0);
3814    }
3815
3816    if (cc_op_live(s->cc_op) & USES_CC_SRC) {
3817        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_src, count, tcg_constant_tl(0),
3818                           cpu_cc_src, decode->cc_src);
3819    }
3820
3821    tcg_gen_trunc_tl_i32(count32, count);
3822    if (s->cc_op == CC_OP_DYNAMIC) {
3823        old_cc_op = cpu_cc_op;
3824    } else {
3825        old_cc_op = tcg_constant_i32(s->cc_op);
3826    }
3827    tcg_gen_movcond_i32(TCG_COND_EQ, decode->cc_op_dynamic, count32, tcg_constant_i32(0),
3828                        old_cc_op, tcg_constant_i32(cc_op));
3829}
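
/*
 * For example, for "shl %cl" with CL == 0 at run time, the movconds above
 * select the pre-instruction cpu_cc_dst/cpu_cc_src and the old cc_op, so
 * the flags are left untouched without generating a branch.
 */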
3830
3831static void gen_SAR(DisasContext *s, X86DecodedInsn *decode)
3832{
3833    bool can_be_zero;
3834    TCGv count;
3835    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3836
3837    if (!count) {
3838        return;
3839    }
3840
3841    decode->cc_dst = s->T0;
3842    decode->cc_src = tcg_temp_new();
3843    tcg_gen_subi_tl(decode->cc_src, count, 1);
3844    tcg_gen_sar_tl(decode->cc_src, s->T0, decode->cc_src);
3845    tcg_gen_sar_tl(s->T0, s->T0, count);
3846    if (can_be_zero) {
3847        gen_shift_dynamic_flags(s, decode, count, CC_OP_SARB + ot);
3848    } else {
3849        decode->cc_op = CC_OP_SARB + ot;
3850    }
3851}
3852
3853static void gen_SARX(DisasContext *s, X86DecodedInsn *decode)
3854{
3855    MemOp ot = decode->op[0].ot;
3856    int mask;
3857
3858    mask = ot == MO_64 ? 63 : 31;
3859    tcg_gen_andi_tl(s->T1, s->T1, mask);
3860    tcg_gen_sar_tl(s->T0, s->T0, s->T1);
3861}
3862
3863static void gen_SUB(DisasContext *s, X86DecodedInsn *decode);
3864static void gen_SBB(DisasContext *s, X86DecodedInsn *decode)
3865{
3866    MemOp ot = decode->op[0].ot;
3867    TCGv c_in;
3868
3869    /*
3870     * Try to avoid CC_OP_SBB by transforming as follows:
3871     * CC_SBB: src1 = dst + c_in, src2 = 0, src3 = c_in
3872     * CC_SUB: src1 = dst + c_in, src2 = c_in (no src3)
3873     *
3874     * In general src2 vs. src3 matters when computing AF and OF, but not here:
3875     * - AF is bit 4 of dst^src1^src2, which is bit 4 of dst^src1 in both cases
3876     * - OF is a function of the operands' MSBs, and in both cases the MSB of src2 is zero
3877     */
3878    if (decode->e.op2 == X86_TYPE_I && decode->immediate == 0) {
3879        gen_compute_eflags_c(s, s->T1);
3880        gen_SUB(s, decode);
3881        return;
3882    }
3883
3884    c_in = tcg_temp_new();
3885    gen_compute_eflags_c(s, c_in);
3886
3887    /*
3888     * Here the change is as follows:
3889     * CC_SBB: src1 = T0, src2 = T0, src3 = c_in
3890     * CC_SUB: src1 = 0, src2 = c_in (no src3)
3891     *
3892     * The difference also does not matter:
3893     * - AF is bit 4 of dst^src1^src2, but bit 4 of src1^src2 is zero in both cases;
3894     *   therefore AF comes straight from dst (in fact it is c_in)
3895     * - for OF, src1 and src2 have the same sign in both cases, meaning there
3896     *   can be no overflow
3897     */
3898    if (decode->e.op2 != X86_TYPE_I && !decode->op[0].has_ea && decode->op[0].n == decode->op[2].n) {
3899        if (s->cc_op == CC_OP_DYNAMIC) {
3900            tcg_gen_neg_tl(s->T0, c_in);
3901        } else {
3902            /*
3903             * Do not negate c_in because it will often be dead and only the
3904             * instruction generated by negsetcond will survive.
3905             */
3906            gen_neg_setcc(s, JCC_B << 1, s->T0);
3907        }
3908        tcg_gen_movi_tl(s->cc_srcT, 0);
3909        decode->cc_src = c_in;
3910        decode->cc_dst = s->T0;
3911        decode->cc_op = CC_OP_SUBB + ot;
3912        return;
3913    }
3914
3915    if (s->prefix & PREFIX_LOCK) {
3916        tcg_gen_add_tl(s->T0, s->T1, c_in);
3917        tcg_gen_neg_tl(s->T0, s->T0);
3918        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T0,
3919                                    s->mem_index, ot | MO_LE);
3920    } else {
3921        tcg_gen_sub_tl(s->T0, s->T0, s->T1);
3922        tcg_gen_sub_tl(s->T0, s->T0, c_in);
3923    }
3924    prepare_update3_cc(decode, s, CC_OP_SBBB + ot, c_in);
3925}
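
/*
 * Reference model of SBB itself at the 8-bit size (hypothetical helper,
 * illustration only), against which the transformations above can be
 * checked: the carry out is simply the borrow of the widened subtraction.
 */
static inline uint8_t gen_SBB_example(uint8_t dst, uint8_t src, bool cf_in,
                                      bool *cf_out)
{
    unsigned borrow = (unsigned)src + cf_in;

    *cf_out = borrow > dst;
    return dst - borrow;
}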
3926
3927static void gen_SCAS(DisasContext *s, X86DecodedInsn *decode)
3928{
3929    MemOp ot = decode->op[2].ot;
3930    gen_repz_nz(s, ot, gen_scas);
3931}
3932
3933static void gen_SETcc(DisasContext *s, X86DecodedInsn *decode)
3934{
3935    gen_setcc(s, decode->b & 0xf, s->T0);
3936}
3937
3938static void gen_SFENCE(DisasContext *s, X86DecodedInsn *decode)
3939{
3940    tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
3941}
3942
3943static void gen_SHA1NEXTE(DisasContext *s, X86DecodedInsn *decode)
3944{
3945    gen_helper_sha1nexte(OP_PTR0, OP_PTR1, OP_PTR2);
3946}
3947
3948static void gen_SHA1MSG1(DisasContext *s, X86DecodedInsn *decode)
3949{
3950    gen_helper_sha1msg1(OP_PTR0, OP_PTR1, OP_PTR2);
3951}
3952
3953static void gen_SHA1MSG2(DisasContext *s, X86DecodedInsn *decode)
3954{
3955    gen_helper_sha1msg2(OP_PTR0, OP_PTR1, OP_PTR2);
3956}
3957
3958static void gen_SHA1RNDS4(DisasContext *s, X86DecodedInsn *decode)
3959{
3960    switch (decode->immediate & 3) {
3961    case 0:
3962        gen_helper_sha1rnds4_f0(OP_PTR0, OP_PTR0, OP_PTR1);
3963        break;
3964    case 1:
3965        gen_helper_sha1rnds4_f1(OP_PTR0, OP_PTR0, OP_PTR1);
3966        break;
3967    case 2:
3968        gen_helper_sha1rnds4_f2(OP_PTR0, OP_PTR0, OP_PTR1);
3969        break;
3970    case 3:
3971        gen_helper_sha1rnds4_f3(OP_PTR0, OP_PTR0, OP_PTR1);
3972        break;
3973    }
3974}
3975
3976static void gen_SHA256MSG1(DisasContext *s, X86DecodedInsn *decode)
3977{
3978    gen_helper_sha256msg1(OP_PTR0, OP_PTR1, OP_PTR2);
3979}
3980
3981static void gen_SHA256MSG2(DisasContext *s, X86DecodedInsn *decode)
3982{
3983    gen_helper_sha256msg2(OP_PTR0, OP_PTR1, OP_PTR2);
3984}
3985
3986static void gen_SHA256RNDS2(DisasContext *s, X86DecodedInsn *decode)
3987{
3988    TCGv_i32 wk0 = tcg_temp_new_i32();
3989    TCGv_i32 wk1 = tcg_temp_new_i32();
3990
3991    tcg_gen_ld_i32(wk0, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(0)));
3992    tcg_gen_ld_i32(wk1, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(1)));
3993
3994    gen_helper_sha256rnds2(OP_PTR0, OP_PTR1, OP_PTR2, wk0, wk1);
3995}
3996
3997static void gen_SHL(DisasContext *s, X86DecodedInsn *decode)
3998{
3999    bool can_be_zero;
4000    TCGv count;
4001    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
4002
4003    if (!count) {
4004        return;
4005    }
4006
4007    decode->cc_dst = s->T0;
4008    decode->cc_src = tcg_temp_new();
4009    tcg_gen_subi_tl(decode->cc_src, count, 1);
4010    tcg_gen_shl_tl(decode->cc_src, s->T0, decode->cc_src);
4011    tcg_gen_shl_tl(s->T0, s->T0, count);
4012    if (can_be_zero) {
4013        gen_shift_dynamic_flags(s, decode, count, CC_OP_SHLB + ot);
4014    } else {
4015        decode->cc_op = CC_OP_SHLB + ot;
4016    }
4017}
4018
4019static void gen_SHLD(DisasContext *s, X86DecodedInsn *decode)
4020{
4021    bool can_be_zero;
4022    TCGv count;
4023    int unit = decode->e.op3 == X86_TYPE_I ? X86_OP_IMM : X86_OP_INT;
4024    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, unit);
4025
4026    if (!count) {
4027        return;
4028    }
4029
4030    decode->cc_dst = s->T0;
4031    decode->cc_src = gen_shiftd_rm_T1(s, ot, false, count);
4032    if (can_be_zero) {
4033        gen_shift_dynamic_flags(s, decode, count, CC_OP_SHLB + ot);
4034    } else {
4035        decode->cc_op = CC_OP_SHLB + ot;
4036    }
4037}
4038
4039static void gen_SHLX(DisasContext *s, X86DecodedInsn *decode)
4040{
4041    MemOp ot = decode->op[0].ot;
4042    int mask;
4043
4044    mask = ot == MO_64 ? 63 : 31;
4045    tcg_gen_andi_tl(s->T1, s->T1, mask);
4046    tcg_gen_shl_tl(s->T0, s->T0, s->T1);
4047}
4048
4049static void gen_SHR(DisasContext *s, X86DecodedInsn *decode)
4050{
4051    bool can_be_zero;
4052    TCGv count;
4053    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
4054
4055    if (!count) {
4056        return;
4057    }
4058
4059    decode->cc_dst = s->T0;
4060    decode->cc_src = tcg_temp_new();
4061    tcg_gen_subi_tl(decode->cc_src, count, 1);
4062    tcg_gen_shr_tl(decode->cc_src, s->T0, decode->cc_src);
4063    tcg_gen_shr_tl(s->T0, s->T0, count);
4064    if (can_be_zero) {
4065        gen_shift_dynamic_flags(s, decode, count, CC_OP_SARB + ot);
4066    } else {
4067        decode->cc_op = CC_OP_SARB + ot;
4068    }
4069}
4070
4071static void gen_SHRD(DisasContext *s, X86DecodedInsn *decode)
4072{
4073    bool can_be_zero;
4074    TCGv count;
4075    int unit = decode->e.op3 == X86_TYPE_I ? X86_OP_IMM : X86_OP_INT;
4076    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, unit);
4077
4078    if (!count) {
4079        return;
4080    }
4081
4082    decode->cc_dst = s->T0;
4083    decode->cc_src = gen_shiftd_rm_T1(s, ot, true, count);
4084    if (can_be_zero) {
4085        gen_shift_dynamic_flags(s, decode, count, CC_OP_SARB + ot);
4086    } else {
4087        decode->cc_op = CC_OP_SARB + ot;
4088    }
4089}
4090
4091static void gen_SHRX(DisasContext *s, X86DecodedInsn *decode)
4092{
4093    MemOp ot = decode->op[0].ot;
4094    int mask;
4095
4096    mask = ot == MO_64 ? 63 : 31;
4097    tcg_gen_andi_tl(s->T1, s->T1, mask);
4098    tcg_gen_shr_tl(s->T0, s->T0, s->T1);
4099}
4100
4101static void gen_STC(DisasContext *s, X86DecodedInsn *decode)
4102{
4103    gen_compute_eflags(s);
4104    tcg_gen_ori_tl(cpu_cc_src, cpu_cc_src, CC_C);
4105}
4106
4107static void gen_STD(DisasContext *s, X86DecodedInsn *decode)
4108{
4109    tcg_gen_st_i32(tcg_constant_i32(-1), tcg_env, offsetof(CPUX86State, df));
4110}
4111
4112static void gen_STI(DisasContext *s, X86DecodedInsn *decode)
4113{
4114    gen_set_eflags(s, IF_MASK);
4115    s->base.is_jmp = DISAS_EOB_INHIBIT_IRQ;
4116}
4117
4118static void gen_VAESKEYGEN(DisasContext *s, X86DecodedInsn *decode)
4119{
4120    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
4121    assert(!s->vex_l);
4122    gen_helper_aeskeygenassist_xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
4123}
4124
4125static void gen_STMXCSR(DisasContext *s, X86DecodedInsn *decode)
4126{
4127    gen_helper_update_mxcsr(tcg_env);
4128    tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
4129}
4130
4131static void gen_STOS(DisasContext *s, X86DecodedInsn *decode)
4132{
4133    MemOp ot = decode->op[1].ot;
4134    gen_repz(s, ot, gen_stos);
4135}
4136
4137static void gen_SUB(DisasContext *s, X86DecodedInsn *decode)
4138{
4139    MemOp ot = decode->op[1].ot;
4140
4141    if (s->prefix & PREFIX_LOCK) {
4142        tcg_gen_neg_tl(s->T0, s->T1);
4143        tcg_gen_atomic_fetch_add_tl(s->cc_srcT, s->A0, s->T0,
4144                                    s->mem_index, ot | MO_LE);
4145        tcg_gen_sub_tl(s->T0, s->cc_srcT, s->T1);
4146    } else {
4147        tcg_gen_mov_tl(s->cc_srcT, s->T0);
4148        tcg_gen_sub_tl(s->T0, s->T0, s->T1);
4149    }
4150    prepare_update2_cc(decode, s, CC_OP_SUBB + ot);
4151}
4152
4153static void gen_SYSCALL(DisasContext *s, X86DecodedInsn *decode)
4154{
4155    gen_update_cc_op(s);
4156    gen_update_eip_cur(s);
4157    gen_helper_syscall(tcg_env, cur_insn_len_i32(s));
4158    if (LMA(s)) {
4159        assume_cc_op(s, CC_OP_EFLAGS);
4160    }
4161
4162    /*
4163     * TF handling for the syscall insn is different. The TF bit is checked
4164     * after the syscall insn completes. This allows #DB to not be
4165     * generated after one has entered CPL0 if TF is set in FMASK.
4166     */
4167    s->base.is_jmp = DISAS_EOB_RECHECK_TF;
4168}
4169
4170static void gen_SYSENTER(DisasContext *s, X86DecodedInsn *decode)
4171{
4172    gen_helper_sysenter(tcg_env);
4173    s->base.is_jmp = DISAS_EOB_ONLY;
4174}
4175
4176static void gen_SYSEXIT(DisasContext *s, X86DecodedInsn *decode)
4177{
4178    gen_helper_sysexit(tcg_env, tcg_constant_i32(s->dflag - 1));
4179    s->base.is_jmp = DISAS_EOB_ONLY;
4180}
4181
4182static void gen_SYSRET(DisasContext *s, X86DecodedInsn *decode)
4183{
4184    gen_helper_sysret(tcg_env, tcg_constant_i32(s->dflag - 1));
4185    if (LMA(s)) {
4186        assume_cc_op(s, CC_OP_EFLAGS);
4187    }
4188
4189    /*
4190     * TF handling for the sysret insn is different. The TF bit is checked
4191     * after the sysret insn completes. This allows #DB to be
4192     * generated "as if" the syscall insn in userspace had just
4193     * completed.
4194     */
4195    s->base.is_jmp = DISAS_EOB_RECHECK_TF;
4196}
4197
4198static void gen_TZCNT(DisasContext *s, X86DecodedInsn *decode)
4199{
4200    MemOp ot = decode->op[0].ot;
4201
4202    /* The C bit (cc_src) is defined in terms of the input.  */
4203    decode->cc_src = tcg_temp_new();
4204    decode->cc_dst = s->T0;
4205    decode->cc_op = CC_OP_BMILGB + ot;
4206    tcg_gen_mov_tl(decode->cc_src, s->T0);
4207
4208    /* A zero input returns the operand size.  */
4209    tcg_gen_ctzi_tl(s->T0, s->T0, 8 << ot);
4210}
4211
4212static void gen_UD(DisasContext *s, X86DecodedInsn *decode)
4213{
4214    gen_illegal_opcode(s);
4215}
4216
4217static void gen_VAESIMC(DisasContext *s, X86DecodedInsn *decode)
4218{
4219    assert(!s->vex_l);
4220    gen_helper_aesimc_xmm(tcg_env, OP_PTR0, OP_PTR2);
4221}
4222
4223/*
4224 * 00 = v*ps Vps, Hps, Wps
4225 * 66 = v*pd Vpd, Hpd, Wpd
4226 * f3 = v*ss Vss, Hss, Wss
4227 * f2 = v*sd Vsd, Hsd, Wsd
4228 */
4229#define SSE_CMP(x) { \
4230    gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \
4231    gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, \
4232    gen_helper_ ## x ## ps ## _ymm, gen_helper_ ## x ## pd ## _ymm}
4233static const SSEFunc_0_eppp gen_helper_cmp_funcs[32][6] = {
4234    SSE_CMP(cmpeq),
4235    SSE_CMP(cmplt),
4236    SSE_CMP(cmple),
4237    SSE_CMP(cmpunord),
4238    SSE_CMP(cmpneq),
4239    SSE_CMP(cmpnlt),
4240    SSE_CMP(cmpnle),
4241    SSE_CMP(cmpord),
4242
4243    SSE_CMP(cmpequ),
4244    SSE_CMP(cmpnge),
4245    SSE_CMP(cmpngt),
4246    SSE_CMP(cmpfalse),
4247    SSE_CMP(cmpnequ),
4248    SSE_CMP(cmpge),
4249    SSE_CMP(cmpgt),
4250    SSE_CMP(cmptrue),
4251
4252    SSE_CMP(cmpeqs),
4253    SSE_CMP(cmpltq),
4254    SSE_CMP(cmpleq),
4255    SSE_CMP(cmpunords),
4256    SSE_CMP(cmpneqq),
4257    SSE_CMP(cmpnltq),
4258    SSE_CMP(cmpnleq),
4259    SSE_CMP(cmpords),
4260
4261    SSE_CMP(cmpequs),
4262    SSE_CMP(cmpngeq),
4263    SSE_CMP(cmpngtq),
4264    SSE_CMP(cmpfalses),
4265    SSE_CMP(cmpnequs),
4266    SSE_CMP(cmpgeq),
4267    SSE_CMP(cmpgtq),
4268    SSE_CMP(cmptrues),
4269};
4270#undef SSE_CMP
4271
4272static void gen_VCMP(DisasContext *s, X86DecodedInsn *decode)
4273{
4274    int index = decode->immediate & (s->prefix & PREFIX_VEX ? 31 : 7);
4275    int b =
4276        s->prefix & PREFIX_REPZ  ? 2 /* ss */ :
4277        s->prefix & PREFIX_REPNZ ? 3 /* sd */ :
4278        !!(s->prefix & PREFIX_DATA) /* pd */ + (s->vex_l << 2);
4279
4280    gen_helper_cmp_funcs[index][b](tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
4281}
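
/*
 * For example, VCMPLEPD with VEX.L = 1 arrives here with imm8 == 2, the
 * 66 prefix and no REPZ/REPNZ, so index = 2 and b = 1 + (1 << 2) = 5,
 * selecting gen_helper_cmplepd_ymm from the table above.
 */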
4282
4283static void gen_VCOMI(DisasContext *s, X86DecodedInsn *decode)
4284{
4285    SSEFunc_0_epp fn;
4286    fn = s->prefix & PREFIX_DATA ? gen_helper_comisd : gen_helper_comiss;
4287    fn(tcg_env, OP_PTR1, OP_PTR2);
4288    assume_cc_op(s, CC_OP_EFLAGS);
4289}
4290
4291static void gen_VCVTPD2PS(DisasContext *s, X86DecodedInsn *decode)
4292{
4293    if (s->vex_l) {
4294        gen_helper_cvtpd2ps_ymm(tcg_env, OP_PTR0, OP_PTR2);
4295    } else {
4296        gen_helper_cvtpd2ps_xmm(tcg_env, OP_PTR0, OP_PTR2);
4297    }
4298}
4299
4300static void gen_VCVTPS2PD(DisasContext *s, X86DecodedInsn *decode)
4301{
4302    if (s->vex_l) {
4303        gen_helper_cvtps2pd_ymm(tcg_env, OP_PTR0, OP_PTR2);
4304    } else {
4305        gen_helper_cvtps2pd_xmm(tcg_env, OP_PTR0, OP_PTR2);
4306    }
4307}
4308
4309static void gen_VCVTPS2PH(DisasContext *s, X86DecodedInsn *decode)
4310{
4311    gen_unary_imm_fp_sse(s, decode,
4312                      gen_helper_cvtps2ph_xmm,
4313                      gen_helper_cvtps2ph_ymm);
4314    /*
4315     * VCVTPS2PH is the only instruction that performs an operation on a
4316     * register source and then *stores* into memory.
4317     */
4318    if (decode->op[0].has_ea) {
4319        gen_store_sse(s, decode, decode->op[0].offset);
4320    }
4321}
4322
4323static void gen_VCVTSD2SS(DisasContext *s, X86DecodedInsn *decode)
4324{
4325    gen_helper_cvtsd2ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
4326}
4327
4328static void gen_VCVTSS2SD(DisasContext *s, X86DecodedInsn *decode)
4329{
4330    gen_helper_cvtss2sd(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
4331}
4332
4333static void gen_VCVTSI2Sx(DisasContext *s, X86DecodedInsn *decode)
4334{
4335    int vec_len = vector_len(s, decode);
4336    TCGv_i32 in;
4337
4338    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4339
4340#ifdef TARGET_X86_64
4341    MemOp ot = decode->op[2].ot;
4342    if (ot == MO_64) {
4343        if (s->prefix & PREFIX_REPNZ) {
4344            gen_helper_cvtsq2sd(tcg_env, OP_PTR0, s->T1);
4345        } else {
4346            gen_helper_cvtsq2ss(tcg_env, OP_PTR0, s->T1);
4347        }
4348        return;
4349    }
4350    in = tcg_temp_new_i32();
4351    tcg_gen_trunc_tl_i32(in, s->T1);
4352#else
4353    in = s->T1;
4354#endif
4355
4356    if (s->prefix & PREFIX_REPNZ) {
4357        gen_helper_cvtsi2sd(tcg_env, OP_PTR0, in);
4358    } else {
4359        gen_helper_cvtsi2ss(tcg_env, OP_PTR0, in);
4360    }
4361}
4362
4363static inline void gen_VCVTtSx2SI(DisasContext *s, X86DecodedInsn *decode,
4364                                  SSEFunc_i_ep ss2si, SSEFunc_l_ep ss2sq,
4365                                  SSEFunc_i_ep sd2si, SSEFunc_l_ep sd2sq)
4366{
4367    TCGv_i32 out;
4368
4369#ifdef TARGET_X86_64
4370    MemOp ot = decode->op[0].ot;
4371    if (ot == MO_64) {
4372        if (s->prefix & PREFIX_REPNZ) {
4373            sd2sq(s->T0, tcg_env, OP_PTR2);
4374        } else {
4375            ss2sq(s->T0, tcg_env, OP_PTR2);
4376        }
4377        return;
4378    }
4379
4380    out = tcg_temp_new_i32();
4381#else
4382    out = s->T0;
4383#endif
4384    if (s->prefix & PREFIX_REPNZ) {
4385        sd2si(out, tcg_env, OP_PTR2);
4386    } else {
4387        ss2si(out, tcg_env, OP_PTR2);
4388    }
4389#ifdef TARGET_X86_64
4390    tcg_gen_extu_i32_tl(s->T0, out);
4391#endif
4392}
4393
4394#ifndef TARGET_X86_64
4395#define gen_helper_cvtss2sq NULL
4396#define gen_helper_cvtsd2sq NULL
4397#define gen_helper_cvttss2sq NULL
4398#define gen_helper_cvttsd2sq NULL
4399#endif
4400
4401static void gen_VCVTSx2SI(DisasContext *s, X86DecodedInsn *decode)
4402{
4403    gen_VCVTtSx2SI(s, decode,
4404                   gen_helper_cvtss2si, gen_helper_cvtss2sq,
4405                   gen_helper_cvtsd2si, gen_helper_cvtsd2sq);
4406}
4407
4408static void gen_VCVTTSx2SI(DisasContext *s, X86DecodedInsn *decode)
4409{
4410    gen_VCVTtSx2SI(s, decode,
4411                   gen_helper_cvttss2si, gen_helper_cvttss2sq,
4412                   gen_helper_cvttsd2si, gen_helper_cvttsd2sq);
4413}
4414
4415static void gen_VEXTRACTx128(DisasContext *s, X86DecodedInsn *decode)
4416{
4417    int mask = decode->immediate & 1;
4418    int src_ofs = vector_elem_offset(&decode->op[1], MO_128, mask);
4419    if (decode->op[0].has_ea) {
4420        /* VEX-only instruction, no alignment requirements.  */
4421        gen_sto_env_A0(s, src_ofs, false);
4422    } else {
4423        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, 16, 16);
4424    }
4425}
4426
4427static void gen_VEXTRACTPS(DisasContext *s, X86DecodedInsn *decode)
4428{
4429    gen_pextr(s, decode, MO_32);
4430}
4431
4432static void gen_vinsertps(DisasContext *s, X86DecodedInsn *decode, TCGv_i32 tmp)
4433{
4434    int val = decode->immediate;
4435    int dest_word = (val >> 4) & 3;
4436    int new_mask = (val & 15) | (1 << dest_word);
4437    int vec_len = 16;
4438
4439    assert(!s->vex_l);
4440
4441    if (new_mask == 15) {
4442        /* All zeroes except possibly for the inserted element */
4443        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
4444    } else if (decode->op[1].offset != decode->op[0].offset) {
4445        gen_store_sse(s, decode, decode->op[1].offset);
4446    }
4447
4448    if (new_mask != (val & 15)) {
4449        tcg_gen_st_i32(tmp, tcg_env,
4450                       vector_elem_offset(&decode->op[0], MO_32, dest_word));
4451    }
4452
4453    if (new_mask != 15) {
4454        TCGv_i32 zero = tcg_constant_i32(0); /* float32_zero */
4455        int i;
4456        for (i = 0; i < 4; i++) {
4457            if ((val >> i) & 1) {
4458                tcg_gen_st_i32(zero, tcg_env,
4459                               vector_elem_offset(&decode->op[0], MO_32, i));
4460            }
4461        }
4462    }
4463}
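
/*
 * imm8 worked example: val = 0x9c (binary 10 01 1100) selects source
 * element 2 (register form), destination element 1, and a zero mask of
 * elements 2 and 3.  new_mask is then 0b1110: not 15, so op1 is copied
 * first (if not already in place); not equal to val & 15, so tmp is
 * stored into element 1; finally elements 2 and 3 are cleared by the loop.
 */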
4464
4465static void gen_VINSERTPS_r(DisasContext *s, X86DecodedInsn *decode)
4466{
4467    int val = decode->immediate;
4468    TCGv_i32 tmp = tcg_temp_new_i32();
4469
4470    tcg_gen_ld_i32(tmp, tcg_env,
4471                   vector_elem_offset(&decode->op[2], MO_32, (val >> 6) & 3));
4472    gen_vinsertps(s, decode, tmp);
4473}
4474
4475static void gen_VINSERTPS_m(DisasContext *s, X86DecodedInsn *decode)
4476{
4477    TCGv_i32 tmp = tcg_temp_new_i32();
4478
4479    tcg_gen_qemu_ld_i32(tmp, s->A0, s->mem_index, MO_LEUL);
4480    gen_vinsertps(s, decode, tmp);
4481}
4482
4483static void gen_VINSERTx128(DisasContext *s, X86DecodedInsn *decode)
4484{
4485    int mask = decode->immediate & 1;
4486    tcg_gen_gvec_mov(MO_64,
4487                     decode->op[0].offset + offsetof(YMMReg, YMM_X(mask)),
4488                     decode->op[2].offset + offsetof(YMMReg, YMM_X(0)), 16, 16);
4489    tcg_gen_gvec_mov(MO_64,
4490                     decode->op[0].offset + offsetof(YMMReg, YMM_X(!mask)),
4491                     decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16);
4492}
4493
4494static inline void gen_maskmov(DisasContext *s, X86DecodedInsn *decode,
4495                               SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm)
4496{
4497    if (!s->vex_l) {
4498        xmm(tcg_env, OP_PTR2, OP_PTR1, s->A0);
4499    } else {
4500        ymm(tcg_env, OP_PTR2, OP_PTR1, s->A0);
4501    }
4502}
4503
4504static void gen_VMASKMOVPD_st(DisasContext *s, X86DecodedInsn *decode)
4505{
4506    gen_maskmov(s, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm);
4507}
4508
4509static void gen_VMASKMOVPS_st(DisasContext *s, X86DecodedInsn *decode)
4510{
4511    gen_maskmov(s, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
4512}
4513
4514static void gen_VMOVHPx_ld(DisasContext *s, X86DecodedInsn *decode)
4515{
4516    gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4517    if (decode->op[0].offset != decode->op[1].offset) {
4518        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
4519        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4520    }
4521}
4522
4523static void gen_VMOVHPx_st(DisasContext *s, X86DecodedInsn *decode)
4524{
4525    gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
4526}
4527
4528static void gen_VMOVHPx(DisasContext *s, X86DecodedInsn *decode)
4529{
4530    if (decode->op[0].offset != decode->op[2].offset) {
4531        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
4532        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4533    }
4534    if (decode->op[0].offset != decode->op[1].offset) {
4535        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
4536        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4537    }
4538}
4539
4540static void gen_VMOVHLPS(DisasContext *s, X86DecodedInsn *decode)
4541{
4542    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
4543    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4544    if (decode->op[0].offset != decode->op[1].offset) {
4545        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(1)));
4546        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4547    }
4548}
4549
4550static void gen_VMOVLHPS(DisasContext *s, X86DecodedInsn *decode)
4551{
4552    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);
4553    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4554    if (decode->op[0].offset != decode->op[1].offset) {
4555        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
4556        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4557    }
4558}
4559
4560/*
4561 * Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOVHPx.
4562 * Use a gvec move to move everything above the bottom 64 bits.
4563 */
4564
4565static void gen_VMOVLPx(DisasContext *s, X86DecodedInsn *decode)
4566{
4567    int vec_len = vector_len(s, decode);
4568
4569    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0)));
4570    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4571    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4572}
4573
4574static void gen_VMOVLPx_ld(DisasContext *s, X86DecodedInsn *decode)
4575{
4576    int vec_len = vector_len(s, decode);
4577
4578    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
4579    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4580    tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
4581}
4582
4583static void gen_VMOVLPx_st(DisasContext *s, X86DecodedInsn *decode)
4584{
4585    tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0)));
4586    tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
4587}
4588
4589static void gen_VMOVSD_ld(DisasContext *s, X86DecodedInsn *decode)
4590{
4591    TCGv_i64 zero = tcg_constant_i64(0);
4592
4593    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
4594    tcg_gen_st_i64(zero, OP_PTR0, offsetof(ZMMReg, ZMM_Q(1)));
4595    tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
4596}
4597
4598static void gen_VMOVSS(DisasContext *s, X86DecodedInsn *decode)
4599{
4600    int vec_len = vector_len(s, decode);
4601    TCGv_i32 tmp = tcg_temp_new_i32();
4602
4603    tcg_gen_ld_i32(tmp, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
4604    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4605    tcg_gen_st_i32(tmp, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
4606}
4607
4608static void gen_VMOVSS_ld(DisasContext *s, X86DecodedInsn *decode)
4609{
4610    int vec_len = vector_len(s, decode);
4611    TCGv_i32 tmp = tcg_temp_new_i32();
4612
4613    tcg_gen_qemu_ld_i32(tmp, s->A0, s->mem_index, MO_LEUL);
4614    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
4615    tcg_gen_st_i32(tmp, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
4616}
4617
4618static void gen_VMOVSS_st(DisasContext *s, X86DecodedInsn *decode)
4619{
4620    TCGv_i32 tmp = tcg_temp_new_i32();
4621
4622    tcg_gen_ld_i32(tmp, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
4623    tcg_gen_qemu_st_i32(tmp, s->A0, s->mem_index, MO_LEUL);
4624}
4625
4626static void gen_VPMASKMOV_st(DisasContext *s, X86DecodedInsn *decode)
4627{
4628    if (s->vex_w) {
4629        gen_VMASKMOVPD_st(s, decode);
4630    } else {
4631        gen_VMASKMOVPS_st(s, decode);
4632    }
4633}
4634
4635static void gen_VPERMD(DisasContext *s, X86DecodedInsn *decode)
4636{
4637    assert(s->vex_l);
4638    gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2);
4639}
4640
4641static void gen_VPERM2x128(DisasContext *s, X86DecodedInsn *decode)
4642{
4643    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
4644    assert(s->vex_l);
4645    gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, imm);
4646}
4647
4648static void gen_VPHMINPOSUW(DisasContext *s, X86DecodedInsn *decode)
4649{
4650    assert(!s->vex_l);
4651    gen_helper_phminposuw_xmm(tcg_env, OP_PTR0, OP_PTR2);
4652}
4653
4654static void gen_VROUNDSD(DisasContext *s, X86DecodedInsn *decode)
4655{
4656    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
4657    assert(!s->vex_l);
4658    gen_helper_roundsd_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
4659}
4660
4661static void gen_VROUNDSS(DisasContext *s, X86DecodedInsn *decode)
4662{
4663    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
4664    assert(!s->vex_l);
4665    gen_helper_roundss_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
4666}
4667
4668static void gen_VSHUF(DisasContext *s, X86DecodedInsn *decode)
4669{
4670    TCGv_i32 imm = tcg_constant_i32(decode->immediate);
4671    SSEFunc_0_pppi ps, pd, fn;
4672    ps = s->vex_l ? gen_helper_shufps_ymm : gen_helper_shufps_xmm;
4673    pd = s->vex_l ? gen_helper_shufpd_ymm : gen_helper_shufpd_xmm;
4674    fn = s->prefix & PREFIX_DATA ? pd : ps;
4675    fn(OP_PTR0, OP_PTR1, OP_PTR2, imm);
4676}
4677
4678static void gen_VUCOMI(DisasContext *s, X86DecodedInsn *decode)
4679{
4680    SSEFunc_0_epp fn;
4681    fn = s->prefix & PREFIX_DATA ? gen_helper_ucomisd : gen_helper_ucomiss;
4682    fn(tcg_env, OP_PTR1, OP_PTR2);
4683    assume_cc_op(s, CC_OP_EFLAGS);
4684}
4685
4686static void gen_VZEROALL(DisasContext *s, X86DecodedInsn *decode)
4687{
4688    TCGv_ptr ptr = tcg_temp_new_ptr();
4689
4690    tcg_gen_addi_ptr(ptr, tcg_env, offsetof(CPUX86State, xmm_regs));
4691    gen_helper_memset(ptr, ptr, tcg_constant_i32(0),
4692                      tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg)));
4693}
4694
4695static void gen_VZEROUPPER(DisasContext *s, X86DecodedInsn *decode)
4696{
4697    int i;
4698
4699    for (i = 0; i < CPU_NB_REGS; i++) {
4700        int offset = offsetof(CPUX86State, xmm_regs[i].ZMM_X(1));
4701        tcg_gen_gvec_dup_imm(MO_64, offset, 16, 16, 0);
4702    }
4703}
4704
4705static void gen_WAIT(DisasContext *s, X86DecodedInsn *decode)
4706{
4707    if ((s->flags & (HF_MP_MASK | HF_TS_MASK)) == (HF_MP_MASK | HF_TS_MASK)) {
4708        gen_NM_exception(s);
4709    } else {
4710        /* needs to be treated as I/O because of ferr_irq */
4711        translator_io_start(&s->base);
4712        gen_helper_fwait(tcg_env);
4713    }
4714}
4715
4716#ifndef CONFIG_USER_ONLY
4717static void gen_WRMSR(DisasContext *s, X86DecodedInsn *decode)
4718{
4719    gen_update_cc_op(s);
4720    gen_update_eip_cur(s);
4721    gen_helper_wrmsr(tcg_env);
4722    s->base.is_jmp = DISAS_EOB_NEXT;
4723}
4724#else
4725#define gen_WRMSR gen_unreachable
4726#endif
4727
4728static void gen_WRxxBASE(DisasContext *s, X86DecodedInsn *decode)
4729{
4730    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
4731
4732    /* Preserve hflags bits by testing CR4 at runtime.  */
4733    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
4734    tcg_gen_mov_tl(base, s->T0);
4735}
4736
4737static void gen_XADD(DisasContext *s, X86DecodedInsn *decode)
4738{
4739    MemOp ot = decode->op[1].ot;
4740
4741    decode->cc_dst = tcg_temp_new();
4742    decode->cc_src = s->T1;
4743    decode->cc_op = CC_OP_ADDB + ot;
4744
4745    if (s->prefix & PREFIX_LOCK) {
4746        tcg_gen_atomic_fetch_add_tl(s->T0, s->A0, s->T1, s->mem_index, ot | MO_LE);
4747        tcg_gen_add_tl(decode->cc_dst, s->T0, s->T1);
4748    } else {
4749        tcg_gen_add_tl(decode->cc_dst, s->T0, s->T1);
4750        /*
4751         * NOTE: writing memory first is important for MMU exceptions,
4752         * but "new result" wins for XADD AX, AX.
4753         */
4754        gen_writeback(s, decode, 0, decode->cc_dst);
4755    }
4756    if (decode->op[0].has_ea || decode->op[2].n != decode->op[0].n) {
4757        gen_writeback(s, decode, 2, s->T0);
4758    }
4759}
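
/*
 * For example, "xadd %eax, %eax": the sum is written back above, and the
 * second writeback is skipped because both operands name the same
 * register, so EAX ends up with twice its old value; the "new result"
 * wins, as on hardware.
 */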
4760
4761static void gen_XCHG(DisasContext *s, X86DecodedInsn *decode)
4762{
4763    if (s->prefix & PREFIX_LOCK) {
4764        tcg_gen_atomic_xchg_tl(s->T0, s->A0, s->T1,
4765                               s->mem_index, decode->op[0].ot | MO_LE);
4766        /* now store old value into register operand */
4767        gen_op_mov_reg_v(s, decode->op[2].ot, decode->op[2].n, s->T0);
4768    } else {
4769        /* move destination value into source operand, source preserved in T1 */
4770        gen_op_mov_reg_v(s, decode->op[2].ot, decode->op[2].n, s->T0);
4771        tcg_gen_mov_tl(s->T0, s->T1);
4772    }
4773}
4774
4775static void gen_XLAT(DisasContext *s, X86DecodedInsn *decode)
4776{
4777    /* AL is already zero-extended into s->T0.  */
4778    tcg_gen_add_tl(s->A0, cpu_regs[R_EBX], s->T0);
4779    gen_lea_v_seg(s, s->A0, R_DS, s->override);
4780    gen_op_ld_v(s, MO_8, s->T0, s->A0);
4781}
4782
4783static void gen_XOR(DisasContext *s, X86DecodedInsn *decode)
4784{
4785    /* special case XOR reg, reg */
4786    if (decode->op[1].unit == X86_OP_INT &&
4787        decode->op[2].unit == X86_OP_INT &&
4788        decode->op[1].n == decode->op[2].n) {
4789        tcg_gen_movi_tl(s->T0, 0);
4790        decode->cc_op = CC_OP_EFLAGS;
4791        decode->cc_src = tcg_constant_tl(CC_Z | CC_P);
4792    } else {
4793        MemOp ot = decode->op[1].ot;
4794
4795        if (s->prefix & PREFIX_LOCK) {
4796            tcg_gen_atomic_xor_fetch_tl(s->T0, s->A0, s->T1,
4797                                        s->mem_index, ot | MO_LE);
4798        } else {
4799            tcg_gen_xor_tl(s->T0, s->T0, s->T1);
4800        }
4801        prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
4802    }
4803}
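
/*
 * About the special case above: xor reg, reg always produces 0, whose
 * flags are ZF = PF = 1 and SF = CF = OF = 0, hence the ready-made
 * CC_Z | CC_P constant for cc_src.
 */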
4804
4805static void gen_XRSTOR(DisasContext *s, X86DecodedInsn *decode)
4806{
4807    TCGv_i64 features = tcg_temp_new_i64();
4808
4809    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
4810    gen_helper_xrstor(tcg_env, s->A0, features);
4811    if (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_MPX) {
4812        /*
4813         * XRSTOR is how MPX is enabled, which changes how
4814         * we translate.  Thus we need to end the TB.
4815         */
4816        s->base.is_jmp = DISAS_EOB_NEXT;
4817    }
4818}
4819
4820static void gen_XSAVE(DisasContext *s, X86DecodedInsn *decode)
4821{
4822    TCGv_i64 features = tcg_temp_new_i64();
4823
4824    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
4825    gen_helper_xsave(tcg_env, s->A0, features);
4826}
4827
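/*
 * XSAVEOPT is architecturally allowed to write everything that XSAVE
 * writes (skipping unmodified state is an optimization, not a
 * requirement), so reusing the xsave helper here is a valid
 * implementation.
 */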
4828static void gen_XSAVEOPT(DisasContext *s, X86DecodedInsn *decode)
4829{
4830    TCGv_i64 features = tcg_temp_new_i64();
4831
4832    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
4833    gen_helper_xsave(tcg_env, s->A0, features);
4834}
4835