xref: /openbmc/qemu/target/i386/tcg/emit.c.inc (revision 2b74dd91)
1/*
2 * New-style TCG opcode generator for i386 instructions
3 *
4 *  Copyright (c) 2022 Red Hat, Inc.
5 *
6 * Author: Paolo Bonzini <pbonzini@redhat.com>
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 */
21
/*
 * Sometimes, knowing what the backend has can produce better code.
 * The exact opcode to check depends on 32- vs. 64-bit.
 *
 * Each _tl name below aliases the _i64 backend capability macro when
 * TARGET_X86_64 is defined, and the _i32 one otherwise.
 */
#ifdef TARGET_X86_64
#define TCG_TARGET_HAS_extract2_tl      TCG_TARGET_HAS_extract2_i64
#define TCG_TARGET_deposit_tl_valid     TCG_TARGET_deposit_i64_valid
#define TCG_TARGET_extract_tl_valid     TCG_TARGET_extract_i64_valid
#else
#define TCG_TARGET_HAS_extract2_tl      TCG_TARGET_HAS_extract2_i32
#define TCG_TARGET_deposit_tl_valid     TCG_TARGET_deposit_i32_valid
#define TCG_TARGET_extract_tl_valid     TCG_TARGET_extract_i32_valid
#endif
35
/*
 * Byte offset of fpregs[reg].mmx inside CPUX86State.  Asserts that reg
 * is in the range 0..7.  reg appears several times in the expansion, so
 * pass an expression without side effects.
 */
#define MMX_OFFSET(reg)                        \
  ({ assert((reg) >= 0 && (reg) <= 7);         \
     offsetof(CPUX86State, fpregs[reg].mmx); })
39
/*
 * Byte offset of xmm_regs[reg] inside CPUX86State.  Asserts that reg is
 * in the range 0..15.  reg appears several times in the expansion, so
 * pass an expression without side effects.
 */
#define ZMM_OFFSET(reg)                        \
  ({ assert((reg) >= 0 && (reg) <= 15);        \
     offsetof(CPUX86State, xmm_regs[reg]); })
43
/*
 * Function-pointer types for the helper generators dispatched from this
 * file.  The suffix after the second underscore mirrors the parameter
 * list as declared below: 'e' is the env pointer, each 'p' is a TCGv_ptr
 * register pointer, 'i' is a TCGv_i32 and 't' is a TCGv.  The character
 * before it describes the leading value parameter: '0' for none, 'i' for
 * a TCGv_i32 and 'l' for a TCGv_i64.
 */
typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b);
typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv_ptr reg_c);
typedef void (*SSEFunc_0_epppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                TCGv_ptr reg_c, TCGv_ptr reg_d);
typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv_i32 val);
typedef void (*SSEFunc_0_epppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                TCGv_ptr reg_c, TCGv_i32 val);
/* The two types below take no env pointer. */
typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val);
typedef void (*SSEFunc_0_pppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
                               TCGv_i32 val);
typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv val);
typedef void (*SSEFunc_0_epppti)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                 TCGv_ptr reg_c, TCGv a0, TCGv_i32 scale);
typedef void (*SSEFunc_0_eppppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                  TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 flags);
typedef void (*SSEFunc_0_eppppii)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                  TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 even,
                                  TCGv_i32 odd);
67
/* Forward declarations; the definitions are not in this part of the file. */
static void gen_JMP_m(DisasContext *s, X86DecodedInsn *decode);
static void gen_JMP(DisasContext *s, X86DecodedInsn *decode);
70
71static inline TCGv_i32 tcg_constant8u_i32(uint8_t val)
72{
73    return tcg_constant_i32(val);
74}
75
/* Generate exception EXCP07_PREX through gen_exception(). */
static void gen_NM_exception(DisasContext *s)
{
    gen_exception(s, EXCP07_PREX);
}
80
81static void gen_load_ea(DisasContext *s, AddressParts *mem, bool is_vsib)
82{
83    TCGv ea = gen_lea_modrm_1(s, *mem, is_vsib);
84    gen_lea_v_seg(s, ea, mem->def_seg, s->override);
85}
86
87static inline int mmx_offset(MemOp ot)
88{
89    switch (ot) {
90    case MO_8:
91        return offsetof(MMXReg, MMX_B(0));
92    case MO_16:
93        return offsetof(MMXReg, MMX_W(0));
94    case MO_32:
95        return offsetof(MMXReg, MMX_L(0));
96    case MO_64:
97        return offsetof(MMXReg, MMX_Q(0));
98    default:
99        g_assert_not_reached();
100    }
101}
102
103static inline int xmm_offset(MemOp ot)
104{
105    switch (ot) {
106    case MO_8:
107        return offsetof(ZMMReg, ZMM_B(0));
108    case MO_16:
109        return offsetof(ZMMReg, ZMM_W(0));
110    case MO_32:
111        return offsetof(ZMMReg, ZMM_L(0));
112    case MO_64:
113        return offsetof(ZMMReg, ZMM_Q(0));
114    case MO_128:
115        return offsetof(ZMMReg, ZMM_X(0));
116    case MO_256:
117        return offsetof(ZMMReg, ZMM_Y(0));
118    default:
119        g_assert_not_reached();
120    }
121}
122
123static int vector_reg_offset(X86DecodedOp *op)
124{
125    assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE);
126
127    if (op->unit == X86_OP_MMX) {
128        return op->offset - mmx_offset(op->ot);
129    } else {
130        return op->offset - xmm_offset(op->ot);
131    }
132}
133
134static int vector_elem_offset(X86DecodedOp *op, MemOp ot, int n)
135{
136    int base_ofs = vector_reg_offset(op);
137    switch(ot) {
138    case MO_8:
139        if (op->unit == X86_OP_MMX) {
140            return base_ofs + offsetof(MMXReg, MMX_B(n));
141        } else {
142            return base_ofs + offsetof(ZMMReg, ZMM_B(n));
143        }
144    case MO_16:
145        if (op->unit == X86_OP_MMX) {
146            return base_ofs + offsetof(MMXReg, MMX_W(n));
147        } else {
148            return base_ofs + offsetof(ZMMReg, ZMM_W(n));
149        }
150    case MO_32:
151        if (op->unit == X86_OP_MMX) {
152            return base_ofs + offsetof(MMXReg, MMX_L(n));
153        } else {
154            return base_ofs + offsetof(ZMMReg, ZMM_L(n));
155        }
156    case MO_64:
157        if (op->unit == X86_OP_MMX) {
158            return base_ofs;
159        } else {
160            return base_ofs + offsetof(ZMMReg, ZMM_Q(n));
161        }
162    case MO_128:
163        assert(op->unit == X86_OP_SSE);
164        return base_ofs + offsetof(ZMMReg, ZMM_X(n));
165    case MO_256:
166        assert(op->unit == X86_OP_SSE);
167        return base_ofs + offsetof(ZMMReg, ZMM_Y(n));
168    default:
169        g_assert_not_reached();
170    }
171}
172
173static void compute_mmx_offset(X86DecodedOp *op)
174{
175    if (!op->has_ea) {
176        op->offset = MMX_OFFSET(op->n) + mmx_offset(op->ot);
177    } else {
178        op->offset = offsetof(CPUX86State, mmx_t0) + mmx_offset(op->ot);
179    }
180}
181
182static void compute_xmm_offset(X86DecodedOp *op)
183{
184    if (!op->has_ea) {
185        op->offset = ZMM_OFFSET(op->n) + xmm_offset(op->ot);
186    } else {
187        op->offset = offsetof(CPUX86State, xmm_t0) + xmm_offset(op->ot);
188    }
189}
190
191static void gen_load_sse(DisasContext *s, TCGv temp, MemOp ot, int dest_ofs, bool aligned)
192{
193    switch(ot) {
194    case MO_8:
195        gen_op_ld_v(s, MO_8, temp, s->A0);
196        tcg_gen_st8_tl(temp, tcg_env, dest_ofs);
197        break;
198    case MO_16:
199        gen_op_ld_v(s, MO_16, temp, s->A0);
200        tcg_gen_st16_tl(temp, tcg_env, dest_ofs);
201        break;
202    case MO_32:
203        gen_op_ld_v(s, MO_32, temp, s->A0);
204        tcg_gen_st32_tl(temp, tcg_env, dest_ofs);
205        break;
206    case MO_64:
207        gen_ldq_env_A0(s, dest_ofs);
208        break;
209    case MO_128:
210        gen_ldo_env_A0(s, dest_ofs, aligned);
211        break;
212    case MO_256:
213        gen_ldy_env_A0(s, dest_ofs, aligned);
214        break;
215    default:
216        g_assert_not_reached();
217    }
218}
219
220static bool sse_needs_alignment(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
221{
222    switch (decode->e.vex_class) {
223    case 2:
224    case 4:
225        if ((s->prefix & PREFIX_VEX) ||
226            decode->e.vex_special == X86_VEX_SSEUnaligned) {
227            /* MOST legacy SSE instructions require aligned memory operands, but not all.  */
228            return false;
229        }
230        /* fall through */
231    case 1:
232        return ot >= MO_128;
233
234    default:
235        return false;
236    }
237}
238
/*
 * Load operand number opn of the decoded instruction.
 *
 * For segment, control/debug register, integer and immediate operands
 * the value is placed in v.  For MMX and SSE operands op->offset is
 * computed; a memory operand is additionally loaded from s->A0 into the
 * env location that op->offset designates (v is only used as a scratch
 * temporary there).  X86_OP_SKIP operands are left untouched.
 */
static void gen_load(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
{
    X86DecodedOp *op = &decode->op[opn];

    switch (op->unit) {
    case X86_OP_SKIP:
        return;
    case X86_OP_SEG:
        /* Read the selector field of segment register op->n. */
        tcg_gen_ld32u_tl(v, tcg_env,
                         offsetof(CPUX86State,segs[op->n].selector));
        break;
#ifndef CONFIG_USER_ONLY
    case X86_OP_CR:
        if (op->n == 8) {
            /* CR8 is read through a helper, after translator_io_start(). */
            translator_io_start(&s->base);
            gen_helper_read_cr8(v, tcg_env);
        } else {
            tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, cr[op->n]));
        }
        break;
    case X86_OP_DR:
        /* CR4.DE tested in the helper.  */
        gen_helper_get_dr(v, tcg_env, tcg_constant_i32(op->n));
        break;
#endif
    case X86_OP_INT:
        if (op->has_ea) {
            /*
             * Memory operand: sign-extend while loading only when the
             * destination is T0 and the entry asks for SExtT0.
             */
            if (v == s->T0 && decode->e.special == X86_SPECIAL_SExtT0) {
                gen_op_ld_v(s, op->ot | MO_SIGN, v, s->A0);
            } else {
                gen_op_ld_v(s, op->ot, v, s->A0);
            }

        } else if (op->ot == MO_8 && byte_reg_is_xH(s, op->n)) {
            /* High-byte register: bits 8..15 of cpu_regs[op->n - 4]. */
            if (v == s->T0 && decode->e.special == X86_SPECIAL_SExtT0) {
                tcg_gen_sextract_tl(v, cpu_regs[op->n - 4], 8, 8);
            } else {
                tcg_gen_extract_tl(v, cpu_regs[op->n - 4], 8, 8);
            }

        } else if (op->ot < MO_TL && v == s->T0 &&
                   (decode->e.special == X86_SPECIAL_SExtT0 ||
                    decode->e.special == X86_SPECIAL_ZExtT0)) {
            /* Narrow register into T0 with the requested extension. */
            if (decode->e.special == X86_SPECIAL_SExtT0) {
                tcg_gen_ext_tl(v, cpu_regs[op->n], op->ot | MO_SIGN);
            } else {
                tcg_gen_ext_tl(v, cpu_regs[op->n], op->ot);
            }

        } else {
            /* Otherwise copy the whole register. */
            tcg_gen_mov_tl(v, cpu_regs[op->n]);
        }
        break;
    case X86_OP_IMM:
        tcg_gen_movi_tl(v, op->imm);
        break;

    case X86_OP_MMX:
        compute_mmx_offset(op);
        goto load_vector;

    case X86_OP_SSE:
        compute_xmm_offset(op);
    load_vector:
        /* Register operands need no load; op->offset already names them. */
        if (op->has_ea) {
            bool aligned = sse_needs_alignment(s, decode, op->ot);
            gen_load_sse(s, v, op->ot, op->offset, aligned);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
313
314static TCGv_ptr op_ptr(X86DecodedInsn *decode, int opn)
315{
316    X86DecodedOp *op = &decode->op[opn];
317
318    assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE);
319    if (op->v_ptr) {
320        return op->v_ptr;
321    }
322    op->v_ptr = tcg_temp_new_ptr();
323
324    /* The temporary points to the MMXReg or ZMMReg.  */
325    tcg_gen_addi_ptr(op->v_ptr, tcg_env, vector_reg_offset(op));
326    return op->v_ptr;
327}
328
/*
 * Shorthands for op_ptr() on operands 0, 1 and 2.  They expand to a call
 * using a variable named 'decode', which must be in scope at the use site.
 */
#define OP_PTR0 op_ptr(decode, 0)
#define OP_PTR1 op_ptr(decode, 1)
#define OP_PTR2 op_ptr(decode, 2)
332
/*
 * Write the result v back to operand number opn of the decoded
 * instruction.  When the switch completes, the operand's unit is set to
 * X86_OP_SKIP, so a second call for the same operand does nothing.
 */
static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
{
    X86DecodedOp *op = &decode->op[opn];
    switch (op->unit) {
    case X86_OP_SKIP:
        break;
    case X86_OP_SEG:
        /* Note that gen_movl_seg takes care of interrupt shadow and TF.  */
        /* NOTE(review): this case passes s->T0 rather than v. */
        gen_movl_seg(s, op->n, s->T0);
        break;
    case X86_OP_INT:
        /* Memory destinations are stored at s->A0, registers by number. */
        if (op->has_ea) {
            gen_op_st_v(s, op->ot, v, s->A0);
        } else {
            gen_op_mov_reg_v(s, op->ot, op->n, v);
        }
        break;
    case X86_OP_MMX:
        /* Nothing is emitted for MMX destinations. */
        break;
    case X86_OP_SSE:
        /*
         * Register destination written under a VEX prefix with an operand
         * of at most 128 bits: store zero to the 16 bytes at ZMM_X(1) of
         * the destination register.
         */
        if (!op->has_ea && (s->prefix & PREFIX_VEX) && op->ot <= MO_128) {
            tcg_gen_gvec_dup_imm(MO_64,
                                 offsetof(CPUX86State, xmm_regs[op->n].ZMM_X(1)),
                                 16, 16, 0);
        }
        break;
#ifndef CONFIG_USER_ONLY
    case X86_OP_CR:
        if (op->n == 8) {
            translator_io_start(&s->base);
        }
        gen_helper_write_crN(tcg_env, tcg_constant_i32(op->n), v);
        /* Mark the translation as ending with DISAS_EOB_NEXT. */
        s->base.is_jmp = DISAS_EOB_NEXT;
        break;
    case X86_OP_DR:
        /* CR4.DE tested in the helper.  */
        gen_helper_set_dr(tcg_env, tcg_constant_i32(op->n), v);
        s->base.is_jmp = DISAS_EOB_NEXT;
        break;
#endif
    default:
        g_assert_not_reached();
    }
    op->unit = X86_OP_SKIP;
}
378
379static inline int vector_len(DisasContext *s, X86DecodedInsn *decode)
380{
381    if (decode->e.special == X86_SPECIAL_MMX &&
382        !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
383        return 8;
384    }
385    return s->vex_l ? 32 : 16;
386}
387
388static void prepare_update1_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op)
389{
390    decode->cc_dst = s->T0;
391    decode->cc_op = op;
392}
393
394static void prepare_update2_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op)
395{
396    decode->cc_src = s->T1;
397    decode->cc_dst = s->T0;
398    decode->cc_op = op;
399}
400
/*
 * Flags update used for INC/DEC-style operations: gen_compute_eflags_c()
 * is called with s->T1 as its destination first, and T1 then becomes
 * cc_src through prepare_update2_cc().
 */
static void prepare_update_cc_incdec(X86DecodedInsn *decode, DisasContext *s, CCOp op)
{
    gen_compute_eflags_c(s, s->T1);
    prepare_update2_cc(decode, s, op);
}
406
407static void prepare_update3_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op, TCGv reg)
408{
409    decode->cc_src2 = reg;
410    decode->cc_src = s->T1;
411    decode->cc_dst = s->T0;
412    decode->cc_op = op;
413}
414
415static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs)
416{
417    MemOp ot = decode->op[0].ot;
418    int vec_len = vector_len(s, decode);
419    bool aligned = sse_needs_alignment(s, decode, ot);
420
421    if (!decode->op[0].has_ea) {
422        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, vec_len, vec_len);
423        return;
424    }
425
426    switch (ot) {
427    case MO_64:
428        gen_stq_env_A0(s, src_ofs);
429        break;
430    case MO_128:
431        gen_sto_env_A0(s, src_ofs, aligned);
432        break;
433    case MO_256:
434        gen_sty_env_A0(s, src_ofs, aligned);
435        break;
436    default:
437        g_assert_not_reached();
438    }
439}
440
/*
 * Adapter giving PAVGUSB the two-register SSEFunc_0_epp shape used by
 * fns_3dnow[]: it calls gen_helper_pavgb_mmx() with reg_a as both the
 * destination and the first source.
 */
static void gen_helper_pavgusb(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b)
{
    gen_helper_pavgb_mmx(env, reg_a, reg_a, reg_b);
}
445
/*
 * Sentinel stored in fns_3dnow[] for opcodes that gen_3dnow() implements
 * as a plain 64-bit copy from operand 1 to operand 0.  It is never called.
 */
#define FN_3DNOW_MOVE ((SSEFunc_0_epp) (uintptr_t) 1)
/*
 * 3DNow! dispatch table, indexed by the opcode byte that gen_3dnow()
 * reads from decode->immediate.  Entries not listed are NULL.
 */
static const SSEFunc_0_epp fns_3dnow[] = {
    [0x0c] = gen_helper_pi2fw,
    [0x0d] = gen_helper_pi2fd,
    [0x1c] = gen_helper_pf2iw,
    [0x1d] = gen_helper_pf2id,
    [0x8a] = gen_helper_pfnacc,
    [0x8e] = gen_helper_pfpnacc,
    [0x90] = gen_helper_pfcmpge,
    [0x94] = gen_helper_pfmin,
    [0x96] = gen_helper_pfrcp,
    [0x97] = gen_helper_pfrsqrt,
    [0x9a] = gen_helper_pfsub,
    [0x9e] = gen_helper_pfadd,
    [0xa0] = gen_helper_pfcmpgt,
    [0xa4] = gen_helper_pfmax,
    [0xa6] = FN_3DNOW_MOVE, /* PFRCPIT1; no need to actually increase precision */
    [0xa7] = FN_3DNOW_MOVE, /* PFRSQIT1 */
    [0xb6] = FN_3DNOW_MOVE, /* PFRCPIT2 */
    [0xaa] = gen_helper_pfsubr,
    [0xae] = gen_helper_pfacc,
    [0xb0] = gen_helper_pfcmpeq,
    [0xb4] = gen_helper_pfmul,
    [0xb7] = gen_helper_pmulhrw_mmx,
    [0xbb] = gen_helper_pswapd,
    [0xbf] = gen_helper_pavgusb,
};
473
474static void gen_3dnow(DisasContext *s, X86DecodedInsn *decode)
475{
476    uint8_t b = decode->immediate;
477    SSEFunc_0_epp fn = b < ARRAY_SIZE(fns_3dnow) ? fns_3dnow[b] : NULL;
478
479    if (!fn) {
480        gen_illegal_opcode(s);
481        return;
482    }
483    if (s->flags & HF_TS_MASK) {
484        gen_NM_exception(s);
485        return;
486    }
487    if (s->flags & HF_EM_MASK) {
488        gen_illegal_opcode(s);
489        return;
490    }
491
492    gen_helper_enter_mmx(tcg_env);
493    if (fn == FN_3DNOW_MOVE) {
494       tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset);
495       tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset);
496    } else {
497       fn(tcg_env, OP_PTR0, OP_PTR1);
498    }
499}
500
501/*
502 * 00 = v*ps Vps, Hps, Wpd
503 * 66 = v*pd Vpd, Hpd, Wps
504 * f3 = v*ss Vss, Hss, Wps
505 * f2 = v*sd Vsd, Hsd, Wps
506 */
507static inline void gen_unary_fp_sse(DisasContext *s, X86DecodedInsn *decode,
508                              SSEFunc_0_epp pd_xmm, SSEFunc_0_epp ps_xmm,
509                              SSEFunc_0_epp pd_ymm, SSEFunc_0_epp ps_ymm,
510                              SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
511{
512    if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
513        SSEFunc_0_eppp fn = s->prefix & PREFIX_REPZ ? ss : sd;
514        if (!fn) {
515            gen_illegal_opcode(s);
516            return;
517        }
518        fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
519    } else {
520        SSEFunc_0_epp ps, pd, fn;
521        ps = s->vex_l ? ps_ymm : ps_xmm;
522        pd = s->vex_l ? pd_ymm : pd_xmm;
523        fn = s->prefix & PREFIX_DATA ? pd : ps;
524        if (!fn) {
525            gen_illegal_opcode(s);
526            return;
527        }
528        fn(tcg_env, OP_PTR0, OP_PTR2);
529    }
530}
/*
 * Define gen_<uname>() as a call to gen_unary_fp_sse() with the helpers
 * gen_helper_<lname>{pd,ps}_{xmm,ymm} and gen_helper_<lname>{sd,ss}.
 */
#define UNARY_FP_SSE(uname, lname)                                                 \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_unary_fp_sse(s, decode,                                                    \
                     gen_helper_##lname##pd_xmm,                                   \
                     gen_helper_##lname##ps_xmm,                                   \
                     gen_helper_##lname##pd_ymm,                                   \
                     gen_helper_##lname##ps_ymm,                                   \
                     gen_helper_##lname##sd,                                       \
                     gen_helper_##lname##ss);                                      \
}
UNARY_FP_SSE(VSQRT, sqrt)
543
544/*
545 * 00 = v*ps Vps, Hps, Wpd
546 * 66 = v*pd Vpd, Hpd, Wps
547 * f3 = v*ss Vss, Hss, Wps
548 * f2 = v*sd Vsd, Hsd, Wps
549 */
550static inline void gen_fp_sse(DisasContext *s, X86DecodedInsn *decode,
551                              SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
552                              SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm,
553                              SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
554{
555    SSEFunc_0_eppp ps, pd, fn;
556    if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
557        fn = s->prefix & PREFIX_REPZ ? ss : sd;
558    } else {
559        ps = s->vex_l ? ps_ymm : ps_xmm;
560        pd = s->vex_l ? pd_ymm : pd_xmm;
561        fn = s->prefix & PREFIX_DATA ? pd : ps;
562    }
563    if (fn) {
564        fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
565    } else {
566        gen_illegal_opcode(s);
567    }
568}
569
/*
 * Define gen_<uname>() as a call to gen_fp_sse() with the helpers
 * gen_helper_<lname>{pd,ps}_{xmm,ymm} and gen_helper_<lname>{sd,ss}.
 */
#define FP_SSE(uname, lname)                                                       \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_fp_sse(s, decode,                                                          \
               gen_helper_##lname##pd_xmm,                                         \
               gen_helper_##lname##ps_xmm,                                         \
               gen_helper_##lname##pd_ymm,                                         \
               gen_helper_##lname##ps_ymm,                                         \
               gen_helper_##lname##sd,                                             \
               gen_helper_##lname##ss);                                            \
}
FP_SSE(VADD, add)
FP_SSE(VMUL, mul)
FP_SSE(VSUB, sub)
FP_SSE(VMIN, min)
FP_SSE(VDIV, div)
FP_SSE(VMAX, max)
587
/*
 * Define gen_<uname>Px(), the packed form of a fused multiply-add.
 *
 * s->vex_w selects the pd (set) or ps (clear) helper and s->vex_l the
 * ymm (set) or xmm (clear) variant.  ptr0, ptr1 and ptr2 are the operand
 * pointer expressions passed after OP_PTR0.  The last two helper
 * arguments are 'even' and '(even) ^ (odd)', i.e. the second one is the
 * XOR of both flag sets, not 'odd' itself.
 */
#define FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, even, odd)                         \
static void gen_##uname##Px(DisasContext *s, X86DecodedInsn *decode)               \
{                                                                                  \
    SSEFunc_0_eppppii xmm = s->vex_w ? gen_helper_fma4pd_xmm : gen_helper_fma4ps_xmm; \
    SSEFunc_0_eppppii ymm = s->vex_w ? gen_helper_fma4pd_ymm : gen_helper_fma4ps_ymm; \
    SSEFunc_0_eppppii fn = s->vex_l ? ymm : xmm;                                   \
                                                                                   \
    fn(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                         \
       tcg_constant_i32(even),                                                     \
       tcg_constant_i32((even) ^ (odd)));                                          \
}
599
/*
 * Define both gen_<uname>Px() (through FMA_SSE_PACKED, using the same
 * flags for even and odd) and the scalar gen_<uname>Sx(), which calls
 * gen_helper_fma4sd when s->vex_w is set and gen_helper_fma4ss otherwise.
 *
 * In the instantiations below, the digits in the name determine the
 * order in which OP_PTR0, OP_PTR1 and OP_PTR2 are passed, and the flags
 * argument carries the float_muladd_* negation flags.
 */
#define FMA_SSE(uname, ptr0, ptr1, ptr2, flags)                                    \
FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, flags, flags)                              \
static void gen_##uname##Sx(DisasContext *s, X86DecodedInsn *decode)               \
{                                                                                  \
    SSEFunc_0_eppppi fn = s->vex_w ? gen_helper_fma4sd : gen_helper_fma4ss;        \
                                                                                   \
    fn(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                         \
       tcg_constant_i32(flags));                                                   \
}                                                                                  \

FMA_SSE(VFMADD231,  OP_PTR1, OP_PTR2, OP_PTR0, 0)
FMA_SSE(VFMADD213,  OP_PTR1, OP_PTR0, OP_PTR2, 0)
FMA_SSE(VFMADD132,  OP_PTR0, OP_PTR2, OP_PTR1, 0)

FMA_SSE(VFNMADD231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_product)
FMA_SSE(VFNMADD213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_product)
FMA_SSE(VFNMADD132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_product)

FMA_SSE(VFMSUB231,  OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c)
FMA_SSE(VFMSUB213,  OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c)
FMA_SSE(VFMSUB132,  OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c)

FMA_SSE(VFNMSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c|float_muladd_negate_product)

/* Packed-only forms where even and odd get different flags. */
FMA_SSE_PACKED(VFMADDSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c, 0)

FMA_SSE_PACKED(VFMSUBADD231, OP_PTR1, OP_PTR2, OP_PTR0, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD213, OP_PTR1, OP_PTR0, OP_PTR2, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0, float_muladd_negate_c)
633
/*
 * Define gen_<uname>() for the floating-point unpack instructions by
 * reusing the integer unpack helpers through gen_fp_sse().  Both scalar
 * helpers are NULL, so gen_fp_sse() generates an illegal opcode when an
 * F3 or F2 prefix is present.
 */
#define FP_UNPACK_SSE(uname, lname)                                                \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    /* PS maps to the DQ integer instruction, PD maps to QDQ.  */                  \
    gen_fp_sse(s, decode,                                                          \
               gen_helper_##lname##qdq_xmm,                                        \
               gen_helper_##lname##dq_xmm,                                         \
               gen_helper_##lname##qdq_ymm,                                        \
               gen_helper_##lname##dq_ymm,                                         \
               NULL, NULL);                                                        \
}
FP_UNPACK_SSE(VUNPCKLPx, punpckl)
FP_UNPACK_SSE(VUNPCKHPx, punpckh)
647
648/*
649 * 00 = v*ps Vps, Wpd
650 * f3 = v*ss Vss, Wps
651 */
652static inline void gen_unary_fp32_sse(DisasContext *s, X86DecodedInsn *decode,
653                                      SSEFunc_0_epp ps_xmm,
654                                      SSEFunc_0_epp ps_ymm,
655                                      SSEFunc_0_eppp ss)
656{
657    if ((s->prefix & (PREFIX_DATA | PREFIX_REPNZ)) != 0) {
658        goto illegal_op;
659    } else if (s->prefix & PREFIX_REPZ) {
660        if (!ss) {
661            goto illegal_op;
662        }
663        ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
664    } else {
665        SSEFunc_0_epp fn = s->vex_l ? ps_ymm : ps_xmm;
666        if (!fn) {
667            goto illegal_op;
668        }
669        fn(tcg_env, OP_PTR0, OP_PTR2);
670    }
671    return;
672
673illegal_op:
674    gen_illegal_opcode(s);
675}
/*
 * Define gen_<uname>() as a call to gen_unary_fp32_sse() with the
 * helpers gen_helper_<lname>ps_{xmm,ymm} and gen_helper_<lname>ss.
 */
#define UNARY_FP32_SSE(uname, lname)                                               \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_unary_fp32_sse(s, decode,                                                  \
                       gen_helper_##lname##ps_xmm,                                 \
                       gen_helper_##lname##ps_ymm,                                 \
                       gen_helper_##lname##ss);                                    \
}
UNARY_FP32_SSE(VRSQRT, rsqrt)
UNARY_FP32_SSE(VRCP, rcp)
686
687/*
688 * 66 = v*pd Vpd, Hpd, Wpd
689 * f2 = v*ps Vps, Hps, Wps
690 */
691static inline void gen_horizontal_fp_sse(DisasContext *s, X86DecodedInsn *decode,
692                                         SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
693                                         SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm)
694{
695    SSEFunc_0_eppp ps, pd, fn;
696    ps = s->vex_l ? ps_ymm : ps_xmm;
697    pd = s->vex_l ? pd_ymm : pd_xmm;
698    fn = s->prefix & PREFIX_DATA ? pd : ps;
699    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
700}
/*
 * Define gen_<uname>() as a call to gen_horizontal_fp_sse() with the
 * helpers gen_helper_<lname>{pd,ps}_{xmm,ymm}.
 */
#define HORIZONTAL_FP_SSE(uname, lname)                                            \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_horizontal_fp_sse(s, decode,                                               \
                          gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm,  \
                          gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \
}
HORIZONTAL_FP_SSE(VHADD, hadd)
HORIZONTAL_FP_SSE(VHSUB, hsub)
HORIZONTAL_FP_SSE(VADDSUB, addsub)
711
712static inline void gen_ternary_sse(DisasContext *s, X86DecodedInsn *decode,
713                                   int op3, SSEFunc_0_epppp xmm, SSEFunc_0_epppp ymm)
714{
715    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;
716    TCGv_ptr ptr3 = tcg_temp_new_ptr();
717
718    /* The format of the fourth input is Lx */
719    tcg_gen_addi_ptr(ptr3, tcg_env, ZMM_OFFSET(op3));
720    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, ptr3);
721}
/*
 * Define two generators sharing the gen_helper_<lname>_{xmm,ymm}
 * helpers:
 *   gen_<uvname>() takes the fourth register number from bits 7:4 of
 *                  the low byte of decode->immediate;
 *   gen_<uname>()  always uses register 0 as the fourth input.
 */
#define TERNARY_SSE(uname, uvname, lname)                                          \
static void gen_##uvname(DisasContext *s, X86DecodedInsn *decode)                  \
{                                                                                  \
    gen_ternary_sse(s, decode, (uint8_t)decode->immediate >> 4,                    \
                    gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);           \
}                                                                                  \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_ternary_sse(s, decode, 0,                                                  \
                  gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);             \
}
TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps)
TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd)
TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb)
736
737static inline void gen_binary_imm_sse(DisasContext *s, X86DecodedInsn *decode,
738                                      SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm)
739{
740    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
741    if (!s->vex_l) {
742        xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
743    } else {
744        ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
745    }
746}
747
/*
 * Define gen_<uname>() as a call to gen_binary_imm_sse() with the
 * helpers gen_helper_<lname>_xmm and gen_helper_<lname>_ymm.
 */
#define BINARY_IMM_SSE(uname, lname)                                               \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    gen_binary_imm_sse(s, decode,                                                  \
                       gen_helper_##lname##_xmm,                                   \
                       gen_helper_##lname##_ymm);                                  \
}

BINARY_IMM_SSE(VBLENDPD,   blendpd)
BINARY_IMM_SSE(VBLENDPS,   blendps)
BINARY_IMM_SSE(VPBLENDW,   pblendw)
/*
 * NOTE(review): the generator names below are spelled VDDPS/VDDPD while
 * the helpers are dpps/dppd; presumably code outside this view refers to
 * them by these names, so they are left unchanged.
 */
BINARY_IMM_SSE(VDDPS,      dpps)
/* There is no ymm helper for dppd; the macro needs the name to exist. */
#define gen_helper_dppd_ymm NULL
BINARY_IMM_SSE(VDDPD,      dppd)
BINARY_IMM_SSE(VMPSADBW,   mpsadbw)
BINARY_IMM_SSE(PCLMULQDQ,  pclmulqdq)
764
765
/*
 * Define gen_<uname>() as a single call to the gvec function 'func'.
 * The variadic arguments come first (here the element size), followed
 * by the offset of operand 0 as destination, the offset of operand 2 as
 * source, and vector_len() twice for the two size arguments.
 */
#define UNARY_INT_GVEC(uname, func, ...)                                           \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    int vec_len = vector_len(s, decode);                                          \
                                                                                   \
    func(__VA_ARGS__, decode->op[0].offset,                                        \
         decode->op[2].offset, vec_len, vec_len);                                  \
}
UNARY_INT_GVEC(PABSB,          tcg_gen_gvec_abs, MO_8)
UNARY_INT_GVEC(PABSW,          tcg_gen_gvec_abs, MO_16)
UNARY_INT_GVEC(PABSD,          tcg_gen_gvec_abs, MO_32)
UNARY_INT_GVEC(VBROADCASTx128, tcg_gen_gvec_dup_mem, MO_128)
UNARY_INT_GVEC(VPBROADCASTB,   tcg_gen_gvec_dup_mem, MO_8)
UNARY_INT_GVEC(VPBROADCASTW,   tcg_gen_gvec_dup_mem, MO_16)
UNARY_INT_GVEC(VPBROADCASTD,   tcg_gen_gvec_dup_mem, MO_32)
UNARY_INT_GVEC(VPBROADCASTQ,   tcg_gen_gvec_dup_mem, MO_64)
782
783
/*
 * Define gen_<uname>() as a single call to the gvec function 'func'.
 * The variadic arguments come first (an element size, preceded by a
 * TCG condition for the compare forms), followed by the offsets of
 * operands 0, 1 and 2 and by vector_len() twice for the two size
 * arguments.
 */
#define BINARY_INT_GVEC(uname, func, ...)                                          \
static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
{                                                                                  \
    int vec_len = vector_len(s, decode);                                          \
                                                                                   \
    func(__VA_ARGS__,                                                              \
         decode->op[0].offset, decode->op[1].offset,                               \
         decode->op[2].offset, vec_len, vec_len);                                  \
}

BINARY_INT_GVEC(PADDB,   tcg_gen_gvec_add, MO_8)
BINARY_INT_GVEC(PADDW,   tcg_gen_gvec_add, MO_16)
BINARY_INT_GVEC(PADDD,   tcg_gen_gvec_add, MO_32)
BINARY_INT_GVEC(PADDQ,   tcg_gen_gvec_add, MO_64)
BINARY_INT_GVEC(PADDSB,  tcg_gen_gvec_ssadd, MO_8)
BINARY_INT_GVEC(PADDSW,  tcg_gen_gvec_ssadd, MO_16)
BINARY_INT_GVEC(PADDUSB, tcg_gen_gvec_usadd, MO_8)
BINARY_INT_GVEC(PADDUSW, tcg_gen_gvec_usadd, MO_16)
BINARY_INT_GVEC(PAND,    tcg_gen_gvec_and, MO_64)
BINARY_INT_GVEC(PCMPEQB, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_8)
BINARY_INT_GVEC(PCMPEQD, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_32)
BINARY_INT_GVEC(PCMPEQW, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_16)
BINARY_INT_GVEC(PCMPEQQ, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_64)
BINARY_INT_GVEC(PCMPGTB, tcg_gen_gvec_cmp, TCG_COND_GT, MO_8)
BINARY_INT_GVEC(PCMPGTW, tcg_gen_gvec_cmp, TCG_COND_GT, MO_16)
BINARY_INT_GVEC(PCMPGTD, tcg_gen_gvec_cmp, TCG_COND_GT, MO_32)
BINARY_INT_GVEC(PCMPGTQ, tcg_gen_gvec_cmp, TCG_COND_GT, MO_64)
BINARY_INT_GVEC(PMAXSB,  tcg_gen_gvec_smax, MO_8)
BINARY_INT_GVEC(PMAXSW,  tcg_gen_gvec_smax, MO_16)
BINARY_INT_GVEC(PMAXSD,  tcg_gen_gvec_smax, MO_32)
BINARY_INT_GVEC(PMAXUB,  tcg_gen_gvec_umax, MO_8)
BINARY_INT_GVEC(PMAXUW,  tcg_gen_gvec_umax, MO_16)
BINARY_INT_GVEC(PMAXUD,  tcg_gen_gvec_umax, MO_32)
BINARY_INT_GVEC(PMINSB,  tcg_gen_gvec_smin, MO_8)
BINARY_INT_GVEC(PMINSW,  tcg_gen_gvec_smin, MO_16)
BINARY_INT_GVEC(PMINSD,  tcg_gen_gvec_smin, MO_32)
BINARY_INT_GVEC(PMINUB,  tcg_gen_gvec_umin, MO_8)
BINARY_INT_GVEC(PMINUW,  tcg_gen_gvec_umin, MO_16)
BINARY_INT_GVEC(PMINUD,  tcg_gen_gvec_umin, MO_32)
BINARY_INT_GVEC(PMULLW,  tcg_gen_gvec_mul, MO_16)
BINARY_INT_GVEC(PMULLD,  tcg_gen_gvec_mul, MO_32)
BINARY_INT_GVEC(POR,     tcg_gen_gvec_or, MO_64)
BINARY_INT_GVEC(PSUBB,   tcg_gen_gvec_sub, MO_8)
BINARY_INT_GVEC(PSUBW,   tcg_gen_gvec_sub, MO_16)
BINARY_INT_GVEC(PSUBD,   tcg_gen_gvec_sub, MO_32)
BINARY_INT_GVEC(PSUBQ,   tcg_gen_gvec_sub, MO_64)
BINARY_INT_GVEC(PSUBSB,  tcg_gen_gvec_sssub, MO_8)
BINARY_INT_GVEC(PSUBSW,  tcg_gen_gvec_sssub, MO_16)
832BINARY_INT_GVEC(PSUBUSB, tcg_gen_gvec_ussub, MO_8)
833BINARY_INT_GVEC(PSUBUSW, tcg_gen_gvec_ussub, MO_16)
834BINARY_INT_GVEC(PXOR,    tcg_gen_gvec_xor, MO_64)
835
836
837/*
838 * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
839 * 66 = vp* Vx, Hx, Wx
840 *
841 * These are really the same encoding, because 1) V is the same as P when VEX.V
842 * is not present 2) P and Q are the same as H and W apart from MM/XMM
843 */
844static inline void gen_binary_int_sse(DisasContext *s, X86DecodedInsn *decode,
845                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
846{
847    assert(!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
848
849    if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
850        /* VEX encoding is not applicable to MMX instructions.  */
851        gen_illegal_opcode(s);
852        return;
853    }
854    if (!(s->prefix & PREFIX_DATA)) {
855        mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
856    } else if (!s->vex_l) {
857        xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
858    } else {
859        ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
860    }
861}
862
863
864#define BINARY_INT_MMX(uname, lname)                                               \
865static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
866{                                                                                  \
867    gen_binary_int_sse(s, decode,                                                  \
868                          gen_helper_##lname##_mmx,                                \
869                          gen_helper_##lname##_xmm,                                \
870                          gen_helper_##lname##_ymm);                               \
871}
872BINARY_INT_MMX(PUNPCKLBW,  punpcklbw)
873BINARY_INT_MMX(PUNPCKLWD,  punpcklwd)
874BINARY_INT_MMX(PUNPCKLDQ,  punpckldq)
875BINARY_INT_MMX(PACKSSWB,   packsswb)
876BINARY_INT_MMX(PACKUSWB,   packuswb)
877BINARY_INT_MMX(PUNPCKHBW,  punpckhbw)
878BINARY_INT_MMX(PUNPCKHWD,  punpckhwd)
879BINARY_INT_MMX(PUNPCKHDQ,  punpckhdq)
880BINARY_INT_MMX(PACKSSDW,   packssdw)
881
882BINARY_INT_MMX(PAVGB,   pavgb)
883BINARY_INT_MMX(PAVGW,   pavgw)
884BINARY_INT_MMX(PMADDWD, pmaddwd)
885BINARY_INT_MMX(PMULHUW, pmulhuw)
886BINARY_INT_MMX(PMULHW,  pmulhw)
887BINARY_INT_MMX(PMULUDQ, pmuludq)
888BINARY_INT_MMX(PSADBW,  psadbw)
889
890BINARY_INT_MMX(PSLLW_r, psllw)
891BINARY_INT_MMX(PSLLD_r, pslld)
892BINARY_INT_MMX(PSLLQ_r, psllq)
893BINARY_INT_MMX(PSRLW_r, psrlw)
894BINARY_INT_MMX(PSRLD_r, psrld)
895BINARY_INT_MMX(PSRLQ_r, psrlq)
896BINARY_INT_MMX(PSRAW_r, psraw)
897BINARY_INT_MMX(PSRAD_r, psrad)
898
899BINARY_INT_MMX(PHADDW,    phaddw)
900BINARY_INT_MMX(PHADDSW,   phaddsw)
901BINARY_INT_MMX(PHADDD,    phaddd)
902BINARY_INT_MMX(PHSUBW,    phsubw)
903BINARY_INT_MMX(PHSUBSW,   phsubsw)
904BINARY_INT_MMX(PHSUBD,    phsubd)
905BINARY_INT_MMX(PMADDUBSW, pmaddubsw)
906BINARY_INT_MMX(PSHUFB,    pshufb)
907BINARY_INT_MMX(PSIGNB,    psignb)
908BINARY_INT_MMX(PSIGNW,    psignw)
909BINARY_INT_MMX(PSIGND,    psignd)
910BINARY_INT_MMX(PMULHRSW,  pmulhrsw)
911
912/* Instructions with no MMX equivalent.  */
913#define BINARY_INT_SSE(uname, lname)                                               \
914static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
915{                                                                                  \
916    gen_binary_int_sse(s, decode,                                                  \
917                          NULL,                                                    \
918                          gen_helper_##lname##_xmm,                                \
919                          gen_helper_##lname##_ymm);                               \
920}
921
922/* Instructions with no MMX equivalent.  */
923BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
924BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
925BINARY_INT_SSE(VPACKUSDW,  packusdw)
926BINARY_INT_SSE(VPERMILPS,  vpermilps)
927BINARY_INT_SSE(VPERMILPD,  vpermilpd)
928BINARY_INT_SSE(VMASKMOVPS, vpmaskmovd)
929BINARY_INT_SSE(VMASKMOVPD, vpmaskmovq)
930
931BINARY_INT_SSE(PMULDQ,    pmuldq)
932
933BINARY_INT_SSE(VAESDEC, aesdec)
934BINARY_INT_SSE(VAESDECLAST, aesdeclast)
935BINARY_INT_SSE(VAESENC, aesenc)
936BINARY_INT_SSE(VAESENCLAST, aesenclast)
937
938#define UNARY_CMP_SSE(uname, lname)                                                \
939static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
940{                                                                                  \
941    if (!s->vex_l) {                                                               \
942        gen_helper_##lname##_xmm(tcg_env, OP_PTR1, OP_PTR2);                       \
943    } else {                                                                       \
944        gen_helper_##lname##_ymm(tcg_env, OP_PTR1, OP_PTR2);                       \
945    }                                                                              \
946    assume_cc_op(s, CC_OP_EFLAGS);                                                  \
947}
948UNARY_CMP_SSE(VPTEST,     ptest)
949UNARY_CMP_SSE(VTESTPS,    vtestps)
950UNARY_CMP_SSE(VTESTPD,    vtestpd)
951
952static inline void gen_unary_int_sse(DisasContext *s, X86DecodedInsn *decode,
953                                     SSEFunc_0_epp xmm, SSEFunc_0_epp ymm)
954{
955    if (!s->vex_l) {
956        xmm(tcg_env, OP_PTR0, OP_PTR2);
957    } else {
958        ymm(tcg_env, OP_PTR0, OP_PTR2);
959    }
960}
961
962#define UNARY_INT_SSE(uname, lname)                                                \
963static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
964{                                                                                  \
965    gen_unary_int_sse(s, decode,                                                   \
966                      gen_helper_##lname##_xmm,                                    \
967                      gen_helper_##lname##_ymm);                                   \
968}
969
970UNARY_INT_SSE(VPMOVSXBW,    pmovsxbw)
971UNARY_INT_SSE(VPMOVSXBD,    pmovsxbd)
972UNARY_INT_SSE(VPMOVSXBQ,    pmovsxbq)
973UNARY_INT_SSE(VPMOVSXWD,    pmovsxwd)
974UNARY_INT_SSE(VPMOVSXWQ,    pmovsxwq)
975UNARY_INT_SSE(VPMOVSXDQ,    pmovsxdq)
976
977UNARY_INT_SSE(VPMOVZXBW,    pmovzxbw)
978UNARY_INT_SSE(VPMOVZXBD,    pmovzxbd)
979UNARY_INT_SSE(VPMOVZXBQ,    pmovzxbq)
980UNARY_INT_SSE(VPMOVZXWD,    pmovzxwd)
981UNARY_INT_SSE(VPMOVZXWQ,    pmovzxwq)
982UNARY_INT_SSE(VPMOVZXDQ,    pmovzxdq)
983
984UNARY_INT_SSE(VMOVSLDUP,    pmovsldup)
985UNARY_INT_SSE(VMOVSHDUP,    pmovshdup)
986UNARY_INT_SSE(VMOVDDUP,     pmovdldup)
987
988UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
989UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
990UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
991UNARY_INT_SSE(VCVTDQ2PS, cvtdq2ps)
992UNARY_INT_SSE(VCVTPS2DQ, cvtps2dq)
993UNARY_INT_SSE(VCVTTPS2DQ, cvttps2dq)
994UNARY_INT_SSE(VCVTPH2PS, cvtph2ps)
995
996
997static inline void gen_unary_imm_sse(DisasContext *s, X86DecodedInsn *decode,
998                                     SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
999{
1000    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1001    if (!s->vex_l) {
1002        xmm(OP_PTR0, OP_PTR1, imm);
1003    } else {
1004        ymm(OP_PTR0, OP_PTR1, imm);
1005    }
1006}
1007
1008#define UNARY_IMM_SSE(uname, lname)                                                \
1009static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
1010{                                                                                  \
1011    gen_unary_imm_sse(s, decode,                                                   \
1012                      gen_helper_##lname##_xmm,                                    \
1013                      gen_helper_##lname##_ymm);                                   \
1014}
1015
1016UNARY_IMM_SSE(PSHUFD,     pshufd)
1017UNARY_IMM_SSE(PSHUFHW,    pshufhw)
1018UNARY_IMM_SSE(PSHUFLW,    pshuflw)
1019#define gen_helper_vpermq_xmm NULL
1020UNARY_IMM_SSE(VPERMQ,      vpermq)
1021UNARY_IMM_SSE(VPERMILPS_i, vpermilps_imm)
1022UNARY_IMM_SSE(VPERMILPD_i, vpermilpd_imm)
1023
1024static inline void gen_unary_imm_fp_sse(DisasContext *s, X86DecodedInsn *decode,
1025                                        SSEFunc_0_eppi xmm, SSEFunc_0_eppi ymm)
1026{
1027    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1028    if (!s->vex_l) {
1029        xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
1030    } else {
1031        ymm(tcg_env, OP_PTR0, OP_PTR1, imm);
1032    }
1033}
1034
1035#define UNARY_IMM_FP_SSE(uname, lname)                                             \
1036static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
1037{                                                                                  \
1038    gen_unary_imm_fp_sse(s, decode,                                                \
1039                      gen_helper_##lname##_xmm,                                    \
1040                      gen_helper_##lname##_ymm);                                   \
1041}
1042
1043UNARY_IMM_FP_SSE(VROUNDPS,    roundps)
1044UNARY_IMM_FP_SSE(VROUNDPD,    roundpd)
1045
1046static inline void gen_vexw_avx(DisasContext *s, X86DecodedInsn *decode,
1047                                SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm,
1048                                SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm)
1049{
1050    SSEFunc_0_eppp d = s->vex_l ? d_ymm : d_xmm;
1051    SSEFunc_0_eppp q = s->vex_l ? q_ymm : q_xmm;
1052    SSEFunc_0_eppp fn = s->vex_w ? q : d;
1053    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
1054}
1055
1056/* VEX.W affects whether to operate on 32- or 64-bit elements.  */
1057#define VEXW_AVX(uname, lname)                                                     \
1058static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
1059{                                                                                  \
1060    gen_vexw_avx(s, decode,                                                        \
1061                 gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
1062                 gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
1063}
1064VEXW_AVX(VPSLLV,    vpsllv)
1065VEXW_AVX(VPSRLV,    vpsrlv)
1066VEXW_AVX(VPSRAV,    vpsrav)
1067VEXW_AVX(VPMASKMOV, vpmaskmov)
1068
1069/* Same as above, but with extra arguments to the helper.  */
1070static inline void gen_vsib_avx(DisasContext *s, X86DecodedInsn *decode,
1071                                SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
1072                                SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
1073{
1074    SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm;
1075    SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm;
1076    SSEFunc_0_epppti fn = s->vex_w ? q : d;
1077    TCGv_i32 scale = tcg_constant_i32(decode->mem.scale);
1078    TCGv_ptr index = tcg_temp_new_ptr();
1079
1080    /* Pass third input as (index, base, scale) */
1081    tcg_gen_addi_ptr(index, tcg_env, ZMM_OFFSET(decode->mem.index));
1082    fn(tcg_env, OP_PTR0, OP_PTR1, index, s->A0, scale);
1083
1084    /*
1085     * There are two output operands, so zero OP1's high 128 bits
1086     * in the VEX.128 case.
1087     */
1088    if (!s->vex_l) {
1089        int ymmh_ofs = vector_elem_offset(&decode->op[1], MO_128, 1);
1090        tcg_gen_gvec_dup_imm(MO_64, ymmh_ofs, 16, 16, 0);
1091    }
1092}
1093#define VSIB_AVX(uname, lname)                                                     \
1094static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
1095{                                                                                  \
1096    gen_vsib_avx(s, decode,                                                        \
1097                 gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
1098                 gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
1099}
1100VSIB_AVX(VPGATHERD, vpgatherd)
1101VSIB_AVX(VPGATHERQ, vpgatherq)
1102
/*
 * AAA: call gen_update_cc_op() before invoking the aaa helper on tcg_env,
 * then record CC_OP_EFLAGS as the current cc_op via assume_cc_op().
 */
static void gen_AAA(DisasContext *s, X86DecodedInsn *decode)
{
    gen_update_cc_op(s);
    gen_helper_aaa(tcg_env);
    assume_cc_op(s, CC_OP_EFLAGS);
}
1109
/*
 * AAD: invoke the aad helper with T0 as first argument (presumably the
 * destination -- confirm against the helper declaration) and T0, T1 as
 * inputs, then prepare flags with CC_OP_LOGICB.
 */
static void gen_AAD(DisasContext *s, X86DecodedInsn *decode)
{
    gen_helper_aad(s->T0, s->T0, s->T1);
    prepare_update1_cc(decode, s, CC_OP_LOGICB);
}
1115
1116static void gen_AAM(DisasContext *s, X86DecodedInsn *decode)
1117{
1118    if (decode->immediate == 0) {
1119        gen_exception(s, EXCP00_DIVZ);
1120    } else {
1121        gen_helper_aam(s->T0, s->T0, s->T1);
1122        prepare_update1_cc(decode, s, CC_OP_LOGICB);
1123    }
1124}
1125
/*
 * AAS: call gen_update_cc_op() before invoking the aas helper on tcg_env,
 * then record CC_OP_EFLAGS as the current cc_op via assume_cc_op().
 */
static void gen_AAS(DisasContext *s, X86DecodedInsn *decode)
{
    gen_update_cc_op(s);
    gen_helper_aas(tcg_env);
    assume_cc_op(s, CC_OP_EFLAGS);
}
1132
1133static void gen_ADC(DisasContext *s, X86DecodedInsn *decode)
1134{
1135    MemOp ot = decode->op[1].ot;
1136    TCGv c_in = tcg_temp_new();
1137
1138    gen_compute_eflags_c(s, c_in);
1139    if (s->prefix & PREFIX_LOCK) {
1140        tcg_gen_add_tl(s->T0, c_in, s->T1);
1141        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T0,
1142                                    s->mem_index, ot | MO_LE);
1143    } else {
1144        tcg_gen_add_tl(s->T0, s->T0, s->T1);
1145        tcg_gen_add_tl(s->T0, s->T0, c_in);
1146    }
1147    prepare_update3_cc(decode, s, CC_OP_ADCB + ot, c_in);
1148}
1149
/*
 * Common code for gen_ADCX() and gen_ADOX(); cc_op is CC_OP_ADCX or
 * CC_OP_ADOX respectively.
 *
 * The carry-out is placed in a fresh temporary stored in decode->cc_dst
 * for CC_OP_ADCX and in decode->cc_src2 otherwise.  The carry-in comes
 * from cpu_cc_dst/cpu_cc_src2 when the current s->cc_op already tracks it,
 * or is extracted from the CC_C (ADCX) or CC_O (ADOX) bit of
 * decode->cc_src.  The sum is left in s->T0.
 */
static void gen_ADCOX(DisasContext *s, X86DecodedInsn *decode, int cc_op)
{
    MemOp ot = decode->op[0].ot;
    TCGv carry_in = NULL;
    TCGv *carry_out = (cc_op == CC_OP_ADCX ? &decode->cc_dst : &decode->cc_src2);
    TCGv zero;

    decode->cc_op = cc_op;
    *carry_out = tcg_temp_new();
    if (CC_OP_HAS_EFLAGS(s->cc_op)) {
        decode->cc_src = cpu_cc_src;

        /* Re-use the carry-out from a previous round?  */
        if (s->cc_op == cc_op || s->cc_op == CC_OP_ADCOX) {
            carry_in = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2);
        }

        /* Preserve the opposite carry from previous rounds?  */
        if (s->cc_op != cc_op && s->cc_op != CC_OP_EFLAGS) {
            decode->cc_op = CC_OP_ADCOX;
            if (carry_out == &decode->cc_dst) {
                decode->cc_src2 = cpu_cc_src2;
            } else {
                decode->cc_dst = cpu_cc_dst;
            }
        }
    } else {
        /* Obtain the flags into a new temporary via gen_mov_eflags().  */
        decode->cc_src = tcg_temp_new();
        gen_mov_eflags(s, decode->cc_src);
    }

    if (!carry_in) {
        /* Get carry_in out of EFLAGS.  */
        carry_in = tcg_temp_new();
        tcg_gen_extract_tl(carry_in, decode->cc_src,
            ctz32(cc_op == CC_OP_ADCX ? CC_C : CC_O), 1);
    }

    switch (ot) {
#ifdef TARGET_X86_64
    case MO_32:
        /* If TL is 64-bit just do everything in 64-bit arithmetic.  */
        tcg_gen_ext32u_tl(s->T0, s->T0);
        tcg_gen_ext32u_tl(s->T1, s->T1);
        tcg_gen_add_i64(s->T0, s->T0, s->T1);
        tcg_gen_add_i64(s->T0, s->T0, carry_in);
        /* Bit 32 of the 64-bit sum is the carry-out.  */
        tcg_gen_shri_i64(*carry_out, s->T0, 32);
        break;
#endif
    default:
        /* Two double-word additions: T0 + carry_in, then + T1.  */
        zero = tcg_constant_tl(0);
        tcg_gen_add2_tl(s->T0, *carry_out, s->T0, zero, carry_in, zero);
        tcg_gen_add2_tl(s->T0, *carry_out, s->T0, *carry_out, s->T1, zero);
        break;
    }
}
1206
/* ADCX: gen_ADCOX() with CC_OP_ADCX.  */
static void gen_ADCX(DisasContext *s, X86DecodedInsn *decode)
{
    gen_ADCOX(s, decode, CC_OP_ADCX);
}
1211
1212static void gen_ADD(DisasContext *s, X86DecodedInsn *decode)
1213{
1214    MemOp ot = decode->op[1].ot;
1215
1216    if (s->prefix & PREFIX_LOCK) {
1217        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T1,
1218                                    s->mem_index, ot | MO_LE);
1219    } else {
1220        tcg_gen_add_tl(s->T0, s->T0, s->T1);
1221    }
1222    prepare_update2_cc(decode, s, CC_OP_ADDB + ot);
1223}
1224
/* ADOX: gen_ADCOX() with CC_OP_ADOX.  */
static void gen_ADOX(DisasContext *s, X86DecodedInsn *decode)
{
    gen_ADCOX(s, decode, CC_OP_ADOX);
}
1229
1230static void gen_AND(DisasContext *s, X86DecodedInsn *decode)
1231{
1232    MemOp ot = decode->op[1].ot;
1233
1234    if (s->prefix & PREFIX_LOCK) {
1235        tcg_gen_atomic_and_fetch_tl(s->T0, s->A0, s->T1,
1236                                    s->mem_index, ot | MO_LE);
1237    } else {
1238        tcg_gen_and_tl(s->T0, s->T0, s->T1);
1239    }
1240    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
1241}
1242
1243static void gen_ANDN(DisasContext *s, X86DecodedInsn *decode)
1244{
1245    MemOp ot = decode->op[0].ot;
1246
1247    tcg_gen_andc_tl(s->T0, s->T1, s->T0);
1248    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
1249}
1250
1251static void gen_ARPL(DisasContext *s, X86DecodedInsn *decode)
1252{
1253    TCGv zf = tcg_temp_new();
1254    TCGv flags = tcg_temp_new();
1255
1256    gen_mov_eflags(s, flags);
1257
1258    /* Compute adjusted DST in T1, merging in SRC[RPL].  */
1259    tcg_gen_deposit_tl(s->T1, s->T0, s->T1, 0, 2);
1260
1261    /* Z flag set if DST[RPL] < SRC[RPL] */
1262    tcg_gen_setcond_tl(TCG_COND_LTU, zf, s->T0, s->T1);
1263    tcg_gen_deposit_tl(flags, flags, zf, ctz32(CC_Z), 1);
1264
1265    /* Place maximum RPL in DST */
1266    tcg_gen_umax_tl(s->T0, s->T0, s->T1);
1267
1268    decode->cc_src = flags;
1269    decode->cc_op = CC_OP_EFLAGS;
1270}
1271
1272static void gen_BEXTR(DisasContext *s, X86DecodedInsn *decode)
1273{
1274    MemOp ot = decode->op[0].ot;
1275    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
1276    TCGv zero = tcg_constant_tl(0);
1277    TCGv mone = tcg_constant_tl(-1);
1278
1279    /*
1280     * Extract START, and shift the operand.
1281     * Shifts larger than operand size get zeros.
1282     */
1283    tcg_gen_ext8u_tl(s->A0, s->T1);
1284    tcg_gen_shr_tl(s->T0, s->T0, s->A0);
1285
1286    tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound, s->T0, zero);
1287
1288    /*
1289     * Extract the LEN into an inverse mask.  Lengths larger than
1290     * operand size get all zeros, length 0 gets all ones.
1291     */
1292    tcg_gen_extract_tl(s->A0, s->T1, 8, 8);
1293    tcg_gen_shl_tl(s->T1, mone, s->A0);
1294    tcg_gen_movcond_tl(TCG_COND_LEU, s->T1, s->A0, bound, s->T1, zero);
1295    tcg_gen_andc_tl(s->T0, s->T0, s->T1);
1296
1297    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
1298}
1299
1300static void gen_BLSI(DisasContext *s, X86DecodedInsn *decode)
1301{
1302    MemOp ot = decode->op[0].ot;
1303
1304    /* input in T1, which is ready for prepare_update2_cc  */
1305    tcg_gen_neg_tl(s->T0, s->T1);
1306    tcg_gen_and_tl(s->T0, s->T0, s->T1);
1307    prepare_update2_cc(decode, s, CC_OP_BLSIB + ot);
1308}
1309
1310static void gen_BLSMSK(DisasContext *s, X86DecodedInsn *decode)
1311{
1312    MemOp ot = decode->op[0].ot;
1313
1314    /* input in T1, which is ready for prepare_update2_cc  */
1315    tcg_gen_subi_tl(s->T0, s->T1, 1);
1316    tcg_gen_xor_tl(s->T0, s->T0, s->T1);
1317    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
1318}
1319
1320static void gen_BLSR(DisasContext *s, X86DecodedInsn *decode)
1321{
1322    MemOp ot = decode->op[0].ot;
1323
1324    /* input in T1, which is ready for prepare_update2_cc  */
1325    tcg_gen_subi_tl(s->T0, s->T1, 1);
1326    tcg_gen_and_tl(s->T0, s->T0, s->T1);
1327    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
1328}
1329
1330static void gen_BOUND(DisasContext *s, X86DecodedInsn *decode)
1331{
1332    TCGv_i32 op = tcg_temp_new_i32();
1333    tcg_gen_trunc_tl_i32(op, s->T0);
1334    if (decode->op[1].ot == MO_16) {
1335        gen_helper_boundw(tcg_env, s->A0, op);
1336    } else {
1337        gen_helper_boundl(tcg_env, s->A0, op);
1338    }
1339}
1340
1341/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output.  */
1342static void gen_BSF(DisasContext *s, X86DecodedInsn *decode)
1343{
1344    MemOp ot = decode->op[0].ot;
1345
1346    /* Only the Z bit is defined and it is related to the input.  */
1347    decode->cc_dst = tcg_temp_new();
1348    decode->cc_op = CC_OP_LOGICB + ot;
1349    tcg_gen_mov_tl(decode->cc_dst, s->T0);
1350
1351    /*
1352     * The manual says that the output is undefined when the
1353     * input is zero, but real hardware leaves it unchanged, and
1354     * real programs appear to depend on that.  Accomplish this
1355     * by passing the output as the value to return upon zero.
1356     */
1357    tcg_gen_ctz_tl(s->T0, s->T0, s->T1);
1358}
1359
1360/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output.  */
1361static void gen_BSR(DisasContext *s, X86DecodedInsn *decode)
1362{
1363    MemOp ot = decode->op[0].ot;
1364
1365    /* Only the Z bit is defined and it is related to the input.  */
1366    decode->cc_dst = tcg_temp_new();
1367    decode->cc_op = CC_OP_LOGICB + ot;
1368    tcg_gen_mov_tl(decode->cc_dst, s->T0);
1369
1370    /*
1371     * The manual says that the output is undefined when the
1372     * input is zero, but real hardware leaves it unchanged, and
1373     * real programs appear to depend on that.  Accomplish this
1374     * by passing the output as the value to return upon zero.
1375     * Plus, return the bit index of the first 1 bit.
1376     */
1377    tcg_gen_xori_tl(s->T1, s->T1, TARGET_LONG_BITS - 1);
1378    tcg_gen_clz_tl(s->T0, s->T0, s->T1);
1379    tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
1380}
1381
1382static void gen_BSWAP(DisasContext *s, X86DecodedInsn *decode)
1383{
1384#ifdef TARGET_X86_64
1385    if (s->dflag == MO_64) {
1386        tcg_gen_bswap64_i64(s->T0, s->T0);
1387        return;
1388    }
1389#endif
1390    tcg_gen_bswap32_tl(s->T0, s->T0, TCG_BSWAP_OZ);
1391}
1392
1393static void gen_BZHI(DisasContext *s, X86DecodedInsn *decode)
1394{
1395    MemOp ot = decode->op[0].ot;
1396    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
1397    TCGv zero = tcg_constant_tl(0);
1398    TCGv mone = tcg_constant_tl(-1);
1399
1400    tcg_gen_ext8u_tl(s->T1, s->T1);
1401
1402    tcg_gen_shl_tl(s->A0, mone, s->T1);
1403    tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->T1, bound, s->A0, zero);
1404    tcg_gen_andc_tl(s->T0, s->T0, s->A0);
1405    /*
1406     * Note that since we're using BMILG (in order to get O
1407     * cleared) we need to store the inverse into C.
1408     */
1409    tcg_gen_setcond_tl(TCG_COND_LEU, s->T1, s->T1, bound);
1410    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
1411}
1412
/* CALL: push the value of eip_next_tl(), then continue as gen_JMP().  */
static void gen_CALL(DisasContext *s, X86DecodedInsn *decode)
{
    gen_push_v(s, eip_next_tl(s));
    gen_JMP(s, decode);
}
1418
/* CALL (indirect): push the value of eip_next_tl(), then continue as gen_JMP_m().  */
static void gen_CALL_m(DisasContext *s, X86DecodedInsn *decode)
{
    gen_push_v(s, eip_next_tl(s));
    gen_JMP_m(s, decode);
}
1424
/* Far CALL: all of the work is delegated to gen_far_call().  */
static void gen_CALLF(DisasContext *s, X86DecodedInsn *decode)
{
    gen_far_call(s);
}
1429
/*
 * Far CALL through memory: load an ot-sized value from A0 into T0, advance
 * A0 by the size of that value (1 << ot bytes), load a 16-bit value into
 * T1, then delegate to gen_far_call().
 */
static void gen_CALLF_m(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[1].ot;

    gen_op_ld_v(s, ot, s->T0, s->A0);
    gen_add_A0_im(s, 1 << ot);
    gen_op_ld_v(s, MO_16, s->T1, s->A0);
    gen_far_call(s);
}
1439
1440static void gen_CBW(DisasContext *s, X86DecodedInsn *decode)
1441{
1442    MemOp src_ot = decode->op[0].ot - 1;
1443
1444    tcg_gen_ext_tl(s->T0, s->T0, src_ot | MO_SIGN);
1445}
1446
/* CLC: after gen_compute_eflags(), clear the CC_C bit in cpu_cc_src.  */
static void gen_CLC(DisasContext *s, X86DecodedInsn *decode)
{
    gen_compute_eflags(s);
    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_C);
}
1452
/* CLD: store the 32-bit constant 1 into the df field of CPUX86State.  */
static void gen_CLD(DisasContext *s, X86DecodedInsn *decode)
{
    tcg_gen_st_i32(tcg_constant_i32(1), tcg_env, offsetof(CPUX86State, df));
}
1457
/* CLI: reset IF_MASK in the flags via gen_reset_eflags().  */
static void gen_CLI(DisasContext *s, X86DecodedInsn *decode)
{
    gen_reset_eflags(s, IF_MASK);
}
1462
/* CLTS: invoke the clts helper, then end the translation block.  */
static void gen_CLTS(DisasContext *s, X86DecodedInsn *decode)
{
    gen_helper_clts(tcg_env);
    /* abort block because static cpu state changed */
    s->base.is_jmp = DISAS_EOB_NEXT;
}
1469
/* CMC: after gen_compute_eflags(), toggle the CC_C bit in cpu_cc_src.  */
static void gen_CMC(DisasContext *s, X86DecodedInsn *decode)
{
    gen_compute_eflags(s);
    tcg_gen_xori_tl(cpu_cc_src, cpu_cc_src, CC_C);
}
1475
/*
 * CMOVcc: the condition code is the low four bits of the opcode byte
 * decode->b; the move itself is generated by gen_cmovcc1() on T0 and T1.
 */
static void gen_CMOVcc(DisasContext *s, X86DecodedInsn *decode)
{
    gen_cmovcc1(s, decode->b & 0xf, s->T0, s->T1);
}
1480
/*
 * CMPccXADD: compare the memory operand at A0 with the register named by
 * operand 1 and, if the condition holds, add T1 to memory.  This is
 * implemented as a retry loop around an atomic cmpxchg.
 *
 * The condition comes from the opcode byte decode->b: bits 1-3 select one
 * of the JCC_* conditions and bit 0 inverts it.  The old memory value is
 * written back to operand 1 once the loop exits, and the flags are set as
 * for a subtraction (CC_OP_SUBB + ot) of the compare value from the
 * memory value.
 */
static void gen_CMPccXADD(DisasContext *s, X86DecodedInsn *decode)
{
    TCGLabel *label_top = gen_new_label();
    TCGLabel *label_bottom = gen_new_label();
    TCGv oldv = tcg_temp_new();
    TCGv newv = tcg_temp_new();
    TCGv cmpv = tcg_temp_new();
    TCGCond cond;

    TCGv cmp_lhs, cmp_rhs;
    MemOp ot, ot_full;

    int jcc_op = (decode->b >> 1) & 7;
    /* TCG condition to apply to (cmp_lhs, cmp_rhs) for each JCC_* value.  */
    static const TCGCond cond_table[8] = {
        [JCC_O] = TCG_COND_LT,  /* test sign bit by comparing against 0 */
        [JCC_B] = TCG_COND_LTU,
        [JCC_Z] = TCG_COND_EQ,
        [JCC_BE] = TCG_COND_LEU,
        [JCC_S] = TCG_COND_LT,  /* test sign bit by comparing against 0 */
        [JCC_P] = TCG_COND_TSTEQ,  /* even parity - tests low bit of popcount */
        [JCC_L] = TCG_COND_LT,
        [JCC_LE] = TCG_COND_LE,
    };

    cond = cond_table[jcc_op];
    if (decode->b & 1) {
        cond = tcg_invert_cond(cond);
    }

    ot = decode->op[0].ot;
    ot_full = ot | MO_LE;
    if (jcc_op >= JCC_S) {
        /*
         * Sign-extend values before subtracting for S, P (zero/sign extension
         * does not matter there) L, LE and their inverses.
         */
        ot_full |= MO_SIGN;
    }

    /*
     * cmpv will be moved to cc_src *after* cpu_regs[] is written back, so use
     * tcg_gen_ext_tl instead of gen_ext_tl.
     */
    tcg_gen_ext_tl(cmpv, cpu_regs[decode->op[1].n], ot_full);

    /*
     * Cmpxchg loop starts here.
     * - s->T1: addition operand (from decoder)
     * - s->A0: dest address (from decoder)
     * - s->cc_srcT: memory operand (lhs for comparison)
     * - cmpv: rhs for comparison
     */
    gen_set_label(label_top);
    gen_op_ld_v(s, ot_full, s->cc_srcT, s->A0);
    tcg_gen_sub_tl(s->T0, s->cc_srcT, cmpv);

    /* Compute the comparison result by hand, to avoid clobbering cc_*.  */
    switch (jcc_op) {
    case JCC_O:
        /* (src1 ^ src2) & (src1 ^ dst). newv is only used here for a moment */
        tcg_gen_xor_tl(newv, s->cc_srcT, s->T0);
        tcg_gen_xor_tl(s->tmp0, s->cc_srcT, cmpv);
        tcg_gen_and_tl(s->tmp0, s->tmp0, newv);
        tcg_gen_sextract_tl(s->tmp0, s->tmp0, 0, 8 << ot);
        cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0);
        break;

    case JCC_P:
        /* Population count of the low byte of the difference.  */
        tcg_gen_ext8u_tl(s->tmp0, s->T0);
        tcg_gen_ctpop_tl(s->tmp0, s->tmp0);
        cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(1);
        break;

    case JCC_S:
        /* Sign of the operand-sized difference.  */
        tcg_gen_sextract_tl(s->tmp0, s->T0, 0, 8 << ot);
        cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0);
        break;

    default:
        /* Remaining conditions compare the two operands directly.  */
        cmp_lhs = s->cc_srcT, cmp_rhs = cmpv;
        break;
    }

    /* Compute new value: if condition does not hold, just store back s->cc_srcT */
    tcg_gen_add_tl(newv, s->cc_srcT, s->T1);
    tcg_gen_movcond_tl(cond, newv, cmp_lhs, cmp_rhs, newv, s->cc_srcT);
    tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, s->cc_srcT, newv, s->mem_index, ot_full);

    /* Exit unconditionally if cmpxchg succeeded.  */
    tcg_gen_brcond_tl(TCG_COND_EQ, oldv, s->cc_srcT, label_bottom);

    /* Try again if there was actually a store to make.  */
    tcg_gen_brcond_tl(cond, cmp_lhs, cmp_rhs, label_top);
    gen_set_label(label_bottom);

    /* Store old value to registers only after a successful store.  */
    gen_writeback(s, decode, 1, s->cc_srcT);

    decode->cc_dst = s->T0;
    decode->cc_src = cmpv;
    decode->cc_op = CC_OP_SUBB + ot;
}
1583
1584static void gen_CMPS(DisasContext *s, X86DecodedInsn *decode)
1585{
1586    MemOp ot = decode->op[2].ot;
1587    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
1588        gen_repz_nz(s, ot, gen_cmps);
1589    } else {
1590        gen_cmps(s, ot);
1591    }
1592}
1593
/*
 * CMPXCHG: compare the accumulator (cpu_regs[R_EAX], extended to ot) with
 * the destination; on equality the destination receives T1, otherwise the
 * accumulator receives the old destination value.
 *
 * With PREFIX_LOCK an atomic cmpxchg on the memory at A0 is used.
 * Otherwise the old value is taken from T0 and the store (memory
 * destination) or conditional register update (register destination) is
 * generated here, with operand 0's writeback disabled via X86_OP_SKIP.
 * Flags are set as for the subtraction cmpv - oldv (CC_OP_SUBB + ot).
 */
static void gen_CMPXCHG(DisasContext *s, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[2].ot;
    TCGv cmpv = tcg_temp_new();
    TCGv oldv = tcg_temp_new();
    TCGv newv = tcg_temp_new();
    TCGv dest;

    tcg_gen_ext_tl(cmpv, cpu_regs[R_EAX], ot);
    tcg_gen_ext_tl(newv, s->T1, ot);
    if (s->prefix & PREFIX_LOCK) {
        tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, cmpv, newv,
                                  s->mem_index, ot | MO_LE);
    } else {
        tcg_gen_ext_tl(oldv, s->T0, ot);
        if (decode->op[0].has_ea) {
            /*
             * Perform an unconditional store cycle like physical cpu;
             * must be before changing accumulator to ensure
             * idempotency if the store faults and the instruction
             * is restarted
             */
            tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv);
            gen_op_st_v(s, ot, newv, s->A0);
        } else {
            /*
             * Unlike the memory case, where "the destination operand receives
             * a write cycle without regard to the result of the comparison",
             * rm must not be touched altogether if the write fails, including
             * not zero-extending it on 64-bit processors.  So, precompute
             * the result of a successful writeback and perform the movcond
             * directly on cpu_regs.  In case rm is part of RAX, note that this
             * movcond and the one below have opposite conditions, so at most
             * one of them actually changes the register.
             */
            dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, newv, newv);
            tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, newv, dest);
        }
        decode->op[0].unit = X86_OP_SKIP;
    }

    /* Write RAX only if the cmpxchg fails.  */
    dest = gen_op_deposit_reg_v(s, ot, R_EAX, s->T0, oldv);
    tcg_gen_movcond_tl(TCG_COND_NE, dest, oldv, cmpv, s->T0, dest);

    /* Flags: cc_srcT = cmpv, cc_dst = cmpv - oldv, cc_src = oldv.  */
    tcg_gen_mov_tl(s->cc_srcT, cmpv);
    tcg_gen_sub_tl(cmpv, cmpv, oldv);
    decode->cc_dst = cmpv;
    decode->cc_src = oldv;
    decode->cc_op = CC_OP_SUBB + ot;
}
1644
1645static void gen_CPUID(DisasContext *s, X86DecodedInsn *decode)
1646{
1647    gen_update_cc_op(s);
1648    gen_update_eip_cur(s);
1649    gen_helper_cpuid(tcg_env);
1650}
1651
1652static void gen_CRC32(DisasContext *s, X86DecodedInsn *decode)
1653{
1654    MemOp ot = decode->op[2].ot;
1655
1656    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
1657    gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, tcg_constant_i32(8 << ot));
1658}
1659
1660static void gen_CVTPI2Px(DisasContext *s, X86DecodedInsn *decode)
1661{
1662    gen_helper_enter_mmx(tcg_env);
1663    if (s->prefix & PREFIX_DATA) {
1664        gen_helper_cvtpi2pd(tcg_env, OP_PTR0, OP_PTR2);
1665    } else {
1666        gen_helper_cvtpi2ps(tcg_env, OP_PTR0, OP_PTR2);
1667    }
1668}
1669
1670static void gen_CVTPx2PI(DisasContext *s, X86DecodedInsn *decode)
1671{
1672    gen_helper_enter_mmx(tcg_env);
1673    if (s->prefix & PREFIX_DATA) {
1674        gen_helper_cvtpd2pi(tcg_env, OP_PTR0, OP_PTR2);
1675    } else {
1676        gen_helper_cvtps2pi(tcg_env, OP_PTR0, OP_PTR2);
1677    }
1678}
1679
1680static void gen_CVTTPx2PI(DisasContext *s, X86DecodedInsn *decode)
1681{
1682    gen_helper_enter_mmx(tcg_env);
1683    if (s->prefix & PREFIX_DATA) {
1684        gen_helper_cvttpd2pi(tcg_env, OP_PTR0, OP_PTR2);
1685    } else {
1686        gen_helper_cvttps2pi(tcg_env, OP_PTR0, OP_PTR2);
1687    }
1688}
1689
1690static void gen_CWD(DisasContext *s, X86DecodedInsn *decode)
1691{
1692    int shift = 8 << decode->op[0].ot;
1693
1694    tcg_gen_sextract_tl(s->T0, s->T0, shift - 1, 1);
1695}
1696
1697static void gen_DAA(DisasContext *s, X86DecodedInsn *decode)
1698{
1699    gen_update_cc_op(s);
1700    gen_helper_daa(tcg_env);
1701    assume_cc_op(s, CC_OP_EFLAGS);
1702}
1703
1704static void gen_DAS(DisasContext *s, X86DecodedInsn *decode)
1705{
1706    gen_update_cc_op(s);
1707    gen_helper_das(tcg_env);
1708    assume_cc_op(s, CC_OP_EFLAGS);
1709}
1710
1711static void gen_DEC(DisasContext *s, X86DecodedInsn *decode)
1712{
1713    MemOp ot = decode->op[1].ot;
1714
1715    tcg_gen_movi_tl(s->T1, -1);
1716    if (s->prefix & PREFIX_LOCK) {
1717        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T1,
1718                                    s->mem_index, ot | MO_LE);
1719    } else {
1720        tcg_gen_add_tl(s->T0, s->T0, s->T1);
1721    }
1722    prepare_update_cc_incdec(decode, s, CC_OP_DECB + ot);
1723}
1724
1725static void gen_DIV(DisasContext *s, X86DecodedInsn *decode)
1726{
1727    MemOp ot = decode->op[1].ot;
1728
1729    switch(ot) {
1730    case MO_8:
1731        gen_helper_divb_AL(tcg_env, s->T0);
1732        break;
1733    case MO_16:
1734        gen_helper_divw_AX(tcg_env, s->T0);
1735        break;
1736    default:
1737    case MO_32:
1738        gen_helper_divl_EAX(tcg_env, s->T0);
1739        break;
1740#ifdef TARGET_X86_64
1741    case MO_64:
1742        gen_helper_divq_EAX(tcg_env, s->T0);
1743        break;
1744#endif
1745    }
1746}
1747
1748static void gen_EMMS(DisasContext *s, X86DecodedInsn *decode)
1749{
1750    gen_helper_emms(tcg_env);
1751}
1752
1753static void gen_ENTER(DisasContext *s, X86DecodedInsn *decode)
1754{
1755   gen_enter(s, decode->op[1].imm, decode->op[2].imm);
1756}
1757
1758static void gen_EXTRQ_i(DisasContext *s, X86DecodedInsn *decode)
1759{
1760    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
1761    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
1762
1763    gen_helper_extrq_i(tcg_env, OP_PTR0, index, length);
1764}
1765
1766static void gen_EXTRQ_r(DisasContext *s, X86DecodedInsn *decode)
1767{
1768    gen_helper_extrq_r(tcg_env, OP_PTR0, OP_PTR2);
1769}
1770
1771static void gen_FXRSTOR(DisasContext *s, X86DecodedInsn *decode)
1772{
1773    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
1774        gen_NM_exception(s);
1775    } else {
1776        gen_helper_fxrstor(tcg_env, s->A0);
1777    }
1778}
1779
1780static void gen_FXSAVE(DisasContext *s, X86DecodedInsn *decode)
1781{
1782    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
1783        gen_NM_exception(s);
1784    } else {
1785        gen_helper_fxsave(tcg_env, s->A0);
1786    }
1787}
1788
1789static void gen_HLT(DisasContext *s, X86DecodedInsn *decode)
1790{
1791#ifdef CONFIG_SYSTEM_ONLY
1792    gen_update_cc_op(s);
1793    gen_update_eip_next(s);
1794    gen_helper_hlt(tcg_env);
1795    s->base.is_jmp = DISAS_NORETURN;
1796#endif
1797}
1798
1799static void gen_IDIV(DisasContext *s, X86DecodedInsn *decode)
1800{
1801    MemOp ot = decode->op[1].ot;
1802
1803    switch(ot) {
1804    case MO_8:
1805        gen_helper_idivb_AL(tcg_env, s->T0);
1806        break;
1807    case MO_16:
1808        gen_helper_idivw_AX(tcg_env, s->T0);
1809        break;
1810    default:
1811    case MO_32:
1812        gen_helper_idivl_EAX(tcg_env, s->T0);
1813        break;
1814#ifdef TARGET_X86_64
1815    case MO_64:
1816        gen_helper_idivq_EAX(tcg_env, s->T0);
1817        break;
1818#endif
1819    }
1820}
1821
1822static void gen_IMUL3(DisasContext *s, X86DecodedInsn *decode)
1823{
1824    MemOp ot = decode->op[0].ot;
1825    TCGv cc_src_rhs;
1826
1827    switch (ot) {
1828    case MO_16:
1829        /* s->T0 already sign-extended */
1830        tcg_gen_ext16s_tl(s->T1, s->T1);
1831        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
1832        /* Compare the full result to the extension of the truncated result.  */
1833        tcg_gen_ext16s_tl(s->T1, s->T0);
1834        cc_src_rhs = s->T0;
1835        break;
1836
1837    case MO_32:
1838#ifdef TARGET_X86_64
1839        if (TCG_TARGET_REG_BITS == 64) {
1840            /*
1841             * This produces fewer TCG ops, and better code if flags are needed,
1842             * but it requires a 64-bit multiply even if they are not.  Use it
1843             * only if the target has 64-bits registers.
1844             *
1845             * s->T0 is already sign-extended.
1846             */
1847            tcg_gen_ext32s_tl(s->T1, s->T1);
1848            tcg_gen_mul_tl(s->T0, s->T0, s->T1);
1849            /* Compare the full result to the extension of the truncated result.  */
1850            tcg_gen_ext32s_tl(s->T1, s->T0);
1851            cc_src_rhs = s->T0;
1852        } else {
1853            /* Variant that only needs a 32-bit widening multiply.  */
1854            TCGv_i32 hi = tcg_temp_new_i32();
1855            TCGv_i32 lo = tcg_temp_new_i32();
1856            tcg_gen_trunc_tl_i32(lo, s->T0);
1857            tcg_gen_trunc_tl_i32(hi, s->T1);
1858            tcg_gen_muls2_i32(lo, hi, lo, hi);
1859            tcg_gen_extu_i32_tl(s->T0, lo);
1860
1861            cc_src_rhs = tcg_temp_new();
1862            tcg_gen_extu_i32_tl(cc_src_rhs, hi);
1863            /* Compare the high part to the sign bit of the truncated result */
1864            tcg_gen_sari_i32(lo, lo, 31);
1865            tcg_gen_extu_i32_tl(s->T1, lo);
1866        }
1867        break;
1868
1869    case MO_64:
1870#endif
1871        cc_src_rhs = tcg_temp_new();
1872        tcg_gen_muls2_tl(s->T0, cc_src_rhs, s->T0, s->T1);
1873        /* Compare the high part to the sign bit of the truncated result */
1874        tcg_gen_sari_tl(s->T1, s->T0, TARGET_LONG_BITS - 1);
1875        break;
1876
1877    default:
1878        g_assert_not_reached();
1879    }
1880
1881    tcg_gen_sub_tl(s->T1, s->T1, cc_src_rhs);
1882    prepare_update2_cc(decode, s, CC_OP_MULB + ot);
1883}
1884
1885static void gen_IMUL(DisasContext *s, X86DecodedInsn *decode)
1886{
1887    MemOp ot = decode->op[1].ot;
1888    TCGv cc_src_rhs;
1889
1890    switch (ot) {
1891    case MO_8:
1892        /* s->T0 already sign-extended */
1893        tcg_gen_ext8s_tl(s->T1, s->T1);
1894        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
1895        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
1896        /* Compare the full result to the extension of the truncated result.  */
1897        tcg_gen_ext8s_tl(s->T1, s->T0);
1898        cc_src_rhs = s->T0;
1899        break;
1900
1901    case MO_16:
1902        /* s->T0 already sign-extended */
1903        tcg_gen_ext16s_tl(s->T1, s->T1);
1904        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
1905        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
1906        tcg_gen_shri_tl(s->T1, s->T0, 16);
1907        gen_op_mov_reg_v(s, MO_16, R_EDX, s->T1);
1908        /* Compare the full result to the extension of the truncated result.  */
1909        tcg_gen_ext16s_tl(s->T1, s->T0);
1910        cc_src_rhs = s->T0;
1911        break;
1912
1913    case MO_32:
1914#ifdef TARGET_X86_64
1915        /* s->T0 already sign-extended */
1916        tcg_gen_ext32s_tl(s->T1, s->T1);
1917        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
1918        tcg_gen_ext32u_tl(cpu_regs[R_EAX], s->T0);
1919        tcg_gen_shri_tl(cpu_regs[R_EDX], s->T0, 32);
1920        /* Compare the full result to the extension of the truncated result.  */
1921        tcg_gen_ext32s_tl(s->T1, s->T0);
1922        cc_src_rhs = s->T0;
1923        break;
1924
1925    case MO_64:
1926#endif
1927        tcg_gen_muls2_tl(s->T0, cpu_regs[R_EDX], s->T0, s->T1);
1928        tcg_gen_mov_tl(cpu_regs[R_EAX], s->T0);
1929
1930        /* Compare the high part to the sign bit of the truncated result */
1931        tcg_gen_negsetcondi_tl(TCG_COND_LT, s->T1, s->T0, 0);
1932        cc_src_rhs = cpu_regs[R_EDX];
1933        break;
1934
1935    default:
1936        g_assert_not_reached();
1937    }
1938
1939    tcg_gen_sub_tl(s->T1, s->T1, cc_src_rhs);
1940    prepare_update2_cc(decode, s, CC_OP_MULB + ot);
1941}
1942
1943static void gen_IN(DisasContext *s, X86DecodedInsn *decode)
1944{
1945    MemOp ot = decode->op[0].ot;
1946    TCGv_i32 port = tcg_temp_new_i32();
1947
1948    tcg_gen_trunc_tl_i32(port, s->T0);
1949    tcg_gen_ext16u_i32(port, port);
1950    if (!gen_check_io(s, ot, port, SVM_IOIO_TYPE_MASK)) {
1951        return;
1952    }
1953    translator_io_start(&s->base);
1954    gen_helper_in_func(ot, s->T0, port);
1955    gen_writeback(s, decode, 0, s->T0);
1956    gen_bpt_io(s, port, ot);
1957}
1958
1959static void gen_INC(DisasContext *s, X86DecodedInsn *decode)
1960{
1961    MemOp ot = decode->op[1].ot;
1962
1963    tcg_gen_movi_tl(s->T1, 1);
1964    if (s->prefix & PREFIX_LOCK) {
1965        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T1,
1966                                    s->mem_index, ot | MO_LE);
1967    } else {
1968        tcg_gen_add_tl(s->T0, s->T0, s->T1);
1969    }
1970    prepare_update_cc_incdec(decode, s, CC_OP_INCB + ot);
1971}
1972
1973static void gen_INS(DisasContext *s, X86DecodedInsn *decode)
1974{
1975    MemOp ot = decode->op[1].ot;
1976    TCGv_i32 port = tcg_temp_new_i32();
1977
1978    tcg_gen_trunc_tl_i32(port, s->T1);
1979    tcg_gen_ext16u_i32(port, port);
1980    if (!gen_check_io(s, ot, port,
1981                      SVM_IOIO_TYPE_MASK | SVM_IOIO_STR_MASK)) {
1982        return;
1983    }
1984
1985    translator_io_start(&s->base);
1986    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
1987        gen_repz(s, ot, gen_ins);
1988    } else {
1989        gen_ins(s, ot);
1990    }
1991}
1992
1993static void gen_INSERTQ_i(DisasContext *s, X86DecodedInsn *decode)
1994{
1995    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
1996    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
1997
1998    gen_helper_insertq_i(tcg_env, OP_PTR0, OP_PTR1, index, length);
1999}
2000
2001static void gen_INSERTQ_r(DisasContext *s, X86DecodedInsn *decode)
2002{
2003    gen_helper_insertq_r(tcg_env, OP_PTR0, OP_PTR2);
2004}
2005
2006static void gen_INT(DisasContext *s, X86DecodedInsn *decode)
2007{
2008    gen_interrupt(s, decode->immediate);
2009}
2010
2011static void gen_INT1(DisasContext *s, X86DecodedInsn *decode)
2012{
2013    gen_update_cc_op(s);
2014    gen_update_eip_next(s);
2015    gen_helper_icebp(tcg_env);
2016    s->base.is_jmp = DISAS_NORETURN;
2017}
2018
2019static void gen_INT3(DisasContext *s, X86DecodedInsn *decode)
2020{
2021    gen_interrupt(s, EXCP03_INT3);
2022}
2023
2024static void gen_INTO(DisasContext *s, X86DecodedInsn *decode)
2025{
2026    gen_update_cc_op(s);
2027    gen_update_eip_cur(s);
2028    gen_helper_into(tcg_env, cur_insn_len_i32(s));
2029}
2030
2031static void gen_IRET(DisasContext *s, X86DecodedInsn *decode)
2032{
2033    if (!PE(s) || VM86(s)) {
2034        gen_helper_iret_real(tcg_env, tcg_constant_i32(s->dflag - 1));
2035    } else {
2036        gen_helper_iret_protected(tcg_env, tcg_constant_i32(s->dflag - 1),
2037                                  eip_next_i32(s));
2038    }
2039    assume_cc_op(s, CC_OP_EFLAGS);
2040    s->base.is_jmp = DISAS_EOB_ONLY;
2041}
2042
2043static void gen_Jcc(DisasContext *s, X86DecodedInsn *decode)
2044{
2045    gen_bnd_jmp(s);
2046    gen_jcc(s, decode->b & 0xf, decode->immediate);
2047}
2048
2049static void gen_JCXZ(DisasContext *s, X86DecodedInsn *decode)
2050{
2051    TCGLabel *taken = gen_new_label();
2052
2053    gen_update_cc_op(s);
2054    gen_op_jz_ecx(s, taken);
2055    gen_conditional_jump_labels(s, decode->immediate, NULL, taken);
2056}
2057
2058static void gen_JMP(DisasContext *s, X86DecodedInsn *decode)
2059{
2060    gen_update_cc_op(s);
2061    gen_jmp_rel(s, s->dflag, decode->immediate, 0);
2062}
2063
2064static void gen_JMP_m(DisasContext *s, X86DecodedInsn *decode)
2065{
2066    gen_op_jmp_v(s, s->T0);
2067    gen_bnd_jmp(s);
2068    s->base.is_jmp = DISAS_JUMP;
2069}
2070
2071static void gen_JMPF(DisasContext *s, X86DecodedInsn *decode)
2072{
2073    gen_far_jmp(s);
2074}
2075
2076static void gen_JMPF_m(DisasContext *s, X86DecodedInsn *decode)
2077{
2078    MemOp ot = decode->op[1].ot;
2079
2080    gen_op_ld_v(s, ot, s->T0, s->A0);
2081    gen_add_A0_im(s, 1 << ot);
2082    gen_op_ld_v(s, MO_16, s->T1, s->A0);
2083    gen_far_jmp(s);
2084}
2085
2086static void gen_LAHF(DisasContext *s, X86DecodedInsn *decode)
2087{
2088    if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
2089        return gen_illegal_opcode(s);
2090    }
2091    gen_compute_eflags(s);
2092    /* Note: gen_compute_eflags() only gives the condition codes */
2093    tcg_gen_ori_tl(s->T0, cpu_cc_src, 0x02);
2094    tcg_gen_deposit_tl(cpu_regs[R_EAX], cpu_regs[R_EAX], s->T0, 8, 8);
2095}
2096
2097static void gen_LAR(DisasContext *s, X86DecodedInsn *decode)
2098{
2099    MemOp ot = decode->op[0].ot;
2100    TCGv result = tcg_temp_new();
2101    TCGv dest;
2102
2103    gen_compute_eflags(s);
2104    gen_update_cc_op(s);
2105    gen_helper_lar(result, tcg_env, s->T0);
2106
2107    /* Perform writeback here to skip it if ZF=0.  */
2108    decode->op[0].unit = X86_OP_SKIP;
2109    dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, result, result);
2110    tcg_gen_movcond_tl(TCG_COND_TSTNE, dest, cpu_cc_src, tcg_constant_tl(CC_Z),
2111                       result, dest);
2112}
2113
2114static void gen_LDMXCSR(DisasContext *s, X86DecodedInsn *decode)
2115{
2116    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
2117    gen_helper_ldmxcsr(tcg_env, s->tmp2_i32);
2118}
2119
2120static void gen_lxx_seg(DisasContext *s, X86DecodedInsn *decode, int seg)
2121{
2122    MemOp ot = decode->op[0].ot;
2123
2124    /* Offset already in s->T0.  */
2125    gen_add_A0_im(s, 1 << ot);
2126    gen_op_ld_v(s, MO_16, s->T1, s->A0);
2127
2128    /* load the segment here to handle exceptions properly */
2129    gen_movl_seg(s, seg, s->T1);
2130}
2131
2132static void gen_LDS(DisasContext *s, X86DecodedInsn *decode)
2133{
2134    gen_lxx_seg(s, decode, R_DS);
2135}
2136
2137static void gen_LEA(DisasContext *s, X86DecodedInsn *decode)
2138{
2139    TCGv ea = gen_lea_modrm_1(s, decode->mem, false);
2140    gen_lea_v_seg_dest(s, s->aflag, s->T0, ea, -1, -1);
2141}
2142
2143static void gen_LEAVE(DisasContext *s, X86DecodedInsn *decode)
2144{
2145    gen_leave(s);
2146}
2147
2148static void gen_LES(DisasContext *s, X86DecodedInsn *decode)
2149{
2150    gen_lxx_seg(s, decode, R_ES);
2151}
2152
2153static void gen_LFENCE(DisasContext *s, X86DecodedInsn *decode)
2154{
2155    tcg_gen_mb(TCG_MO_LD_LD | TCG_BAR_SC);
2156}
2157
2158static void gen_LFS(DisasContext *s, X86DecodedInsn *decode)
2159{
2160    gen_lxx_seg(s, decode, R_FS);
2161}
2162
2163static void gen_LGS(DisasContext *s, X86DecodedInsn *decode)
2164{
2165    gen_lxx_seg(s, decode, R_GS);
2166}
2167
2168static void gen_LODS(DisasContext *s, X86DecodedInsn *decode)
2169{
2170    MemOp ot = decode->op[1].ot;
2171    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
2172        gen_repz(s, ot, gen_lods);
2173    } else {
2174        gen_lods(s, ot);
2175    }
2176}
2177
2178static void gen_LOOP(DisasContext *s, X86DecodedInsn *decode)
2179{
2180    TCGLabel *taken = gen_new_label();
2181
2182    gen_update_cc_op(s);
2183    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
2184    gen_op_jnz_ecx(s, taken);
2185    gen_conditional_jump_labels(s, decode->immediate, NULL, taken);
2186}
2187
2188static void gen_LOOPE(DisasContext *s, X86DecodedInsn *decode)
2189{
2190    TCGLabel *taken = gen_new_label();
2191    TCGLabel *not_taken = gen_new_label();
2192
2193    gen_update_cc_op(s);
2194    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
2195    gen_op_jz_ecx(s, not_taken);
2196    gen_jcc1(s, (JCC_Z << 1), taken); /* jz taken */
2197    gen_conditional_jump_labels(s, decode->immediate, not_taken, taken);
2198}
2199
2200static void gen_LOOPNE(DisasContext *s, X86DecodedInsn *decode)
2201{
2202    TCGLabel *taken = gen_new_label();
2203    TCGLabel *not_taken = gen_new_label();
2204
2205    gen_update_cc_op(s);
2206    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
2207    gen_op_jz_ecx(s, not_taken);
2208    gen_jcc1(s, (JCC_Z << 1) | 1, taken); /* jnz taken */
2209    gen_conditional_jump_labels(s, decode->immediate, not_taken, taken);
2210}
2211
2212static void gen_LSL(DisasContext *s, X86DecodedInsn *decode)
2213{
2214    MemOp ot = decode->op[0].ot;
2215    TCGv result = tcg_temp_new();
2216    TCGv dest;
2217
2218    gen_compute_eflags(s);
2219    gen_update_cc_op(s);
2220    gen_helper_lsl(result, tcg_env, s->T0);
2221
2222    /* Perform writeback here to skip it if ZF=0.  */
2223    decode->op[0].unit = X86_OP_SKIP;
2224    dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, result, result);
2225    tcg_gen_movcond_tl(TCG_COND_TSTNE, dest, cpu_cc_src, tcg_constant_tl(CC_Z),
2226                       result, dest);
2227}
2228
2229static void gen_LSS(DisasContext *s, X86DecodedInsn *decode)
2230{
2231    gen_lxx_seg(s, decode, R_SS);
2232}
2233
2234static void gen_LZCNT(DisasContext *s, X86DecodedInsn *decode)
2235{
2236    MemOp ot = decode->op[0].ot;
2237
2238    /* C bit (cc_src) is defined related to the input.  */
2239    decode->cc_src = tcg_temp_new();
2240    decode->cc_dst = s->T0;
2241    decode->cc_op = CC_OP_BMILGB + ot;
2242    tcg_gen_mov_tl(decode->cc_src, s->T0);
2243
2244    /*
2245     * Reduce the target_ulong result by the number of zeros that
2246     * we expect to find at the top.
2247     */
2248    tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
2249    tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - (8 << ot));
2250}
2251
2252static void gen_MFENCE(DisasContext *s, X86DecodedInsn *decode)
2253{
2254    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
2255}
2256
2257static void gen_MOV(DisasContext *s, X86DecodedInsn *decode)
2258{
2259    /* nothing to do! */
2260}
2261#define gen_NOP gen_MOV
2262
2263static void gen_MASKMOV(DisasContext *s, X86DecodedInsn *decode)
2264{
2265    gen_lea_v_seg(s, cpu_regs[R_EDI], R_DS, s->override);
2266
2267    if (s->prefix & PREFIX_DATA) {
2268        gen_helper_maskmov_xmm(tcg_env, OP_PTR1, OP_PTR2, s->A0);
2269    } else {
2270        gen_helper_maskmov_mmx(tcg_env, OP_PTR1, OP_PTR2, s->A0);
2271    }
2272}
2273
2274static void gen_MOVBE(DisasContext *s, X86DecodedInsn *decode)
2275{
2276    MemOp ot = decode->op[0].ot;
2277
2278    /* M operand type does not load/store */
2279    if (decode->e.op0 == X86_TYPE_M) {
2280        tcg_gen_qemu_st_tl(s->T0, s->A0, s->mem_index, ot | MO_BE);
2281    } else {
2282        tcg_gen_qemu_ld_tl(s->T0, s->A0, s->mem_index, ot | MO_BE);
2283    }
2284}
2285
2286static void gen_MOVD_from(DisasContext *s, X86DecodedInsn *decode)
2287{
2288    MemOp ot = decode->op[2].ot;
2289
2290    switch (ot) {
2291    case MO_32:
2292#ifdef TARGET_X86_64
2293        tcg_gen_ld32u_tl(s->T0, tcg_env, decode->op[2].offset);
2294        break;
2295    case MO_64:
2296#endif
2297        tcg_gen_ld_tl(s->T0, tcg_env, decode->op[2].offset);
2298        break;
2299    default:
2300        abort();
2301    }
2302}
2303
2304static void gen_MOVD_to(DisasContext *s, X86DecodedInsn *decode)
2305{
2306    MemOp ot = decode->op[2].ot;
2307    int vec_len = vector_len(s, decode);
2308    int lo_ofs = vector_elem_offset(&decode->op[0], ot, 0);
2309
2310    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2311
2312    switch (ot) {
2313    case MO_32:
2314#ifdef TARGET_X86_64
2315        tcg_gen_st32_tl(s->T1, tcg_env, lo_ofs);
2316        break;
2317    case MO_64:
2318#endif
2319        tcg_gen_st_tl(s->T1, tcg_env, lo_ofs);
2320        break;
2321    default:
2322        g_assert_not_reached();
2323    }
2324}
2325
2326static void gen_MOVDQ(DisasContext *s, X86DecodedInsn *decode)
2327{
2328    gen_store_sse(s, decode, decode->op[2].offset);
2329}
2330
2331static void gen_MOVMSK(DisasContext *s, X86DecodedInsn *decode)
2332{
2333    typeof(gen_helper_movmskps_ymm) *ps, *pd, *fn;
2334    ps = s->vex_l ? gen_helper_movmskps_ymm : gen_helper_movmskps_xmm;
2335    pd = s->vex_l ? gen_helper_movmskpd_ymm : gen_helper_movmskpd_xmm;
2336    fn = s->prefix & PREFIX_DATA ? pd : ps;
2337    fn(s->tmp2_i32, tcg_env, OP_PTR2);
2338    tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
2339}
2340
2341static void gen_MOVQ(DisasContext *s, X86DecodedInsn *decode)
2342{
2343    int vec_len = vector_len(s, decode);
2344    int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);
2345
2346    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);
2347    if (decode->op[0].has_ea) {
2348        tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
2349    } else {
2350        /*
2351         * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_64) would
2352         * seem to work, but it does not on big-endian platforms; the cleared parts
2353         * are always at higher addresses, but cross-endian emulation inverts the
2354         * byte order so that the cleared parts need to be at *lower* addresses.
2355         * Because oprsz is 8, we see this here even for SSE; but more in general,
2356         * it disqualifies using oprsz < maxsz to emulate VEX128.
2357         */
2358        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2359        tcg_gen_st_i64(s->tmp1_i64, tcg_env, lo_ofs);
2360    }
2361}
2362
2363static void gen_MOVq_dq(DisasContext *s, X86DecodedInsn *decode)
2364{
2365    gen_helper_enter_mmx(tcg_env);
2366    /* Otherwise the same as any other movq.  */
2367    return gen_MOVQ(s, decode);
2368}
2369
2370static void gen_MOVS(DisasContext *s, X86DecodedInsn *decode)
2371{
2372    MemOp ot = decode->op[2].ot;
2373    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
2374        gen_repz(s, ot, gen_movs);
2375    } else {
2376        gen_movs(s, ot);
2377    }
2378}
2379
2380static void gen_MUL(DisasContext *s, X86DecodedInsn *decode)
2381{
2382    MemOp ot = decode->op[1].ot;
2383
2384    switch (ot) {
2385    case MO_8:
2386        /* s->T0 already zero-extended */
2387        tcg_gen_ext8u_tl(s->T1, s->T1);
2388        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2389        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
2390        tcg_gen_andi_tl(s->T1, s->T0, 0xff00);
2391        decode->cc_dst = s->T0;
2392        decode->cc_src = s->T1;
2393        break;
2394
2395    case MO_16:
2396        /* s->T0 already zero-extended */
2397        tcg_gen_ext16u_tl(s->T1, s->T1);
2398        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2399        gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0);
2400        tcg_gen_shri_tl(s->T1, s->T0, 16);
2401        gen_op_mov_reg_v(s, MO_16, R_EDX, s->T1);
2402        decode->cc_dst = s->T0;
2403        decode->cc_src = s->T1;
2404        break;
2405
2406    case MO_32:
2407#ifdef TARGET_X86_64
2408        /* s->T0 already zero-extended */
2409        tcg_gen_ext32u_tl(s->T1, s->T1);
2410        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
2411        tcg_gen_ext32u_tl(cpu_regs[R_EAX], s->T0);
2412        tcg_gen_shri_tl(cpu_regs[R_EDX], s->T0, 32);
2413        decode->cc_dst = cpu_regs[R_EAX];
2414        decode->cc_src = cpu_regs[R_EDX];
2415        break;
2416
2417    case MO_64:
2418#endif
2419        tcg_gen_mulu2_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], s->T0, s->T1);
2420        decode->cc_dst = cpu_regs[R_EAX];
2421        decode->cc_src = cpu_regs[R_EDX];
2422        break;
2423
2424    default:
2425        g_assert_not_reached();
2426    }
2427
2428    decode->cc_op = CC_OP_MULB + ot;
2429}
2430
2431static void gen_MULX(DisasContext *s, X86DecodedInsn *decode)
2432{
2433    MemOp ot = decode->op[0].ot;
2434
2435    /* low part of result in VEX.vvvv, high in MODRM */
2436    switch (ot) {
2437    case MO_32:
2438#ifdef TARGET_X86_64
2439        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
2440        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
2441        tcg_gen_mulu2_i32(s->tmp2_i32, s->tmp3_i32,
2442                          s->tmp2_i32, s->tmp3_i32);
2443        tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], s->tmp2_i32);
2444        tcg_gen_extu_i32_tl(s->T0, s->tmp3_i32);
2445        break;
2446
2447    case MO_64:
2448#endif
2449        tcg_gen_mulu2_tl(cpu_regs[s->vex_v], s->T0, s->T0, s->T1);
2450        break;
2451
2452    default:
2453        g_assert_not_reached();
2454    }
2455}
2456
2457static void gen_NEG(DisasContext *s, X86DecodedInsn *decode)
2458{
2459    MemOp ot = decode->op[0].ot;
2460    TCGv oldv = tcg_temp_new();
2461
2462    if (s->prefix & PREFIX_LOCK) {
2463        TCGv newv = tcg_temp_new();
2464        TCGv cmpv = tcg_temp_new();
2465        TCGLabel *label1 = gen_new_label();
2466
2467        gen_set_label(label1);
2468        gen_op_ld_v(s, ot, oldv, s->A0);
2469        tcg_gen_neg_tl(newv, oldv);
2470        tcg_gen_atomic_cmpxchg_tl(cmpv, s->A0, oldv, newv,
2471                                  s->mem_index, ot | MO_LE);
2472        tcg_gen_brcond_tl(TCG_COND_NE, oldv, cmpv, label1);
2473    } else {
2474        tcg_gen_mov_tl(oldv, s->T0);
2475    }
2476    tcg_gen_neg_tl(s->T0, oldv);
2477
2478    decode->cc_dst = s->T0;
2479    decode->cc_src = oldv;
2480    tcg_gen_movi_tl(s->cc_srcT, 0);
2481    decode->cc_op = CC_OP_SUBB + ot;
2482}
2483
2484static void gen_NOT(DisasContext *s, X86DecodedInsn *decode)
2485{
2486    MemOp ot = decode->op[0].ot;
2487
2488    if (s->prefix & PREFIX_LOCK) {
2489        tcg_gen_movi_tl(s->T0, ~0);
2490        tcg_gen_atomic_xor_fetch_tl(s->T0, s->A0, s->T0,
2491                                    s->mem_index, ot | MO_LE);
2492    } else {
2493        tcg_gen_not_tl(s->T0, s->T0);
2494    }
2495}
2496
2497static void gen_OR(DisasContext *s, X86DecodedInsn *decode)
2498{
2499    MemOp ot = decode->op[1].ot;
2500
2501    if (s->prefix & PREFIX_LOCK) {
2502        tcg_gen_atomic_or_fetch_tl(s->T0, s->A0, s->T1,
2503                                   s->mem_index, ot | MO_LE);
2504    } else {
2505        tcg_gen_or_tl(s->T0, s->T0, s->T1);
2506    }
2507    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
2508}
2509
2510static void gen_OUT(DisasContext *s, X86DecodedInsn *decode)
2511{
2512    MemOp ot = decode->op[1].ot;
2513    TCGv_i32 port = tcg_temp_new_i32();
2514    TCGv_i32 value = tcg_temp_new_i32();
2515
2516    tcg_gen_trunc_tl_i32(port, s->T1);
2517    tcg_gen_ext16u_i32(port, port);
2518    if (!gen_check_io(s, ot, port, 0)) {
2519        return;
2520    }
2521    tcg_gen_trunc_tl_i32(value, s->T0);
2522    translator_io_start(&s->base);
2523    gen_helper_out_func(ot, port, value);
2524    gen_bpt_io(s, port, ot);
2525}
2526
2527static void gen_OUTS(DisasContext *s, X86DecodedInsn *decode)
2528{
2529    MemOp ot = decode->op[1].ot;
2530    TCGv_i32 port = tcg_temp_new_i32();
2531
2532    tcg_gen_trunc_tl_i32(port, s->T1);
2533    tcg_gen_ext16u_i32(port, port);
2534    if (!gen_check_io(s, ot, port, SVM_IOIO_STR_MASK)) {
2535        return;
2536    }
2537
2538    translator_io_start(&s->base);
2539    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
2540        gen_repz(s, ot, gen_outs);
2541    } else {
2542        gen_outs(s, ot);
2543    }
2544}
2545
2546static void gen_PALIGNR(DisasContext *s, X86DecodedInsn *decode)
2547{
2548    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2549    if (!(s->prefix & PREFIX_DATA)) {
2550        gen_helper_palignr_mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2551    } else if (!s->vex_l) {
2552        gen_helper_palignr_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2553    } else {
2554        gen_helper_palignr_ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2555    }
2556}
2557
2558static void gen_PANDN(DisasContext *s, X86DecodedInsn *decode)
2559{
2560    int vec_len = vector_len(s, decode);
2561
2562    /* Careful, operand order is reversed!  */
2563    tcg_gen_gvec_andc(MO_64,
2564                      decode->op[0].offset, decode->op[2].offset,
2565                      decode->op[1].offset, vec_len, vec_len);
2566}
2567
2568static void gen_PAUSE(DisasContext *s, X86DecodedInsn *decode)
2569{
2570    gen_update_cc_op(s);
2571    gen_update_eip_next(s);
2572    gen_helper_pause(tcg_env);
2573    s->base.is_jmp = DISAS_NORETURN;
2574}
2575
2576static void gen_PCMPESTRI(DisasContext *s, X86DecodedInsn *decode)
2577{
2578    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2579    gen_helper_pcmpestri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2580    assume_cc_op(s, CC_OP_EFLAGS);
2581}
2582
2583static void gen_PCMPESTRM(DisasContext *s, X86DecodedInsn *decode)
2584{
2585    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2586    gen_helper_pcmpestrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2587    assume_cc_op(s, CC_OP_EFLAGS);
2588    if ((s->prefix & PREFIX_VEX) && !s->vex_l) {
2589        tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
2590                             16, 16, 0);
2591    }
2592}
2593
2594static void gen_PCMPISTRI(DisasContext *s, X86DecodedInsn *decode)
2595{
2596    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2597    gen_helper_pcmpistri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2598    assume_cc_op(s, CC_OP_EFLAGS);
2599}
2600
2601static void gen_PCMPISTRM(DisasContext *s, X86DecodedInsn *decode)
2602{
2603    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2604    gen_helper_pcmpistrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
2605    assume_cc_op(s, CC_OP_EFLAGS);
2606    if ((s->prefix & PREFIX_VEX) && !s->vex_l) {
2607        tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
2608                             16, 16, 0);
2609    }
2610}
2611
2612static void gen_PDEP(DisasContext *s, X86DecodedInsn *decode)
2613{
2614    gen_helper_pdep(s->T0, s->T0, s->T1);
2615}
2616
2617static void gen_PEXT(DisasContext *s, X86DecodedInsn *decode)
2618{
2619    gen_helper_pext(s->T0, s->T0, s->T1);
2620}
2621
2622static inline void gen_pextr(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
2623{
2624    int vec_len = vector_len(s, decode);
2625    int mask = (vec_len >> ot) - 1;
2626    int val = decode->immediate & mask;
2627
2628    switch (ot) {
2629    case MO_8:
2630        tcg_gen_ld8u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2631        break;
2632    case MO_16:
2633        tcg_gen_ld16u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2634        break;
2635    case MO_32:
2636#ifdef TARGET_X86_64
2637        tcg_gen_ld32u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2638        break;
2639    case MO_64:
2640#endif
2641        tcg_gen_ld_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
2642        break;
2643    default:
2644        abort();
2645    }
2646}
2647
2648static void gen_PEXTRB(DisasContext *s, X86DecodedInsn *decode)
2649{
2650    gen_pextr(s, decode, MO_8);
2651}
2652
2653static void gen_PEXTRW(DisasContext *s, X86DecodedInsn *decode)
2654{
2655    gen_pextr(s, decode, MO_16);
2656}
2657
2658static void gen_PEXTR(DisasContext *s, X86DecodedInsn *decode)
2659{
2660    MemOp ot = decode->op[0].ot;
2661    gen_pextr(s, decode, ot);
2662}
2663
2664static inline void gen_pinsr(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
2665{
2666    int vec_len = vector_len(s, decode);
2667    int mask = (vec_len >> ot) - 1;
2668    int val = decode->immediate & mask;
2669
2670    if (decode->op[1].offset != decode->op[0].offset) {
2671        assert(vec_len == 16);
2672        gen_store_sse(s, decode, decode->op[1].offset);
2673    }
2674
2675    switch (ot) {
2676    case MO_8:
2677        tcg_gen_st8_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2678        break;
2679    case MO_16:
2680        tcg_gen_st16_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2681        break;
2682    case MO_32:
2683#ifdef TARGET_X86_64
2684        tcg_gen_st32_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2685        break;
2686    case MO_64:
2687#endif
2688        tcg_gen_st_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
2689        break;
2690    default:
2691        abort();
2692    }
2693}
2694
2695static void gen_PINSRB(DisasContext *s, X86DecodedInsn *decode)
2696{
2697    gen_pinsr(s, decode, MO_8);
2698}
2699
2700static void gen_PINSRW(DisasContext *s, X86DecodedInsn *decode)
2701{
2702    gen_pinsr(s, decode, MO_16);
2703}
2704
2705static void gen_PINSR(DisasContext *s, X86DecodedInsn *decode)
2706{
2707    gen_pinsr(s, decode, decode->op[2].ot);
2708}
2709
2710static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s)
2711{
2712    TCGv_i64 t = tcg_temp_new_i64();
2713
2714    tcg_gen_andi_i64(d, s, 0x8080808080808080ull);
2715
2716    /*
2717     * After each shift+or pair:
2718     * 0:  a.......b.......c.......d.......e.......f.......g.......h.......
2719     * 7:  ab......bc......cd......de......ef......fg......gh......h.......
2720     * 14: abcd....bcde....cdef....defg....efgh....fgh.....gh......h.......
2721     * 28: abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h.......
2722     * The result is left in the high bits of the word.
2723     */
2724    tcg_gen_shli_i64(t, d, 7);
2725    tcg_gen_or_i64(d, d, t);
2726    tcg_gen_shli_i64(t, d, 14);
2727    tcg_gen_or_i64(d, d, t);
2728    tcg_gen_shli_i64(t, d, 28);
2729    tcg_gen_or_i64(d, d, t);
2730}
2731
2732static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s)
2733{
2734    TCGv_vec t = tcg_temp_new_vec_matching(d);
2735    TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80);
2736
2737    /* See above */
2738    tcg_gen_and_vec(vece, d, s, m);
2739    tcg_gen_shli_vec(vece, t, d, 7);
2740    tcg_gen_or_vec(vece, d, d, t);
2741    tcg_gen_shli_vec(vece, t, d, 14);
2742    tcg_gen_or_vec(vece, d, d, t);
2743    tcg_gen_shli_vec(vece, t, d, 28);
2744    tcg_gen_or_vec(vece, d, d, t);
2745}
2746
2747static void gen_PMOVMSKB(DisasContext *s, X86DecodedInsn *decode)
2748{
2749    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2750    static const GVecGen2 g = {
2751        .fni8 = gen_pmovmskb_i64,
2752        .fniv = gen_pmovmskb_vec,
2753        .opt_opc = vecop_list,
2754        .vece = MO_64,
2755        .prefer_i64 = TCG_TARGET_REG_BITS == 64
2756    };
2757    MemOp ot = decode->op[2].ot;
2758    int vec_len = vector_len(s, decode);
2759    TCGv t = tcg_temp_new();
2760
2761    tcg_gen_gvec_2(offsetof(CPUX86State, xmm_t0) + xmm_offset(ot), decode->op[2].offset,
2762                   vec_len, vec_len, &g);
2763    tcg_gen_ld8u_tl(s->T0, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
2764    while (vec_len > 8) {
2765        vec_len -= 8;
2766        if (TCG_TARGET_HAS_extract2_tl) {
2767            /*
2768             * Load the next byte of the result into the high byte of T.
2769             * TCG does a similar expansion of deposit to shl+extract2; by
2770             * loading the whole word, the shift left is avoided.
2771             */
2772#ifdef TARGET_X86_64
2773            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_Q((vec_len - 1) / 8)));
2774#else
2775            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L((vec_len - 1) / 4)));
2776#endif
2777
2778            tcg_gen_extract2_tl(s->T0, t, s->T0, TARGET_LONG_BITS - 8);
2779        } else {
2780            /*
2781             * The _previous_ value is deposited into bits 8 and higher of t.  Because
2782             * those bits are known to be zero after ld8u, this becomes a shift+or
2783             * if deposit is not available.
2784             */
2785            tcg_gen_ld8u_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
2786            tcg_gen_deposit_tl(s->T0, t, s->T0, 8, TARGET_LONG_BITS - 8);
2787        }
2788    }
2789}
2790
2791static void gen_POP(DisasContext *s, X86DecodedInsn *decode)
2792{
2793    X86DecodedOp *op = &decode->op[0];
2794    MemOp ot = gen_pop_T0(s);
2795
2796    assert(ot >= op->ot);
2797    if (op->has_ea || op->unit == X86_OP_SEG) {
2798        /* NOTE: order is important for MMU exceptions */
2799        gen_writeback(s, decode, 0, s->T0);
2800    }
2801
2802    /* NOTE: writing back registers after update is important for pop %sp */
2803    gen_pop_update(s, ot);
2804}
2805
2806static void gen_POPA(DisasContext *s, X86DecodedInsn *decode)
2807{
2808    gen_popa(s);
2809}
2810
2811static void gen_POPCNT(DisasContext *s, X86DecodedInsn *decode)
2812{
2813    decode->cc_dst = tcg_temp_new();
2814    decode->cc_op = CC_OP_POPCNT;
2815
2816    tcg_gen_mov_tl(decode->cc_dst, s->T0);
2817    tcg_gen_ctpop_tl(s->T0, s->T0);
2818}
2819
2820static void gen_POPF(DisasContext *s, X86DecodedInsn *decode)
2821{
2822    MemOp ot;
2823    int mask = TF_MASK | AC_MASK | ID_MASK | NT_MASK;
2824
2825    if (CPL(s) == 0) {
2826        mask |= IF_MASK | IOPL_MASK;
2827    } else if (CPL(s) <= IOPL(s)) {
2828        mask |= IF_MASK;
2829    }
2830    if (s->dflag == MO_16) {
2831        mask &= 0xffff;
2832    }
2833
2834    ot = gen_pop_T0(s);
2835    gen_helper_write_eflags(tcg_env, s->T0, tcg_constant_i32(mask));
2836    gen_pop_update(s, ot);
2837    set_cc_op(s, CC_OP_EFLAGS);
2838    /* abort translation because TF/AC flag may change */
2839    s->base.is_jmp = DISAS_EOB_NEXT;
2840}
2841
2842static void gen_PSHUFW(DisasContext *s, X86DecodedInsn *decode)
2843{
2844    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2845    gen_helper_pshufw_mmx(OP_PTR0, OP_PTR1, imm);
2846}
2847
2848static void gen_PSRLW_i(DisasContext *s, X86DecodedInsn *decode)
2849{
2850    int vec_len = vector_len(s, decode);
2851
2852    if (decode->immediate >= 16) {
2853        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2854    } else {
2855        tcg_gen_gvec_shri(MO_16,
2856                          decode->op[0].offset, decode->op[1].offset,
2857                          decode->immediate, vec_len, vec_len);
2858    }
2859}
2860
2861static void gen_PSLLW_i(DisasContext *s, X86DecodedInsn *decode)
2862{
2863    int vec_len = vector_len(s, decode);
2864
2865    if (decode->immediate >= 16) {
2866        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2867    } else {
2868        tcg_gen_gvec_shli(MO_16,
2869                          decode->op[0].offset, decode->op[1].offset,
2870                          decode->immediate, vec_len, vec_len);
2871    }
2872}
2873
2874static void gen_PSRAW_i(DisasContext *s, X86DecodedInsn *decode)
2875{
2876    int vec_len = vector_len(s, decode);
2877
2878    if (decode->immediate >= 16) {
2879        decode->immediate = 15;
2880    }
2881    tcg_gen_gvec_sari(MO_16,
2882                      decode->op[0].offset, decode->op[1].offset,
2883                      decode->immediate, vec_len, vec_len);
2884}
2885
2886static void gen_PSRLD_i(DisasContext *s, X86DecodedInsn *decode)
2887{
2888    int vec_len = vector_len(s, decode);
2889
2890    if (decode->immediate >= 32) {
2891        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2892    } else {
2893        tcg_gen_gvec_shri(MO_32,
2894                          decode->op[0].offset, decode->op[1].offset,
2895                          decode->immediate, vec_len, vec_len);
2896    }
2897}
2898
2899static void gen_PSLLD_i(DisasContext *s, X86DecodedInsn *decode)
2900{
2901    int vec_len = vector_len(s, decode);
2902
2903    if (decode->immediate >= 32) {
2904        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2905    } else {
2906        tcg_gen_gvec_shli(MO_32,
2907                          decode->op[0].offset, decode->op[1].offset,
2908                          decode->immediate, vec_len, vec_len);
2909    }
2910}
2911
2912static void gen_PSRAD_i(DisasContext *s, X86DecodedInsn *decode)
2913{
2914    int vec_len = vector_len(s, decode);
2915
2916    if (decode->immediate >= 32) {
2917        decode->immediate = 31;
2918    }
2919    tcg_gen_gvec_sari(MO_32,
2920                      decode->op[0].offset, decode->op[1].offset,
2921                      decode->immediate, vec_len, vec_len);
2922}
2923
2924static void gen_PSRLQ_i(DisasContext *s, X86DecodedInsn *decode)
2925{
2926    int vec_len = vector_len(s, decode);
2927
2928    if (decode->immediate >= 64) {
2929        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2930    } else {
2931        tcg_gen_gvec_shri(MO_64,
2932                          decode->op[0].offset, decode->op[1].offset,
2933                          decode->immediate, vec_len, vec_len);
2934    }
2935}
2936
2937static void gen_PSLLQ_i(DisasContext *s, X86DecodedInsn *decode)
2938{
2939    int vec_len = vector_len(s, decode);
2940
2941    if (decode->immediate >= 64) {
2942        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2943    } else {
2944        tcg_gen_gvec_shli(MO_64,
2945                          decode->op[0].offset, decode->op[1].offset,
2946                          decode->immediate, vec_len, vec_len);
2947    }
2948}
2949
2950static TCGv_ptr make_imm8u_xmm_vec(uint8_t imm, int vec_len)
2951{
2952    MemOp ot = vec_len == 16 ? MO_128 : MO_256;
2953    TCGv_i32 imm_v = tcg_constant8u_i32(imm);
2954    TCGv_ptr ptr = tcg_temp_new_ptr();
2955
2956    tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_t0) + xmm_offset(ot),
2957                         vec_len, vec_len, 0);
2958
2959    tcg_gen_addi_ptr(ptr, tcg_env, offsetof(CPUX86State, xmm_t0));
2960    tcg_gen_st_i32(imm_v, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
2961    return ptr;
2962}
2963
2964static void gen_PSRLDQ_i(DisasContext *s, X86DecodedInsn *decode)
2965{
2966    int vec_len = vector_len(s, decode);
2967    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
2968
2969    if (s->vex_l) {
2970        gen_helper_psrldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
2971    } else {
2972        gen_helper_psrldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
2973    }
2974}
2975
2976static void gen_PSLLDQ_i(DisasContext *s, X86DecodedInsn *decode)
2977{
2978    int vec_len = vector_len(s, decode);
2979    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
2980
2981    if (s->vex_l) {
2982        gen_helper_pslldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
2983    } else {
2984        gen_helper_pslldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
2985    }
2986}
2987
2988static void gen_PUSH(DisasContext *s, X86DecodedInsn *decode)
2989{
2990    gen_push_v(s, s->T0);
2991}
2992
2993static void gen_PUSHA(DisasContext *s, X86DecodedInsn *decode)
2994{
2995    gen_pusha(s);
2996}
2997
2998static void gen_PUSHF(DisasContext *s, X86DecodedInsn *decode)
2999{
3000    gen_update_cc_op(s);
3001    gen_helper_read_eflags(s->T0, tcg_env);
3002    gen_push_v(s, s->T0);
3003}
3004
3005static MemOp gen_shift_count(DisasContext *s, X86DecodedInsn *decode,
3006                             bool *can_be_zero, TCGv *count, int unit)
3007{
3008    MemOp ot = decode->op[0].ot;
3009    int mask = (ot <= MO_32 ? 0x1f : 0x3f);
3010
3011    *can_be_zero = false;
3012    switch (unit) {
3013    case X86_OP_INT:
3014        *count = tcg_temp_new();
3015        tcg_gen_andi_tl(*count, cpu_regs[R_ECX], mask);
3016        *can_be_zero = true;
3017        break;
3018
3019    case X86_OP_IMM:
3020        if ((decode->immediate & mask) == 0) {
3021            *count = NULL;
3022            break;
3023        }
3024        *count = tcg_temp_new();
3025        tcg_gen_movi_tl(*count, decode->immediate & mask);
3026        break;
3027
3028    case X86_OP_SKIP:
3029        *count = tcg_temp_new();
3030        tcg_gen_movi_tl(*count, 1);
3031        break;
3032
3033    default:
3034        g_assert_not_reached();
3035    }
3036
3037    return ot;
3038}
3039
3040/*
3041 * Compute existing flags in decode->cc_src, for gen_* functions that wants
3042 * to set the cc_op set to CC_OP_ADCOX.  In particular, this allows rotate
3043 * operations to compute the carry in decode->cc_dst and the overflow in
3044 * decode->cc_src2.
3045 *
3046 * If need_flags is true, decode->cc_dst and decode->cc_src2 are preloaded
3047 * with the value of CF and OF before the instruction, so that it is possible
3048 * to keep the flags unmodified.
3049 *
3050 * Return true if carry could be made available cheaply as a 1-bit value in
3051 * decode->cc_dst (trying a bit harder if want_carry is true).  If false is
3052 * returned, decode->cc_dst is uninitialized and the carry is only available
3053 * as bit 0 of decode->cc_src.
3054 */
3055static bool gen_eflags_adcox(DisasContext *s, X86DecodedInsn *decode, bool want_carry, bool need_flags)
3056{
3057    bool got_cf = false;
3058    bool got_of = false;
3059
3060    decode->cc_dst = tcg_temp_new();
3061    decode->cc_src = tcg_temp_new();
3062    decode->cc_src2 = tcg_temp_new();
3063    decode->cc_op = CC_OP_ADCOX;
3064
3065    /* A lot more cc_ops could be "optimized" to avoid the extracts at
3066     * the end (INC/DEC, BMILG, MUL), but they are all really unlikely
3067     * to be followed by rotations within the same basic block.
3068     */
3069    switch (s->cc_op) {
3070    case CC_OP_ADCOX:
3071        /* No need to compute the full EFLAGS, CF/OF are already isolated.  */
3072        tcg_gen_mov_tl(decode->cc_src, cpu_cc_src);
3073        if (need_flags) {
3074            tcg_gen_mov_tl(decode->cc_src2, cpu_cc_src2);
3075            got_of = true;
3076        }
3077        if (want_carry || need_flags) {
3078            tcg_gen_mov_tl(decode->cc_dst, cpu_cc_dst);
3079            got_cf = true;
3080        }
3081        break;
3082
3083    case CC_OP_LOGICB ... CC_OP_LOGICQ:
3084        /* CF and OF are zero, do it just because it's easy.  */
3085        gen_mov_eflags(s, decode->cc_src);
3086        if (need_flags) {
3087            tcg_gen_movi_tl(decode->cc_src2, 0);
3088            got_of = true;
3089        }
3090        if (want_carry || need_flags) {
3091            tcg_gen_movi_tl(decode->cc_dst, 0);
3092            got_cf = true;
3093        }
3094        break;
3095
3096    case CC_OP_SARB ... CC_OP_SARQ:
3097        /*
3098         * SHR/RCR/SHR/RCR/... is a relatively common occurrence of RCR.
3099         * By computing CF without using eflags, the calls to cc_compute_all
3100         * can be eliminated as dead code (except for the last RCR).
3101         */
3102        if (want_carry || need_flags) {
3103            tcg_gen_andi_tl(decode->cc_dst, cpu_cc_src, 1);
3104            got_cf = true;
3105        }
3106        gen_mov_eflags(s, decode->cc_src);
3107        break;
3108
3109    case CC_OP_SHLB ... CC_OP_SHLQ:
3110        /*
3111         * Likewise for SHL/RCL/SHL/RCL/... but, if CF is not in the sign
3112         * bit, we might as well fish CF out of EFLAGS and save a shift.
3113         */
3114        if (want_carry && (!need_flags || s->cc_op == CC_OP_SHLB + MO_TL)) {
3115            tcg_gen_shri_tl(decode->cc_dst, cpu_cc_src, (8 << (s->cc_op - CC_OP_SHLB)) - 1);
3116            got_cf = true;
3117        }
3118        gen_mov_eflags(s, decode->cc_src);
3119        break;
3120
3121    default:
3122        gen_mov_eflags(s, decode->cc_src);
3123        break;
3124    }
3125
3126    if (need_flags) {
3127        /* If the flags could be left unmodified, always load them.  */
3128        if (!got_of) {
3129            tcg_gen_extract_tl(decode->cc_src2, decode->cc_src, ctz32(CC_O), 1);
3130            got_of = true;
3131        }
3132        if (!got_cf) {
3133            tcg_gen_extract_tl(decode->cc_dst, decode->cc_src, ctz32(CC_C), 1);
3134            got_cf = true;
3135        }
3136    }
3137    return got_cf;
3138}
3139
3140static void gen_rot_overflow(X86DecodedInsn *decode, TCGv result, TCGv old,
3141                             bool can_be_zero, TCGv count)
3142{
3143    MemOp ot = decode->op[0].ot;
3144    TCGv temp = can_be_zero ? tcg_temp_new() : decode->cc_src2;
3145
3146    tcg_gen_xor_tl(temp, old, result);
3147    tcg_gen_extract_tl(temp, temp, (8 << ot) - 1, 1);
3148    if (can_be_zero) {
3149        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_src2, count, tcg_constant_tl(0),
3150                           decode->cc_src2, temp);
3151    }
3152}
3153
3154/*
3155 * RCx operations are invariant modulo 8*operand_size+1.  For 8 and 16-bit operands,
3156 * this is less than 0x1f (the mask applied by gen_shift_count) so reduce further.
3157 */
3158static void gen_rotc_mod(MemOp ot, TCGv count)
3159{
3160    TCGv temp;
3161
3162    switch (ot) {
3163    case MO_8:
3164        temp = tcg_temp_new();
3165        tcg_gen_subi_tl(temp, count, 18);
3166        tcg_gen_movcond_tl(TCG_COND_GE, count, temp, tcg_constant_tl(0), temp, count);
3167        tcg_gen_subi_tl(temp, count, 9);
3168        tcg_gen_movcond_tl(TCG_COND_GE, count, temp, tcg_constant_tl(0), temp, count);
3169        break;
3170
3171    case MO_16:
3172        temp = tcg_temp_new();
3173        tcg_gen_subi_tl(temp, count, 17);
3174        tcg_gen_movcond_tl(TCG_COND_GE, count, temp, tcg_constant_tl(0), temp, count);
3175        break;
3176
3177    default:
3178        break;
3179    }
3180}
3181
3182/*
3183 * The idea here is that the bit to the right of the new bit 0 is the
3184 * new carry, and the bit to the right of the old bit 0 is the old carry.
3185 * Just like a regular rotation, the result of the rotation is composed
3186 * from a right shifted part and a left shifted part of s->T0.  The new carry
3187 * is extracted from the right-shifted portion, and the old carry is
3188 * inserted at the end of the left-shifted portion.
3189 *
3190 * Because of the separate shifts involving the carry, gen_RCL and gen_RCR
3191 * mostly operate on count-1.  This also comes in handy when computing
3192 * length - count, because (length-1) - (count-1) can be computed with
3193 * a XOR, and that is commutative unlike subtraction.
3194 */
3195static void gen_RCL(DisasContext *s, X86DecodedInsn *decode)
3196{
3197    bool have_1bit_cin, can_be_zero;
3198    TCGv count;
3199    TCGLabel *zero_label = NULL;
3200    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3201    TCGv low, high, low_count;
3202
3203    if (!count) {
3204        return;
3205    }
3206
3207    low = tcg_temp_new();
3208    high = tcg_temp_new();
3209    low_count = tcg_temp_new();
3210
3211    gen_rotc_mod(ot, count);
3212    have_1bit_cin = gen_eflags_adcox(s, decode, true, can_be_zero);
3213    if (can_be_zero) {
3214        zero_label = gen_new_label();
3215        tcg_gen_brcondi_tl(TCG_COND_EQ, count, 0, zero_label);
3216    }
3217
3218    /* Compute high part, including incoming carry.  */
3219    if (!have_1bit_cin || TCG_TARGET_deposit_tl_valid(1, TARGET_LONG_BITS - 1)) {
3220        /* high = (T0 << 1) | cin */
3221        TCGv cin = have_1bit_cin ? decode->cc_dst : decode->cc_src;
3222        tcg_gen_deposit_tl(high, cin, s->T0, 1, TARGET_LONG_BITS - 1);
3223    } else {
3224        /* Same as above but without deposit; cin in cc_dst.  */
3225        tcg_gen_add_tl(high, s->T0, decode->cc_dst);
3226        tcg_gen_add_tl(high, high, s->T0);
3227    }
3228    tcg_gen_subi_tl(count, count, 1);
3229    tcg_gen_shl_tl(high, high, count);
3230
3231    /* Compute low part and outgoing carry, incoming s->T0 is zero extended */
3232    tcg_gen_xori_tl(low_count, count, (8 << ot) - 1); /* LENGTH - 1 - (count - 1) */
3233    tcg_gen_shr_tl(low, s->T0, low_count);
3234    tcg_gen_andi_tl(decode->cc_dst, low, 1);
3235    tcg_gen_shri_tl(low, low, 1);
3236
3237    /* Compute result and outgoing overflow */
3238    tcg_gen_mov_tl(decode->cc_src2, s->T0);
3239    tcg_gen_or_tl(s->T0, low, high);
3240    gen_rot_overflow(decode, s->T0, decode->cc_src2, false, NULL);
3241
3242    if (zero_label) {
3243        gen_set_label(zero_label);
3244    }
3245}
3246
3247static void gen_RCR(DisasContext *s, X86DecodedInsn *decode)
3248{
3249    bool have_1bit_cin, can_be_zero;
3250    TCGv count;
3251    TCGLabel *zero_label = NULL;
3252    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3253    TCGv low, high, high_count;
3254
3255    if (!count) {
3256        return;
3257    }
3258
3259    low = tcg_temp_new();
3260    high = tcg_temp_new();
3261    high_count = tcg_temp_new();
3262
3263    gen_rotc_mod(ot, count);
3264    have_1bit_cin = gen_eflags_adcox(s, decode, true, can_be_zero);
3265    if (can_be_zero) {
3266        zero_label = gen_new_label();
3267        tcg_gen_brcondi_tl(TCG_COND_EQ, count, 0, zero_label);
3268    }
3269
3270    /* Save incoming carry into high, it will be shifted later.  */
3271    if (!have_1bit_cin || TCG_TARGET_deposit_tl_valid(1, TARGET_LONG_BITS - 1)) {
3272        TCGv cin = have_1bit_cin ? decode->cc_dst : decode->cc_src;
3273        tcg_gen_deposit_tl(high, cin, s->T0, 1, TARGET_LONG_BITS - 1);
3274    } else {
3275        /* Same as above but without deposit; cin in cc_dst.  */
3276        tcg_gen_add_tl(high, s->T0, decode->cc_dst);
3277        tcg_gen_add_tl(high, high, s->T0);
3278    }
3279
3280    /* Compute low part and outgoing carry, incoming s->T0 is zero extended */
3281    tcg_gen_subi_tl(count, count, 1);
3282    tcg_gen_shr_tl(low, s->T0, count);
3283    tcg_gen_andi_tl(decode->cc_dst, low, 1);
3284    tcg_gen_shri_tl(low, low, 1);
3285
3286    /* Move high part to the right position */
3287    tcg_gen_xori_tl(high_count, count, (8 << ot) - 1); /* LENGTH - 1 - (count - 1) */
3288    tcg_gen_shl_tl(high, high, high_count);
3289
3290    /* Compute result and outgoing overflow */
3291    tcg_gen_mov_tl(decode->cc_src2, s->T0);
3292    tcg_gen_or_tl(s->T0, low, high);
3293    gen_rot_overflow(decode, s->T0, decode->cc_src2, false, NULL);
3294
3295    if (zero_label) {
3296        gen_set_label(zero_label);
3297    }
3298}
3299
#ifdef CONFIG_USER_ONLY
/*
 * Stand-in generator for user-only builds (gen_RDMSR is defined to it
 * there); reaching it is an assertion failure.
 */
static void gen_unreachable(DisasContext *s, X86DecodedInsn *decode)
{
    (void)s;
    (void)decode;
    g_assert_not_reached();
}
#endif
3306
3307#ifndef CONFIG_USER_ONLY
3308static void gen_RDMSR(DisasContext *s, X86DecodedInsn *decode)
3309{
3310    gen_update_cc_op(s);
3311    gen_update_eip_cur(s);
3312    gen_helper_rdmsr(tcg_env);
3313}
3314#else
3315#define gen_RDMSR gen_unreachable
3316#endif
3317
3318static void gen_RDPMC(DisasContext *s, X86DecodedInsn *decode)
3319{
3320    gen_update_cc_op(s);
3321    gen_update_eip_cur(s);
3322    translator_io_start(&s->base);
3323    gen_helper_rdpmc(tcg_env);
3324    s->base.is_jmp = DISAS_NORETURN;
3325}
3326
3327static void gen_RDTSC(DisasContext *s, X86DecodedInsn *decode)
3328{
3329    gen_update_cc_op(s);
3330    gen_update_eip_cur(s);
3331    translator_io_start(&s->base);
3332    gen_helper_rdtsc(tcg_env);
3333}
3334
3335static void gen_RDxxBASE(DisasContext *s, X86DecodedInsn *decode)
3336{
3337    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
3338
3339    /* Preserve hflags bits by testing CR4 at runtime.  */
3340    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
3341    tcg_gen_mov_tl(s->T0, base);
3342}
3343
3344static void gen_RET(DisasContext *s, X86DecodedInsn *decode)
3345{
3346    int16_t adjust = decode->e.op1 == X86_TYPE_I ? decode->immediate : 0;
3347
3348    MemOp ot = gen_pop_T0(s);
3349    gen_stack_update(s, adjust + (1 << ot));
3350    gen_op_jmp_v(s, s->T0);
3351    gen_bnd_jmp(s);
3352    s->base.is_jmp = DISAS_JUMP;
3353}
3354
3355static void gen_RETF(DisasContext *s, X86DecodedInsn *decode)
3356{
3357    int16_t adjust = decode->e.op1 == X86_TYPE_I ? decode->immediate : 0;
3358
3359    if (!PE(s) || VM86(s)) {
3360        gen_lea_ss_ofs(s, s->A0, cpu_regs[R_ESP], 0);
3361        /* pop offset */
3362        gen_op_ld_v(s, s->dflag, s->T0, s->A0);
3363        /* NOTE: keeping EIP updated is not a problem in case of
3364           exception */
3365        gen_op_jmp_v(s, s->T0);
3366        /* pop selector */
3367        gen_add_A0_im(s, 1 << s->dflag);
3368        gen_op_ld_v(s, s->dflag, s->T0, s->A0);
3369        gen_op_movl_seg_real(s, R_CS, s->T0);
3370        /* add stack offset */
3371        gen_stack_update(s, adjust + (2 << s->dflag));
3372    } else {
3373        gen_update_cc_op(s);
3374        gen_update_eip_cur(s);
3375        gen_helper_lret_protected(tcg_env, tcg_constant_i32(s->dflag - 1),
3376                                  tcg_constant_i32(adjust));
3377    }
3378    s->base.is_jmp = DISAS_EOB_ONLY;
3379}
3380
3381/*
3382 * Return non-NULL if a 32-bit rotate works, after possibly replicating the input.
3383 * The input has already been zero-extended upon operand decode.
3384 */
3385static TCGv_i32 gen_rot_replicate(MemOp ot, TCGv in)
3386{
3387    TCGv_i32 temp;
3388    switch (ot) {
3389    case MO_8:
3390        temp = tcg_temp_new_i32();
3391        tcg_gen_trunc_tl_i32(temp, in);
3392        tcg_gen_muli_i32(temp, temp, 0x01010101);
3393        return temp;
3394
3395    case MO_16:
3396        temp = tcg_temp_new_i32();
3397        tcg_gen_trunc_tl_i32(temp, in);
3398        tcg_gen_deposit_i32(temp, temp, temp, 16, 16);
3399        return temp;
3400
3401#ifdef TARGET_X86_64
3402    case MO_32:
3403        temp = tcg_temp_new_i32();
3404        tcg_gen_trunc_tl_i32(temp, in);
3405        return temp;
3406#endif
3407
3408    default:
3409        return NULL;
3410    }
3411}
3412
3413static void gen_rot_carry(X86DecodedInsn *decode, TCGv result,
3414                          bool can_be_zero, TCGv count, int bit)
3415{
3416    if (!can_be_zero) {
3417        tcg_gen_extract_tl(decode->cc_dst, result, bit, 1);
3418    } else {
3419        TCGv temp = tcg_temp_new();
3420        tcg_gen_extract_tl(temp, result, bit, 1);
3421        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_dst, count, tcg_constant_tl(0),
3422                           decode->cc_dst, temp);
3423    }
3424}
3425
3426static void gen_ROL(DisasContext *s, X86DecodedInsn *decode)
3427{
3428    bool can_be_zero;
3429    TCGv count;
3430    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3431    TCGv_i32 temp32, count32;
3432    TCGv old = tcg_temp_new();
3433
3434    if (!count) {
3435        return;
3436    }
3437
3438    gen_eflags_adcox(s, decode, false, can_be_zero);
3439    tcg_gen_mov_tl(old, s->T0);
3440    temp32 = gen_rot_replicate(ot, s->T0);
3441    if (temp32) {
3442        count32 = tcg_temp_new_i32();
3443        tcg_gen_trunc_tl_i32(count32, count);
3444        tcg_gen_rotl_i32(temp32, temp32, count32);
3445        /* Zero extend to facilitate later optimization.  */
3446        tcg_gen_extu_i32_tl(s->T0, temp32);
3447    } else {
3448        tcg_gen_rotl_tl(s->T0, s->T0, count);
3449    }
3450    gen_rot_carry(decode, s->T0, can_be_zero, count, 0);
3451    gen_rot_overflow(decode, s->T0, old, can_be_zero, count);
3452}
3453
3454static void gen_ROR(DisasContext *s, X86DecodedInsn *decode)
3455{
3456    bool can_be_zero;
3457    TCGv count;
3458    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3459    TCGv_i32 temp32, count32;
3460    TCGv old = tcg_temp_new();
3461
3462    if (!count) {
3463        return;
3464    }
3465
3466    gen_eflags_adcox(s, decode, false, can_be_zero);
3467    tcg_gen_mov_tl(old, s->T0);
3468    temp32 = gen_rot_replicate(ot, s->T0);
3469    if (temp32) {
3470        count32 = tcg_temp_new_i32();
3471        tcg_gen_trunc_tl_i32(count32, count);
3472        tcg_gen_rotr_i32(temp32, temp32, count32);
3473        /* Zero extend to facilitate later optimization.  */
3474        tcg_gen_extu_i32_tl(s->T0, temp32);
3475        gen_rot_carry(decode, s->T0, can_be_zero, count, 31);
3476    } else {
3477        tcg_gen_rotr_tl(s->T0, s->T0, count);
3478        gen_rot_carry(decode, s->T0, can_be_zero, count, TARGET_LONG_BITS - 1);
3479    }
3480    gen_rot_overflow(decode, s->T0, old, can_be_zero, count);
3481}
3482
3483static void gen_RORX(DisasContext *s, X86DecodedInsn *decode)
3484{
3485    MemOp ot = decode->op[0].ot;
3486    int mask = ot == MO_64 ? 63 : 31;
3487    int b = decode->immediate & mask;
3488
3489    switch (ot) {
3490    case MO_32:
3491#ifdef TARGET_X86_64
3492        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
3493        tcg_gen_rotri_i32(s->tmp2_i32, s->tmp2_i32, b);
3494        tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
3495        break;
3496
3497    case MO_64:
3498#endif
3499        tcg_gen_rotri_tl(s->T0, s->T0, b);
3500        break;
3501
3502    default:
3503        g_assert_not_reached();
3504    }
3505}
3506
3507#ifndef CONFIG_USER_ONLY
3508static void gen_RSM(DisasContext *s, X86DecodedInsn *decode)
3509{
3510    gen_helper_rsm(tcg_env);
3511    assume_cc_op(s, CC_OP_EFLAGS);
3512    s->base.is_jmp = DISAS_EOB_ONLY;
3513}
3514#else
3515#define gen_RSM gen_UD
3516#endif
3517
3518static void gen_SAHF(DisasContext *s, X86DecodedInsn *decode)
3519{
3520    if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
3521        return gen_illegal_opcode(s);
3522    }
3523    tcg_gen_shri_tl(s->T0, cpu_regs[R_EAX], 8);
3524    gen_compute_eflags(s);
3525    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O);
3526    tcg_gen_andi_tl(s->T0, s->T0, CC_S | CC_Z | CC_A | CC_P | CC_C);
3527    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, s->T0);
3528}
3529
3530static void gen_SALC(DisasContext *s, X86DecodedInsn *decode)
3531{
3532    gen_compute_eflags_c(s, s->T0);
3533    tcg_gen_neg_tl(s->T0, s->T0);
3534}
3535
3536static void gen_shift_dynamic_flags(DisasContext *s, X86DecodedInsn *decode, TCGv count, CCOp cc_op)
3537{
3538    TCGv_i32 count32 = tcg_temp_new_i32();
3539    TCGv_i32 old_cc_op;
3540
3541    decode->cc_op = CC_OP_DYNAMIC;
3542    decode->cc_op_dynamic = tcg_temp_new_i32();
3543
3544    assert(decode->cc_dst == s->T0);
3545    if (cc_op_live[s->cc_op] & USES_CC_DST) {
3546        decode->cc_dst = tcg_temp_new();
3547        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_dst, count, tcg_constant_tl(0),
3548                           cpu_cc_dst, s->T0);
3549    }
3550
3551    if (cc_op_live[s->cc_op] & USES_CC_SRC) {
3552        tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_src, count, tcg_constant_tl(0),
3553                           cpu_cc_src, decode->cc_src);
3554    }
3555
3556    tcg_gen_trunc_tl_i32(count32, count);
3557    if (s->cc_op == CC_OP_DYNAMIC) {
3558        old_cc_op = cpu_cc_op;
3559    } else {
3560        old_cc_op = tcg_constant_i32(s->cc_op);
3561    }
3562    tcg_gen_movcond_i32(TCG_COND_EQ, decode->cc_op_dynamic, count32, tcg_constant_i32(0),
3563                        old_cc_op, tcg_constant_i32(cc_op));
3564}
3565
3566static void gen_SAR(DisasContext *s, X86DecodedInsn *decode)
3567{
3568    bool can_be_zero;
3569    TCGv count;
3570    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3571
3572    if (!count) {
3573        return;
3574    }
3575
3576    decode->cc_dst = s->T0;
3577    decode->cc_src = tcg_temp_new();
3578    tcg_gen_subi_tl(decode->cc_src, count, 1);
3579    tcg_gen_sar_tl(decode->cc_src, s->T0, decode->cc_src);
3580    tcg_gen_sar_tl(s->T0, s->T0, count);
3581    if (can_be_zero) {
3582        gen_shift_dynamic_flags(s, decode, count, CC_OP_SARB + ot);
3583    } else {
3584        decode->cc_op = CC_OP_SARB + ot;
3585    }
3586}
3587
3588static void gen_SARX(DisasContext *s, X86DecodedInsn *decode)
3589{
3590    MemOp ot = decode->op[0].ot;
3591    int mask;
3592
3593    mask = ot == MO_64 ? 63 : 31;
3594    tcg_gen_andi_tl(s->T1, s->T1, mask);
3595    tcg_gen_sar_tl(s->T0, s->T0, s->T1);
3596}
3597
3598static void gen_SBB(DisasContext *s, X86DecodedInsn *decode)
3599{
3600    MemOp ot = decode->op[0].ot;
3601    TCGv c_in = tcg_temp_new();
3602
3603    gen_compute_eflags_c(s, c_in);
3604    if (s->prefix & PREFIX_LOCK) {
3605        tcg_gen_add_tl(s->T0, s->T1, c_in);
3606        tcg_gen_neg_tl(s->T0, s->T0);
3607        tcg_gen_atomic_add_fetch_tl(s->T0, s->A0, s->T0,
3608                                    s->mem_index, ot | MO_LE);
3609    } else {
3610        /*
3611         * TODO: SBB reg, reg could use gen_prepare_eflags_c followed by
3612         * negsetcond, and CC_OP_SUBB as the cc_op.
3613         */
3614        tcg_gen_sub_tl(s->T0, s->T0, s->T1);
3615        tcg_gen_sub_tl(s->T0, s->T0, c_in);
3616    }
3617    prepare_update3_cc(decode, s, CC_OP_SBBB + ot, c_in);
3618}
3619
3620static void gen_SCAS(DisasContext *s, X86DecodedInsn *decode)
3621{
3622    MemOp ot = decode->op[2].ot;
3623    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
3624        gen_repz_nz(s, ot, gen_scas);
3625    } else {
3626        gen_scas(s, ot);
3627    }
3628}
3629
3630static void gen_SETcc(DisasContext *s, X86DecodedInsn *decode)
3631{
3632    gen_setcc1(s, decode->b & 0xf, s->T0);
3633}
3634
3635static void gen_SFENCE(DisasContext *s, X86DecodedInsn *decode)
3636{
3637    tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
3638}
3639
3640static void gen_SHA1NEXTE(DisasContext *s, X86DecodedInsn *decode)
3641{
3642    gen_helper_sha1nexte(OP_PTR0, OP_PTR1, OP_PTR2);
3643}
3644
3645static void gen_SHA1MSG1(DisasContext *s, X86DecodedInsn *decode)
3646{
3647    gen_helper_sha1msg1(OP_PTR0, OP_PTR1, OP_PTR2);
3648}
3649
3650static void gen_SHA1MSG2(DisasContext *s, X86DecodedInsn *decode)
3651{
3652    gen_helper_sha1msg2(OP_PTR0, OP_PTR1, OP_PTR2);
3653}
3654
3655static void gen_SHA1RNDS4(DisasContext *s, X86DecodedInsn *decode)
3656{
3657    switch(decode->immediate & 3) {
3658    case 0:
3659        gen_helper_sha1rnds4_f0(OP_PTR0, OP_PTR0, OP_PTR1);
3660        break;
3661    case 1:
3662        gen_helper_sha1rnds4_f1(OP_PTR0, OP_PTR0, OP_PTR1);
3663        break;
3664    case 2:
3665        gen_helper_sha1rnds4_f2(OP_PTR0, OP_PTR0, OP_PTR1);
3666        break;
3667    case 3:
3668        gen_helper_sha1rnds4_f3(OP_PTR0, OP_PTR0, OP_PTR1);
3669        break;
3670    }
3671}
3672
3673static void gen_SHA256MSG1(DisasContext *s, X86DecodedInsn *decode)
3674{
3675    gen_helper_sha256msg1(OP_PTR0, OP_PTR1, OP_PTR2);
3676}
3677
3678static void gen_SHA256MSG2(DisasContext *s, X86DecodedInsn *decode)
3679{
3680    gen_helper_sha256msg2(OP_PTR0, OP_PTR1, OP_PTR2);
3681}
3682
3683static void gen_SHA256RNDS2(DisasContext *s, X86DecodedInsn *decode)
3684{
3685    TCGv_i32 wk0 = tcg_temp_new_i32();
3686    TCGv_i32 wk1 = tcg_temp_new_i32();
3687
3688    tcg_gen_ld_i32(wk0, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(0)));
3689    tcg_gen_ld_i32(wk1, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(1)));
3690
3691    gen_helper_sha256rnds2(OP_PTR0, OP_PTR1, OP_PTR2, wk0, wk1);
3692}
3693
3694static void gen_SHL(DisasContext *s, X86DecodedInsn *decode)
3695{
3696    bool can_be_zero;
3697    TCGv count;
3698    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3699
3700    if (!count) {
3701        return;
3702    }
3703
3704    decode->cc_dst = s->T0;
3705    decode->cc_src = tcg_temp_new();
3706    tcg_gen_subi_tl(decode->cc_src, count, 1);
3707    tcg_gen_shl_tl(decode->cc_src, s->T0, decode->cc_src);
3708    tcg_gen_shl_tl(s->T0, s->T0, count);
3709    if (can_be_zero) {
3710        gen_shift_dynamic_flags(s, decode, count, CC_OP_SHLB + ot);
3711    } else {
3712        decode->cc_op = CC_OP_SHLB + ot;
3713    }
3714}
3715
3716static void gen_SHLD(DisasContext *s, X86DecodedInsn *decode)
3717{
3718    bool can_be_zero;
3719    TCGv count;
3720    int unit = decode->e.op3 == X86_TYPE_I ? X86_OP_IMM : X86_OP_INT;
3721    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, unit);
3722
3723    if (!count) {
3724        return;
3725    }
3726
3727    decode->cc_dst = s->T0;
3728    decode->cc_src = s->tmp0;
3729    gen_shiftd_rm_T1(s, ot, false, count);
3730    if (can_be_zero) {
3731        gen_shift_dynamic_flags(s, decode, count, CC_OP_SHLB + ot);
3732    } else {
3733        decode->cc_op = CC_OP_SHLB + ot;
3734    }
3735}
3736
3737static void gen_SHLX(DisasContext *s, X86DecodedInsn *decode)
3738{
3739    MemOp ot = decode->op[0].ot;
3740    int mask;
3741
3742    mask = ot == MO_64 ? 63 : 31;
3743    tcg_gen_andi_tl(s->T1, s->T1, mask);
3744    tcg_gen_shl_tl(s->T0, s->T0, s->T1);
3745}
3746
3747static void gen_SHR(DisasContext *s, X86DecodedInsn *decode)
3748{
3749    bool can_be_zero;
3750    TCGv count;
3751    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
3752
3753    if (!count) {
3754        return;
3755    }
3756
3757    decode->cc_dst = s->T0;
3758    decode->cc_src = tcg_temp_new();
3759    tcg_gen_subi_tl(decode->cc_src, count, 1);
3760    tcg_gen_shr_tl(decode->cc_src, s->T0, decode->cc_src);
3761    tcg_gen_shr_tl(s->T0, s->T0, count);
3762    if (can_be_zero) {
3763        gen_shift_dynamic_flags(s, decode, count, CC_OP_SARB + ot);
3764    } else {
3765        decode->cc_op = CC_OP_SARB + ot;
3766    }
3767}
3768
3769static void gen_SHRD(DisasContext *s, X86DecodedInsn *decode)
3770{
3771    bool can_be_zero;
3772    TCGv count;
3773    int unit = decode->e.op3 == X86_TYPE_I ? X86_OP_IMM : X86_OP_INT;
3774    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, unit);
3775
3776    if (!count) {
3777        return;
3778    }
3779
3780    decode->cc_dst = s->T0;
3781    decode->cc_src = s->tmp0;
3782    gen_shiftd_rm_T1(s, ot, true, count);
3783    if (can_be_zero) {
3784        gen_shift_dynamic_flags(s, decode, count, CC_OP_SARB + ot);
3785    } else {
3786        decode->cc_op = CC_OP_SARB + ot;
3787    }
3788}
3789
3790static void gen_SHRX(DisasContext *s, X86DecodedInsn *decode)
3791{
3792    MemOp ot = decode->op[0].ot;
3793    int mask;
3794
3795    mask = ot == MO_64 ? 63 : 31;
3796    tcg_gen_andi_tl(s->T1, s->T1, mask);
3797    tcg_gen_shr_tl(s->T0, s->T0, s->T1);
3798}
3799
3800static void gen_STC(DisasContext *s, X86DecodedInsn *decode)
3801{
3802    gen_compute_eflags(s);
3803    tcg_gen_ori_tl(cpu_cc_src, cpu_cc_src, CC_C);
3804}
3805
3806static void gen_STD(DisasContext *s, X86DecodedInsn *decode)
3807{
3808    tcg_gen_st_i32(tcg_constant_i32(-1), tcg_env, offsetof(CPUX86State, df));
3809}
3810
3811static void gen_STI(DisasContext *s, X86DecodedInsn *decode)
3812{
3813    gen_set_eflags(s, IF_MASK);
3814    s->base.is_jmp = DISAS_EOB_INHIBIT_IRQ;
3815}
3816
3817static void gen_VAESKEYGEN(DisasContext *s, X86DecodedInsn *decode)
3818{
3819    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
3820    assert(!s->vex_l);
3821    gen_helper_aeskeygenassist_xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
3822}
3823
3824static void gen_STMXCSR(DisasContext *s, X86DecodedInsn *decode)
3825{
3826    gen_helper_update_mxcsr(tcg_env);
3827    tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
3828}
3829
3830static void gen_STOS(DisasContext *s, X86DecodedInsn *decode)
3831{
3832    MemOp ot = decode->op[1].ot;
3833    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
3834        gen_repz(s, ot, gen_stos);
3835    } else {
3836        gen_stos(s, ot);
3837    }
3838}
3839
3840static void gen_SUB(DisasContext *s, X86DecodedInsn *decode)
3841{
3842    MemOp ot = decode->op[1].ot;
3843
3844    if (s->prefix & PREFIX_LOCK) {
3845        tcg_gen_neg_tl(s->T0, s->T1);
3846        tcg_gen_atomic_fetch_add_tl(s->cc_srcT, s->A0, s->T0,
3847                                    s->mem_index, ot | MO_LE);
3848        tcg_gen_sub_tl(s->T0, s->cc_srcT, s->T1);
3849    } else {
3850        tcg_gen_mov_tl(s->cc_srcT, s->T0);
3851        tcg_gen_sub_tl(s->T0, s->T0, s->T1);
3852    }
3853    prepare_update2_cc(decode, s, CC_OP_SUBB + ot);
3854}
3855
3856static void gen_SYSCALL(DisasContext *s, X86DecodedInsn *decode)
3857{
3858    gen_update_cc_op(s);
3859    gen_update_eip_cur(s);
3860    gen_helper_syscall(tcg_env, cur_insn_len_i32(s));
3861    if (LMA(s)) {
3862        assume_cc_op(s, CC_OP_EFLAGS);
3863    }
3864
3865    /*
3866     * TF handling for the syscall insn is different. The TF bit is checked
3867     * after the syscall insn completes. This allows #DB to not be
3868     * generated after one has entered CPL0 if TF is set in FMASK.
3869     */
3870    s->base.is_jmp = DISAS_EOB_RECHECK_TF;
3871}
3872
3873static void gen_SYSENTER(DisasContext *s, X86DecodedInsn *decode)
3874{
3875    gen_helper_sysenter(tcg_env);
3876    s->base.is_jmp = DISAS_EOB_ONLY;
3877}
3878
3879static void gen_SYSEXIT(DisasContext *s, X86DecodedInsn *decode)
3880{
3881    gen_helper_sysexit(tcg_env, tcg_constant_i32(s->dflag - 1));
3882    s->base.is_jmp = DISAS_EOB_ONLY;
3883}
3884
3885static void gen_SYSRET(DisasContext *s, X86DecodedInsn *decode)
3886{
3887    gen_helper_sysret(tcg_env, tcg_constant_i32(s->dflag - 1));
3888    if (LMA(s)) {
3889        assume_cc_op(s, CC_OP_EFLAGS);
3890    }
3891
3892    /*
3893     * TF handling for the sysret insn is different. The TF bit is checked
3894     * after the sysret insn completes. This allows #DB to be
3895     * generated "as if" the syscall insn in userspace has just
3896     * completed.
3897     */
3898    s->base.is_jmp = DISAS_EOB_RECHECK_TF;
3899}
3900
3901static void gen_TZCNT(DisasContext *s, X86DecodedInsn *decode)
3902{
3903    MemOp ot = decode->op[0].ot;
3904
3905    /* C bit (cc_src) is defined related to the input.  */
3906    decode->cc_src = tcg_temp_new();
3907    decode->cc_dst = s->T0;
3908    decode->cc_op = CC_OP_BMILGB + ot;
3909    tcg_gen_mov_tl(decode->cc_src, s->T0);
3910
3911    /* A zero input returns the operand size.  */
3912    tcg_gen_ctzi_tl(s->T0, s->T0, 8 << ot);
3913}
3914
3915static void gen_UD(DisasContext *s, X86DecodedInsn *decode)
3916{
3917    gen_illegal_opcode(s);
3918}
3919
3920static void gen_VAESIMC(DisasContext *s, X86DecodedInsn *decode)
3921{
3922    assert(!s->vex_l);
3923    gen_helper_aesimc_xmm(tcg_env, OP_PTR0, OP_PTR2);
3924}
3925
/*
 * 00 = v*ps Vps, Hps, Wps
 * 66 = v*pd Vpd, Hpd, Wpd
 * f3 = v*ss Vss, Hss, Wss
 * f2 = v*sd Vsd, Hsd, Wsd
 */
/*
 * SSE_CMP(x) expands to one table row holding the six helper variants of
 * comparison predicate x, in this column order:
 *   0: packed single, xmm    1: packed double, xmm
 *   2: scalar single         3: scalar double
 *   4: packed single, ymm    5: packed double, ymm
 * gen_VCMP() computes the column index in exactly this layout.
 */
#define SSE_CMP(x) { \
    gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \
    gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, \
    gen_helper_ ## x ## ps ## _ymm, gen_helper_ ## x ## pd ## _ymm}
/*
 * Comparison helpers, one row per predicate.  The row is selected by the
 * instruction immediate (see gen_VCMP(): masked to 0..31 with a VEX
 * prefix and to 0..7 without), so the row order must not be changed.
 */
static const SSEFunc_0_eppp gen_helper_cmp_funcs[32][6] = {
    /* rows 0..7 */
    SSE_CMP(cmpeq),
    SSE_CMP(cmplt),
    SSE_CMP(cmple),
    SSE_CMP(cmpunord),
    SSE_CMP(cmpneq),
    SSE_CMP(cmpnlt),
    SSE_CMP(cmpnle),
    SSE_CMP(cmpord),

    /* rows 8..15 */
    SSE_CMP(cmpequ),
    SSE_CMP(cmpnge),
    SSE_CMP(cmpngt),
    SSE_CMP(cmpfalse),
    SSE_CMP(cmpnequ),
    SSE_CMP(cmpge),
    SSE_CMP(cmpgt),
    SSE_CMP(cmptrue),

    /* rows 16..23 */
    SSE_CMP(cmpeqs),
    SSE_CMP(cmpltq),
    SSE_CMP(cmpleq),
    SSE_CMP(cmpunords),
    SSE_CMP(cmpneqq),
    SSE_CMP(cmpnltq),
    SSE_CMP(cmpnleq),
    SSE_CMP(cmpords),

    /* rows 24..31 */
    SSE_CMP(cmpequs),
    SSE_CMP(cmpngeq),
    SSE_CMP(cmpngtq),
    SSE_CMP(cmpfalses),
    SSE_CMP(cmpnequs),
    SSE_CMP(cmpgeq),
    SSE_CMP(cmpgtq),
    SSE_CMP(cmptrues),
};
#undef SSE_CMP
3974
3975static void gen_VCMP(DisasContext *s, X86DecodedInsn *decode)
3976{
3977    int index = decode->immediate & (s->prefix & PREFIX_VEX ? 31 : 7);
3978    int b =
3979        s->prefix & PREFIX_REPZ  ? 2 /* ss */ :
3980        s->prefix & PREFIX_REPNZ ? 3 /* sd */ :
3981        !!(s->prefix & PREFIX_DATA) /* pd */ + (s->vex_l << 2);
3982
3983    gen_helper_cmp_funcs[index][b](tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
3984}
3985
3986static void gen_VCOMI(DisasContext *s, X86DecodedInsn *decode)
3987{
3988    SSEFunc_0_epp fn;
3989    fn = s->prefix & PREFIX_DATA ? gen_helper_comisd : gen_helper_comiss;
3990    fn(tcg_env, OP_PTR1, OP_PTR2);
3991    assume_cc_op(s, CC_OP_EFLAGS);
3992}
3993
3994static void gen_VCVTPD2PS(DisasContext *s, X86DecodedInsn *decode)
3995{
3996    if (s->vex_l) {
3997        gen_helper_cvtpd2ps_ymm(tcg_env, OP_PTR0, OP_PTR2);
3998    } else {
3999        gen_helper_cvtpd2ps_xmm(tcg_env, OP_PTR0, OP_PTR2);
4000    }
4001}
4002
4003static void gen_VCVTPS2PD(DisasContext *s, X86DecodedInsn *decode)
4004{
4005    if (s->vex_l) {
4006        gen_helper_cvtps2pd_ymm(tcg_env, OP_PTR0, OP_PTR2);
4007    } else {
4008        gen_helper_cvtps2pd_xmm(tcg_env, OP_PTR0, OP_PTR2);
4009    }
4010}
4011
4012static void gen_VCVTPS2PH(DisasContext *s, X86DecodedInsn *decode)
4013{
4014    gen_unary_imm_fp_sse(s, decode,
4015                      gen_helper_cvtps2ph_xmm,
4016                      gen_helper_cvtps2ph_ymm);
4017    /*
4018     * VCVTPS2PH is the only instruction that performs an operation on a
4019     * register source and then *stores* into memory.
4020     */
4021    if (decode->op[0].has_ea) {
4022        gen_store_sse(s, decode, decode->op[0].offset);
4023    }
4024}
4025
4026static void gen_VCVTSD2SS(DisasContext *s, X86DecodedInsn *decode)
4027{
4028    gen_helper_cvtsd2ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
4029}
4030
4031static void gen_VCVTSS2SD(DisasContext *s, X86DecodedInsn *decode)
4032{
4033    gen_helper_cvtss2sd(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
4034}
4035
4036static void gen_VCVTSI2Sx(DisasContext *s, X86DecodedInsn *decode)
4037{
4038    int vec_len = vector_len(s, decode);
4039    TCGv_i32 in;
4040
4041    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4042
4043#ifdef TARGET_X86_64
4044    MemOp ot = decode->op[2].ot;
4045    if (ot == MO_64) {
4046        if (s->prefix & PREFIX_REPNZ) {
4047            gen_helper_cvtsq2sd(tcg_env, OP_PTR0, s->T1);
4048        } else {
4049            gen_helper_cvtsq2ss(tcg_env, OP_PTR0, s->T1);
4050        }
4051        return;
4052    }
4053    in = s->tmp2_i32;
4054    tcg_gen_trunc_tl_i32(in, s->T1);
4055#else
4056    in = s->T1;
4057#endif
4058
4059    if (s->prefix & PREFIX_REPNZ) {
4060        gen_helper_cvtsi2sd(tcg_env, OP_PTR0, in);
4061    } else {
4062        gen_helper_cvtsi2ss(tcg_env, OP_PTR0, in);
4063    }
4064}
4065
4066static inline void gen_VCVTtSx2SI(DisasContext *s, X86DecodedInsn *decode,
4067                                  SSEFunc_i_ep ss2si, SSEFunc_l_ep ss2sq,
4068                                  SSEFunc_i_ep sd2si, SSEFunc_l_ep sd2sq)
4069{
4070    TCGv_i32 out;
4071
4072#ifdef TARGET_X86_64
4073    MemOp ot = decode->op[0].ot;
4074    if (ot == MO_64) {
4075        if (s->prefix & PREFIX_REPNZ) {
4076            sd2sq(s->T0, tcg_env, OP_PTR2);
4077        } else {
4078            ss2sq(s->T0, tcg_env, OP_PTR2);
4079        }
4080        return;
4081    }
4082
4083    out = s->tmp2_i32;
4084#else
4085    out = s->T0;
4086#endif
4087    if (s->prefix & PREFIX_REPNZ) {
4088        sd2si(out, tcg_env, OP_PTR2);
4089    } else {
4090        ss2si(out, tcg_env, OP_PTR2);
4091    }
4092#ifdef TARGET_X86_64
4093    tcg_gen_extu_i32_tl(s->T0, out);
4094#endif
4095}
4096
/*
 * When TARGET_X86_64 is not defined, map the 64-bit conversion helper
 * names to NULL so that gen_VCVTSx2SI() and gen_VCVTTSx2SI() can still
 * name them as arguments; gen_VCVTtSx2SI() only calls the 64-bit
 * callbacks under #ifdef TARGET_X86_64.
 */
#ifndef TARGET_X86_64
#define gen_helper_cvtss2sq NULL
#define gen_helper_cvtsd2sq NULL
#define gen_helper_cvttss2sq NULL
#define gen_helper_cvttsd2sq NULL
#endif
4103
4104static void gen_VCVTSx2SI(DisasContext *s, X86DecodedInsn *decode)
4105{
4106    gen_VCVTtSx2SI(s, decode,
4107                   gen_helper_cvtss2si, gen_helper_cvtss2sq,
4108                   gen_helper_cvtsd2si, gen_helper_cvtsd2sq);
4109}
4110
4111static void gen_VCVTTSx2SI(DisasContext *s, X86DecodedInsn *decode)
4112{
4113    gen_VCVTtSx2SI(s, decode,
4114                   gen_helper_cvttss2si, gen_helper_cvttss2sq,
4115                   gen_helper_cvttsd2si, gen_helper_cvttsd2sq);
4116}
4117
4118static void gen_VEXTRACTx128(DisasContext *s, X86DecodedInsn *decode)
4119{
4120    int mask = decode->immediate & 1;
4121    int src_ofs = vector_elem_offset(&decode->op[1], MO_128, mask);
4122    if (decode->op[0].has_ea) {
4123        /* VEX-only instruction, no alignment requirements.  */
4124        gen_sto_env_A0(s, src_ofs, false);
4125    } else {
4126        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, 16, 16);
4127    }
4128}
4129
4130static void gen_VEXTRACTPS(DisasContext *s, X86DecodedInsn *decode)
4131{
4132    gen_pextr(s, decode, MO_32);
4133}
4134
4135static void gen_vinsertps(DisasContext *s, X86DecodedInsn *decode)
4136{
4137    int val = decode->immediate;
4138    int dest_word = (val >> 4) & 3;
4139    int new_mask = (val & 15) | (1 << dest_word);
4140    int vec_len = 16;
4141
4142    assert(!s->vex_l);
4143
4144    if (new_mask == 15) {
4145        /* All zeroes except possibly for the inserted element */
4146        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
4147    } else if (decode->op[1].offset != decode->op[0].offset) {
4148        gen_store_sse(s, decode, decode->op[1].offset);
4149    }
4150
4151    if (new_mask != (val & 15)) {
4152        tcg_gen_st_i32(s->tmp2_i32, tcg_env,
4153                       vector_elem_offset(&decode->op[0], MO_32, dest_word));
4154    }
4155
4156    if (new_mask != 15) {
4157        TCGv_i32 zero = tcg_constant_i32(0); /* float32_zero */
4158        int i;
4159        for (i = 0; i < 4; i++) {
4160            if ((val >> i) & 1) {
4161                tcg_gen_st_i32(zero, tcg_env,
4162                               vector_elem_offset(&decode->op[0], MO_32, i));
4163            }
4164        }
4165    }
4166}
4167
4168static void gen_VINSERTPS_r(DisasContext *s, X86DecodedInsn *decode)
4169{
4170    int val = decode->immediate;
4171    tcg_gen_ld_i32(s->tmp2_i32, tcg_env,
4172                   vector_elem_offset(&decode->op[2], MO_32, (val >> 6) & 3));
4173    gen_vinsertps(s, decode);
4174}
4175
4176static void gen_VINSERTPS_m(DisasContext *s, X86DecodedInsn *decode)
4177{
4178    tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
4179    gen_vinsertps(s, decode);
4180}
4181
4182static void gen_VINSERTx128(DisasContext *s, X86DecodedInsn *decode)
4183{
4184    int mask = decode->immediate & 1;
4185    tcg_gen_gvec_mov(MO_64,
4186                     decode->op[0].offset + offsetof(YMMReg, YMM_X(mask)),
4187                     decode->op[2].offset + offsetof(YMMReg, YMM_X(0)), 16, 16);
4188    tcg_gen_gvec_mov(MO_64,
4189                     decode->op[0].offset + offsetof(YMMReg, YMM_X(!mask)),
4190                     decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16);
4191}
4192
4193static inline void gen_maskmov(DisasContext *s, X86DecodedInsn *decode,
4194                               SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm)
4195{
4196    if (!s->vex_l) {
4197        xmm(tcg_env, OP_PTR2, OP_PTR1, s->A0);
4198    } else {
4199        ymm(tcg_env, OP_PTR2, OP_PTR1, s->A0);
4200    }
4201}
4202
4203static void gen_VMASKMOVPD_st(DisasContext *s, X86DecodedInsn *decode)
4204{
4205    gen_maskmov(s, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm);
4206}
4207
4208static void gen_VMASKMOVPS_st(DisasContext *s, X86DecodedInsn *decode)
4209{
4210    gen_maskmov(s, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
4211}
4212
4213static void gen_VMOVHPx_ld(DisasContext *s, X86DecodedInsn *decode)
4214{
4215    gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4216    if (decode->op[0].offset != decode->op[1].offset) {
4217        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
4218        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4219    }
4220}
4221
4222static void gen_VMOVHPx_st(DisasContext *s, X86DecodedInsn *decode)
4223{
4224    gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
4225}
4226
4227static void gen_VMOVHPx(DisasContext *s, X86DecodedInsn *decode)
4228{
4229    if (decode->op[0].offset != decode->op[2].offset) {
4230        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
4231        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4232    }
4233    if (decode->op[0].offset != decode->op[1].offset) {
4234        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
4235        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4236    }
4237}
4238
4239static void gen_VMOVHLPS(DisasContext *s, X86DecodedInsn *decode)
4240{
4241    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
4242    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4243    if (decode->op[0].offset != decode->op[1].offset) {
4244        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(1)));
4245        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4246    }
4247}
4248
4249static void gen_VMOVLHPS(DisasContext *s, X86DecodedInsn *decode)
4250{
4251    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);
4252    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
4253    if (decode->op[0].offset != decode->op[1].offset) {
4254        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
4255        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4256    }
4257}
4258
/*
 * Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOVHPx.
 * Use a gvec move to move everything above the bottom 64 bits.
 */
4263
4264static void gen_VMOVLPx(DisasContext *s, X86DecodedInsn *decode)
4265{
4266    int vec_len = vector_len(s, decode);
4267
4268    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0)));
4269    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4270    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
4271}
4272
4273static void gen_VMOVLPx_ld(DisasContext *s, X86DecodedInsn *decode)
4274{
4275    int vec_len = vector_len(s, decode);
4276
4277    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
4278    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4279    tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
4280}
4281
4282static void gen_VMOVLPx_st(DisasContext *s, X86DecodedInsn *decode)
4283{
4284    tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0)));
4285    tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
4286}
4287
4288static void gen_VMOVSD_ld(DisasContext *s, X86DecodedInsn *decode)
4289{
4290    TCGv_i64 zero = tcg_constant_i64(0);
4291
4292    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
4293    tcg_gen_st_i64(zero, OP_PTR0, offsetof(ZMMReg, ZMM_Q(1)));
4294    tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
4295}
4296
4297static void gen_VMOVSS(DisasContext *s, X86DecodedInsn *decode)
4298{
4299    int vec_len = vector_len(s, decode);
4300
4301    tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
4302    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
4303    tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
4304}
4305
4306static void gen_VMOVSS_ld(DisasContext *s, X86DecodedInsn *decode)
4307{
4308    int vec_len = vector_len(s, decode);
4309
4310    tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
4311    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
4312    tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
4313}
4314
4315static void gen_VMOVSS_st(DisasContext *s, X86DecodedInsn *decode)
4316{
4317    tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
4318    tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
4319}
4320
4321static void gen_VPMASKMOV_st(DisasContext *s, X86DecodedInsn *decode)
4322{
4323    if (s->vex_w) {
4324        gen_VMASKMOVPD_st(s, decode);
4325    } else {
4326        gen_VMASKMOVPS_st(s, decode);
4327    }
4328}
4329
4330static void gen_VPERMD(DisasContext *s, X86DecodedInsn *decode)
4331{
4332    assert(s->vex_l);
4333    gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2);
4334}
4335
4336static void gen_VPERM2x128(DisasContext *s, X86DecodedInsn *decode)
4337{
4338    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
4339    assert(s->vex_l);
4340    gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, imm);
4341}
4342
4343static void gen_VPHMINPOSUW(DisasContext *s, X86DecodedInsn *decode)
4344{
4345    assert(!s->vex_l);
4346    gen_helper_phminposuw_xmm(tcg_env, OP_PTR0, OP_PTR2);
4347}
4348
4349static void gen_VROUNDSD(DisasContext *s, X86DecodedInsn *decode)
4350{
4351    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
4352    assert(!s->vex_l);
4353    gen_helper_roundsd_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
4354}
4355
4356static void gen_VROUNDSS(DisasContext *s, X86DecodedInsn *decode)
4357{
4358    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
4359    assert(!s->vex_l);
4360    gen_helper_roundss_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
4361}
4362
4363static void gen_VSHUF(DisasContext *s, X86DecodedInsn *decode)
4364{
4365    TCGv_i32 imm = tcg_constant_i32(decode->immediate);
4366    SSEFunc_0_pppi ps, pd, fn;
4367    ps = s->vex_l ? gen_helper_shufps_ymm : gen_helper_shufps_xmm;
4368    pd = s->vex_l ? gen_helper_shufpd_ymm : gen_helper_shufpd_xmm;
4369    fn = s->prefix & PREFIX_DATA ? pd : ps;
4370    fn(OP_PTR0, OP_PTR1, OP_PTR2, imm);
4371}
4372
4373static void gen_VUCOMI(DisasContext *s, X86DecodedInsn *decode)
4374{
4375    SSEFunc_0_epp fn;
4376    fn = s->prefix & PREFIX_DATA ? gen_helper_ucomisd : gen_helper_ucomiss;
4377    fn(tcg_env, OP_PTR1, OP_PTR2);
4378    assume_cc_op(s, CC_OP_EFLAGS);
4379}
4380
4381static void gen_VZEROALL(DisasContext *s, X86DecodedInsn *decode)
4382{
4383    TCGv_ptr ptr = tcg_temp_new_ptr();
4384
4385    tcg_gen_addi_ptr(ptr, tcg_env, offsetof(CPUX86State, xmm_regs));
4386    gen_helper_memset(ptr, ptr, tcg_constant_i32(0),
4387                      tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg)));
4388}
4389
4390static void gen_VZEROUPPER(DisasContext *s, X86DecodedInsn *decode)
4391{
4392    int i;
4393
4394    for (i = 0; i < CPU_NB_REGS; i++) {
4395        int offset = offsetof(CPUX86State, xmm_regs[i].ZMM_X(1));
4396        tcg_gen_gvec_dup_imm(MO_64, offset, 16, 16, 0);
4397    }
4398}
4399
4400static void gen_WAIT(DisasContext *s, X86DecodedInsn *decode)
4401{
4402    if ((s->flags & (HF_MP_MASK | HF_TS_MASK)) == (HF_MP_MASK | HF_TS_MASK)) {
4403        gen_NM_exception(s);
4404    } else {
4405        /* needs to be treated as I/O because of ferr_irq */
4406        translator_io_start(&s->base);
4407        gen_helper_fwait(tcg_env);
4408    }
4409}
4410
4411#ifndef CONFIG_USER_ONLY
4412static void gen_WRMSR(DisasContext *s, X86DecodedInsn *decode)
4413{
4414    gen_update_cc_op(s);
4415    gen_update_eip_cur(s);
4416    gen_helper_wrmsr(tcg_env);
4417    s->base.is_jmp = DISAS_EOB_NEXT;
4418}
4419#else
4420#define gen_WRMSR gen_unreachable
4421#endif
4422
4423static void gen_WRxxBASE(DisasContext *s, X86DecodedInsn *decode)
4424{
4425    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
4426
4427    /* Preserve hflags bits by testing CR4 at runtime.  */
4428    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
4429    tcg_gen_mov_tl(base, s->T0);
4430}
4431
4432static void gen_XADD(DisasContext *s, X86DecodedInsn *decode)
4433{
4434    MemOp ot = decode->op[1].ot;
4435
4436    decode->cc_dst = tcg_temp_new();
4437    decode->cc_src = s->T1;
4438    decode->cc_op = CC_OP_ADDB + ot;
4439
4440    if (s->prefix & PREFIX_LOCK) {
4441        tcg_gen_atomic_fetch_add_tl(s->T0, s->A0, s->T1, s->mem_index, ot | MO_LE);
4442        tcg_gen_add_tl(decode->cc_dst, s->T0, s->T1);
4443    } else {
4444        tcg_gen_add_tl(decode->cc_dst, s->T0, s->T1);
4445        /*
4446         * NOTE: writing memory first is important for MMU exceptions,
4447         * but "new result" wins for XADD AX, AX.
4448         */
4449        gen_writeback(s, decode, 0, decode->cc_dst);
4450    }
4451    if (decode->op[0].has_ea || decode->op[2].n != decode->op[0].n) {
4452        gen_writeback(s, decode, 2, s->T0);
4453    }
4454}
4455
4456static void gen_XCHG(DisasContext *s, X86DecodedInsn *decode)
4457{
4458    if (s->prefix & PREFIX_LOCK) {
4459        tcg_gen_atomic_xchg_tl(s->T0, s->A0, s->T1,
4460                               s->mem_index, decode->op[0].ot | MO_LE);
4461        /* now store old value into register operand */
4462        gen_op_mov_reg_v(s, decode->op[2].ot, decode->op[2].n, s->T0);
4463    } else {
4464        /* move destination value into source operand, source preserved in T1 */
4465        gen_op_mov_reg_v(s, decode->op[2].ot, decode->op[2].n, s->T0);
4466        tcg_gen_mov_tl(s->T0, s->T1);
4467    }
4468}
4469
4470static void gen_XLAT(DisasContext *s, X86DecodedInsn *decode)
4471{
4472    /* AL is already zero-extended into s->T0.  */
4473    tcg_gen_add_tl(s->A0, cpu_regs[R_EBX], s->T0);
4474    gen_lea_v_seg(s, s->A0, R_DS, s->override);
4475    gen_op_ld_v(s, MO_8, s->T0, s->A0);
4476}
4477
4478static void gen_XOR(DisasContext *s, X86DecodedInsn *decode)
4479{
4480    /* special case XOR reg, reg */
4481    if (decode->op[1].unit == X86_OP_INT &&
4482        decode->op[2].unit == X86_OP_INT &&
4483        decode->op[1].n == decode->op[2].n) {
4484        tcg_gen_movi_tl(s->T0, 0);
4485        decode->cc_op = CC_OP_CLR;
4486    } else {
4487        MemOp ot = decode->op[1].ot;
4488
4489        if (s->prefix & PREFIX_LOCK) {
4490            tcg_gen_atomic_xor_fetch_tl(s->T0, s->A0, s->T1,
4491                                        s->mem_index, ot | MO_LE);
4492        } else {
4493            tcg_gen_xor_tl(s->T0, s->T0, s->T1);
4494        }
4495        prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
4496    }
4497}
4498
4499static void gen_XRSTOR(DisasContext *s, X86DecodedInsn *decode)
4500{
4501    TCGv_i64 features = tcg_temp_new_i64();
4502
4503    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
4504    gen_helper_xrstor(tcg_env, s->A0, features);
4505    if (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_MPX) {
4506        /*
4507         * XRSTOR is how MPX is enabled, which changes how
4508         * we translate.  Thus we need to end the TB.
4509         */
4510        s->base.is_jmp = DISAS_EOB_NEXT;
4511    }
4512}
4513
4514static void gen_XSAVE(DisasContext *s, X86DecodedInsn *decode)
4515{
4516    TCGv_i64 features = tcg_temp_new_i64();
4517
4518    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
4519    gen_helper_xsave(tcg_env, s->A0, features);
4520}
4521
4522static void gen_XSAVEOPT(DisasContext *s, X86DecodedInsn *decode)
4523{
4524    TCGv_i64 features = tcg_temp_new_i64();
4525
4526    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
4527    gen_helper_xsave(tcg_env, s->A0, features);
4528}
4529