xref: /openbmc/qemu/target/i386/tcg/emit.c.inc (revision e4751d340a49b117b90a411b179b8c892cf43d85)
1/*
2 * New-style TCG opcode generator for i386 instructions
3 *
4 *  Copyright (c) 2022 Red Hat, Inc.
5 *
6 * Author: Paolo Bonzini <pbonzini@redhat.com>
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 */
21
/* Byte offset, within CPUX86State, of the vector register xmm_regs[reg]. */
#define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg])
23
/*
 * Function-pointer types for the helper-call generators used below.
 *
 * The part of the name after the second underscore spells out the
 * parameter list in order:
 *   e = TCGv_ptr env
 *   p = TCGv_ptr to a vector register
 *   i = TCGv_i32
 *   t = TCGv (target-sized value)
 * The part between the underscores is "i" or "l" when the signature has a
 * leading TCGv_i32 / TCGv_i64 "val" parameter, and "0" when it has none.
 * NOTE(review): that leading "val" is presumably the helper's result;
 * confirm against the helper declarations.
 */
typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b);
typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv_ptr reg_c);
typedef void (*SSEFunc_0_epppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                TCGv_ptr reg_c, TCGv_ptr reg_d);
typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv_i32 val);
typedef void (*SSEFunc_0_epppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                TCGv_ptr reg_c, TCGv_i32 val);
typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val);
typedef void (*SSEFunc_0_pppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
                               TCGv_i32 val);
typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                               TCGv val);
typedef void (*SSEFunc_0_epppti)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                 TCGv_ptr reg_c, TCGv a0, TCGv_i32 scale);
typedef void (*SSEFunc_0_eppppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                  TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 flags);
typedef void (*SSEFunc_0_eppppii)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
                                  TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 even,
                                  TCGv_i32 odd);
47
48static inline TCGv_i32 tcg_constant8u_i32(uint8_t val)
49{
50    return tcg_constant_i32(val);
51}
52
53static void gen_NM_exception(DisasContext *s)
54{
55    gen_exception(s, EXCP07_PREX);
56}
57
58static void gen_load_ea(DisasContext *s, AddressParts *mem, bool is_vsib)
59{
60    TCGv ea = gen_lea_modrm_1(s, *mem, is_vsib);
61    gen_lea_v_seg(s, s->aflag, ea, mem->def_seg, s->override);
62}
63
64static inline int mmx_offset(MemOp ot)
65{
66    switch (ot) {
67    case MO_8:
68        return offsetof(MMXReg, MMX_B(0));
69    case MO_16:
70        return offsetof(MMXReg, MMX_W(0));
71    case MO_32:
72        return offsetof(MMXReg, MMX_L(0));
73    case MO_64:
74        return offsetof(MMXReg, MMX_Q(0));
75    default:
76        g_assert_not_reached();
77    }
78}
79
80static inline int xmm_offset(MemOp ot)
81{
82    switch (ot) {
83    case MO_8:
84        return offsetof(ZMMReg, ZMM_B(0));
85    case MO_16:
86        return offsetof(ZMMReg, ZMM_W(0));
87    case MO_32:
88        return offsetof(ZMMReg, ZMM_L(0));
89    case MO_64:
90        return offsetof(ZMMReg, ZMM_Q(0));
91    case MO_128:
92        return offsetof(ZMMReg, ZMM_X(0));
93    case MO_256:
94        return offsetof(ZMMReg, ZMM_Y(0));
95    default:
96        g_assert_not_reached();
97    }
98}
99
100static int vector_reg_offset(X86DecodedOp *op)
101{
102    assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE);
103
104    if (op->unit == X86_OP_MMX) {
105        return op->offset - mmx_offset(op->ot);
106    } else {
107        return op->offset - xmm_offset(op->ot);
108    }
109}
110
111static int vector_elem_offset(X86DecodedOp *op, MemOp ot, int n)
112{
113    int base_ofs = vector_reg_offset(op);
114    switch(ot) {
115    case MO_8:
116        if (op->unit == X86_OP_MMX) {
117            return base_ofs + offsetof(MMXReg, MMX_B(n));
118        } else {
119            return base_ofs + offsetof(ZMMReg, ZMM_B(n));
120        }
121    case MO_16:
122        if (op->unit == X86_OP_MMX) {
123            return base_ofs + offsetof(MMXReg, MMX_W(n));
124        } else {
125            return base_ofs + offsetof(ZMMReg, ZMM_W(n));
126        }
127    case MO_32:
128        if (op->unit == X86_OP_MMX) {
129            return base_ofs + offsetof(MMXReg, MMX_L(n));
130        } else {
131            return base_ofs + offsetof(ZMMReg, ZMM_L(n));
132        }
133    case MO_64:
134        if (op->unit == X86_OP_MMX) {
135            return base_ofs;
136        } else {
137            return base_ofs + offsetof(ZMMReg, ZMM_Q(n));
138        }
139    case MO_128:
140        assert(op->unit == X86_OP_SSE);
141        return base_ofs + offsetof(ZMMReg, ZMM_X(n));
142    case MO_256:
143        assert(op->unit == X86_OP_SSE);
144        return base_ofs + offsetof(ZMMReg, ZMM_Y(n));
145    default:
146        g_assert_not_reached();
147    }
148}
149
150static void compute_mmx_offset(X86DecodedOp *op)
151{
152    if (!op->has_ea) {
153        op->offset = offsetof(CPUX86State, fpregs[op->n].mmx) + mmx_offset(op->ot);
154    } else {
155        op->offset = offsetof(CPUX86State, mmx_t0) + mmx_offset(op->ot);
156    }
157}
158
159static void compute_xmm_offset(X86DecodedOp *op)
160{
161    if (!op->has_ea) {
162        op->offset = ZMM_OFFSET(op->n) + xmm_offset(op->ot);
163    } else {
164        op->offset = offsetof(CPUX86State, xmm_t0) + xmm_offset(op->ot);
165    }
166}
167
168static void gen_load_sse(DisasContext *s, TCGv temp, MemOp ot, int dest_ofs, bool aligned)
169{
170    switch(ot) {
171    case MO_8:
172        gen_op_ld_v(s, MO_8, temp, s->A0);
173        tcg_gen_st8_tl(temp, tcg_env, dest_ofs);
174        break;
175    case MO_16:
176        gen_op_ld_v(s, MO_16, temp, s->A0);
177        tcg_gen_st16_tl(temp, tcg_env, dest_ofs);
178        break;
179    case MO_32:
180        gen_op_ld_v(s, MO_32, temp, s->A0);
181        tcg_gen_st32_tl(temp, tcg_env, dest_ofs);
182        break;
183    case MO_64:
184        gen_ldq_env_A0(s, dest_ofs);
185        break;
186    case MO_128:
187        gen_ldo_env_A0(s, dest_ofs, aligned);
188        break;
189    case MO_256:
190        gen_ldy_env_A0(s, dest_ofs, aligned);
191        break;
192    default:
193        g_assert_not_reached();
194    }
195}
196
197static bool sse_needs_alignment(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
198{
199    switch (decode->e.vex_class) {
200    case 2:
201    case 4:
202        if ((s->prefix & PREFIX_VEX) ||
203            decode->e.vex_special == X86_VEX_SSEUnaligned) {
204            /* MOST legacy SSE instructions require aligned memory operands, but not all.  */
205            return false;
206        }
207        /* fall through */
208    case 1:
209        return ot >= MO_128;
210
211    default:
212        return false;
213    }
214}
215
216static void gen_load(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
217{
218    X86DecodedOp *op = &decode->op[opn];
219
220    switch (op->unit) {
221    case X86_OP_SKIP:
222        return;
223    case X86_OP_SEG:
224        tcg_gen_ld32u_tl(v, tcg_env,
225                         offsetof(CPUX86State,segs[op->n].selector));
226        break;
227    case X86_OP_CR:
228        tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, cr[op->n]));
229        break;
230    case X86_OP_DR:
231        tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, dr[op->n]));
232        break;
233    case X86_OP_INT:
234        if (op->has_ea) {
235            if (v == s->T0 && decode->e.special == X86_SPECIAL_SExtT0) {
236                gen_op_ld_v(s, op->ot | MO_SIGN, v, s->A0);
237            } else {
238                gen_op_ld_v(s, op->ot, v, s->A0);
239            }
240
241        } else if (op->ot == MO_8 && byte_reg_is_xH(s, op->n)) {
242            if (v == s->T0 && decode->e.special == X86_SPECIAL_SExtT0) {
243                tcg_gen_sextract_tl(v, cpu_regs[op->n - 4], 8, 8);
244            } else {
245                tcg_gen_extract_tl(v, cpu_regs[op->n - 4], 8, 8);
246            }
247
248        } else if (op->ot < MO_TL && v == s->T0 &&
249                   (decode->e.special == X86_SPECIAL_SExtT0 ||
250                    decode->e.special == X86_SPECIAL_ZExtT0)) {
251            if (decode->e.special == X86_SPECIAL_SExtT0) {
252                tcg_gen_ext_tl(v, cpu_regs[op->n], op->ot | MO_SIGN);
253            } else {
254                tcg_gen_ext_tl(v, cpu_regs[op->n], op->ot);
255            }
256
257        } else {
258            tcg_gen_mov_tl(v, cpu_regs[op->n]);
259        }
260        break;
261    case X86_OP_IMM:
262        tcg_gen_movi_tl(v, decode->immediate);
263        break;
264
265    case X86_OP_MMX:
266        compute_mmx_offset(op);
267        goto load_vector;
268
269    case X86_OP_SSE:
270        compute_xmm_offset(op);
271    load_vector:
272        if (op->has_ea) {
273            bool aligned = sse_needs_alignment(s, decode, op->ot);
274            gen_load_sse(s, v, op->ot, op->offset, aligned);
275        }
276        break;
277
278    default:
279        g_assert_not_reached();
280    }
281}
282
283static TCGv_ptr op_ptr(X86DecodedInsn *decode, int opn)
284{
285    X86DecodedOp *op = &decode->op[opn];
286    if (op->v_ptr) {
287        return op->v_ptr;
288    }
289    op->v_ptr = tcg_temp_new_ptr();
290
291    /* The temporary points to the MMXReg or ZMMReg.  */
292    tcg_gen_addi_ptr(op->v_ptr, tcg_env, vector_reg_offset(op));
293    return op->v_ptr;
294}
295
/*
 * Shorthands for op_ptr() on operands 0, 1 and 2.  They expand to a
 * reference to a variable named "decode", which must be in scope at the
 * point of use.
 */
#define OP_PTR0 op_ptr(decode, 0)
#define OP_PTR1 op_ptr(decode, 1)
#define OP_PTR2 op_ptr(decode, 2)
299
300static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
301{
302    X86DecodedOp *op = &decode->op[opn];
303    switch (op->unit) {
304    case X86_OP_SKIP:
305        break;
306    case X86_OP_SEG:
307        /* Note that gen_movl_seg_T0 takes care of interrupt shadow and TF.  */
308        gen_movl_seg_T0(s, op->n);
309        break;
310    case X86_OP_INT:
311        if (op->has_ea) {
312            gen_op_st_v(s, op->ot, v, s->A0);
313        } else {
314            gen_op_mov_reg_v(s, op->ot, op->n, v);
315        }
316        break;
317    case X86_OP_MMX:
318        break;
319    case X86_OP_SSE:
320        if (!op->has_ea && (s->prefix & PREFIX_VEX) && op->ot <= MO_128) {
321            tcg_gen_gvec_dup_imm(MO_64,
322                                 offsetof(CPUX86State, xmm_regs[op->n].ZMM_X(1)),
323                                 16, 16, 0);
324        }
325        break;
326    case X86_OP_CR:
327    case X86_OP_DR:
328    default:
329        g_assert_not_reached();
330    }
331}
332
333static inline int vector_len(DisasContext *s, X86DecodedInsn *decode)
334{
335    if (decode->e.special == X86_SPECIAL_MMX &&
336        !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
337        return 8;
338    }
339    return s->vex_l ? 32 : 16;
340}
341
342static void prepare_update1_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op)
343{
344    decode->cc_dst = s->T0;
345    decode->cc_op = op;
346}
347
348static void prepare_update2_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op)
349{
350    decode->cc_src = s->T1;
351    decode->cc_dst = s->T0;
352    decode->cc_op = op;
353}
354
355static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs)
356{
357    MemOp ot = decode->op[0].ot;
358    int vec_len = vector_len(s, decode);
359    bool aligned = sse_needs_alignment(s, decode, ot);
360
361    if (!decode->op[0].has_ea) {
362        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, vec_len, vec_len);
363        return;
364    }
365
366    switch (ot) {
367    case MO_64:
368        gen_stq_env_A0(s, src_ofs);
369        break;
370    case MO_128:
371        gen_sto_env_A0(s, src_ofs, aligned);
372        break;
373    case MO_256:
374        gen_sty_env_A0(s, src_ofs, aligned);
375        break;
376    default:
377        g_assert_not_reached();
378    }
379}
380
381static void gen_helper_pavgusb(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b)
382{
383    gen_helper_pavgb_mmx(env, reg_a, reg_a, reg_b);
384}
385
/*
 * Sentinel stored in fns_3dnow[] instead of a real generator.  gen_3dnow()
 * compares against it and emits a plain 64-bit move from operand 1 to
 * operand 0 rather than calling through the pointer.
 */
#define FN_3DNOW_MOVE ((SSEFunc_0_epp) (uintptr_t) 1)

/*
 * 3DNow! dispatch table.  gen_3dnow() indexes it with the low byte of
 * decode->immediate; entries not listed here are NULL and are treated as
 * illegal opcodes.
 */
static const SSEFunc_0_epp fns_3dnow[] = {
    [0x0c] = gen_helper_pi2fw,
    [0x0d] = gen_helper_pi2fd,
    [0x1c] = gen_helper_pf2iw,
    [0x1d] = gen_helper_pf2id,
    [0x8a] = gen_helper_pfnacc,
    [0x8e] = gen_helper_pfpnacc,
    [0x90] = gen_helper_pfcmpge,
    [0x94] = gen_helper_pfmin,
    [0x96] = gen_helper_pfrcp,
    [0x97] = gen_helper_pfrsqrt,
    [0x9a] = gen_helper_pfsub,
    [0x9e] = gen_helper_pfadd,
    [0xa0] = gen_helper_pfcmpgt,
    [0xa4] = gen_helper_pfmax,
    [0xa6] = FN_3DNOW_MOVE, /* PFRCPIT1; no need to actually increase precision */
    [0xa7] = FN_3DNOW_MOVE, /* PFRSQIT1 */
    [0xb6] = FN_3DNOW_MOVE, /* PFRCPIT2 */
    [0xaa] = gen_helper_pfsubr,
    [0xae] = gen_helper_pfacc,
    [0xb0] = gen_helper_pfcmpeq,
    [0xb4] = gen_helper_pfmul,
    [0xb7] = gen_helper_pmulhrw_mmx,
    [0xbb] = gen_helper_pswapd,
    [0xbf] = gen_helper_pavgusb,
};
413
414static void gen_3dnow(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
415{
416    uint8_t b = decode->immediate;
417    SSEFunc_0_epp fn = b < ARRAY_SIZE(fns_3dnow) ? fns_3dnow[b] : NULL;
418
419    if (!fn) {
420        gen_illegal_opcode(s);
421        return;
422    }
423    if (s->flags & HF_TS_MASK) {
424        gen_NM_exception(s);
425        return;
426    }
427    if (s->flags & HF_EM_MASK) {
428        gen_illegal_opcode(s);
429        return;
430    }
431
432    gen_helper_enter_mmx(tcg_env);
433    if (fn == FN_3DNOW_MOVE) {
434       tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset);
435       tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset);
436    } else {
437       fn(tcg_env, OP_PTR0, OP_PTR1);
438    }
439}
440
441/*
442 * 00 = v*ps Vps, Hps, Wpd
443 * 66 = v*pd Vpd, Hpd, Wps
444 * f3 = v*ss Vss, Hss, Wps
445 * f2 = v*sd Vsd, Hsd, Wps
446 */
447static inline void gen_unary_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
448                              SSEFunc_0_epp pd_xmm, SSEFunc_0_epp ps_xmm,
449                              SSEFunc_0_epp pd_ymm, SSEFunc_0_epp ps_ymm,
450                              SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
451{
452    if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
453        SSEFunc_0_eppp fn = s->prefix & PREFIX_REPZ ? ss : sd;
454        if (!fn) {
455            gen_illegal_opcode(s);
456            return;
457        }
458        fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
459    } else {
460        SSEFunc_0_epp ps, pd, fn;
461        ps = s->vex_l ? ps_ymm : ps_xmm;
462        pd = s->vex_l ? pd_ymm : pd_xmm;
463        fn = s->prefix & PREFIX_DATA ? pd : ps;
464        if (!fn) {
465            gen_illegal_opcode(s);
466            return;
467        }
468        fn(tcg_env, OP_PTR0, OP_PTR2);
469    }
470}
/*
 * Define gen_<uname>() as a call to gen_unary_fp_sse() with the six
 * helpers gen_helper_<lname>{pd,ps}_{xmm,ymm} and gen_helper_<lname>{sd,ss}.
 */
#define UNARY_FP_SSE(uname, lname)                                                 \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    gen_unary_fp_sse(s, env, decode,                                               \
                     gen_helper_##lname##pd_xmm,                                   \
                     gen_helper_##lname##ps_xmm,                                   \
                     gen_helper_##lname##pd_ymm,                                   \
                     gen_helper_##lname##ps_ymm,                                   \
                     gen_helper_##lname##sd,                                       \
                     gen_helper_##lname##ss);                                      \
}
UNARY_FP_SSE(VSQRT, sqrt)
483
484/*
485 * 00 = v*ps Vps, Hps, Wpd
486 * 66 = v*pd Vpd, Hpd, Wps
487 * f3 = v*ss Vss, Hss, Wps
488 * f2 = v*sd Vsd, Hsd, Wps
489 */
490static inline void gen_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
491                              SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
492                              SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm,
493                              SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
494{
495    SSEFunc_0_eppp ps, pd, fn;
496    if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
497        fn = s->prefix & PREFIX_REPZ ? ss : sd;
498    } else {
499        ps = s->vex_l ? ps_ymm : ps_xmm;
500        pd = s->vex_l ? pd_ymm : pd_xmm;
501        fn = s->prefix & PREFIX_DATA ? pd : ps;
502    }
503    if (fn) {
504        fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
505    } else {
506        gen_illegal_opcode(s);
507    }
508}
509
/*
 * Define gen_<uname>() as a call to gen_fp_sse() with the six helpers
 * gen_helper_<lname>{pd,ps}_{xmm,ymm} and gen_helper_<lname>{sd,ss}.
 */
#define FP_SSE(uname, lname)                                                       \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    gen_fp_sse(s, env, decode,                                                     \
               gen_helper_##lname##pd_xmm,                                         \
               gen_helper_##lname##ps_xmm,                                         \
               gen_helper_##lname##pd_ymm,                                         \
               gen_helper_##lname##ps_ymm,                                         \
               gen_helper_##lname##sd,                                             \
               gen_helper_##lname##ss);                                            \
}
FP_SSE(VADD, add)
FP_SSE(VMUL, mul)
FP_SSE(VSUB, sub)
FP_SSE(VMIN, min)
FP_SSE(VDIV, div)
FP_SSE(VMAX, max)
527
/*
 * Define gen_<uname>Px(), the packed form of a fused multiply-add.
 *
 * The helper is chosen by s->vex_w (pd when set, ps otherwise) and
 * s->vex_l (ymm when set, xmm otherwise).  It is called with operand 0 as
 * the first register pointer, followed by ptr0, ptr1 and ptr2 in that
 * order, and two constants: `even` and `even ^ odd`.
 * NOTE(review): the helper presumably applies the first constant to
 * even-numbered elements and toggles by the second for odd-numbered ones;
 * confirm against the fma4p* helper implementation.
 */
#define FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, even, odd)                         \
static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    SSEFunc_0_eppppii xmm = s->vex_w ? gen_helper_fma4pd_xmm : gen_helper_fma4ps_xmm; \
    SSEFunc_0_eppppii ymm = s->vex_w ? gen_helper_fma4pd_ymm : gen_helper_fma4ps_ymm; \
    SSEFunc_0_eppppii fn = s->vex_l ? ymm : xmm;                                   \
                                                                                   \
    fn(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                         \
       tcg_constant_i32(even),                                                     \
       tcg_constant_i32((even) ^ (odd)));                                          \
}
539
/*
 * Define both gen_<uname>Px() (via FMA_SSE_PACKED with even == odd ==
 * flags, so the second constant passed to the helper is 0) and
 * gen_<uname>Sx(), the scalar form, which calls the fma4sd helper when
 * s->vex_w is set and fma4ss otherwise, passing `flags` as its constant.
 */
#define FMA_SSE(uname, ptr0, ptr1, ptr2, flags)                                    \
FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, flags, flags)                              \
static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    SSEFunc_0_eppppi fn = s->vex_w ? gen_helper_fma4sd : gen_helper_fma4ss;        \
                                                                                   \
    fn(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                         \
       tcg_constant_i32(flags));                                                   \
}                                                                                  \

/*
 * The 231/213/132 variants differ only in the order in which the three
 * operand pointers are handed to the helper.
 */
FMA_SSE(VFMADD231,  OP_PTR1, OP_PTR2, OP_PTR0, 0)
FMA_SSE(VFMADD213,  OP_PTR1, OP_PTR0, OP_PTR2, 0)
FMA_SSE(VFMADD132,  OP_PTR0, OP_PTR2, OP_PTR1, 0)

FMA_SSE(VFNMADD231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_product)
FMA_SSE(VFNMADD213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_product)
FMA_SSE(VFNMADD132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_product)

FMA_SSE(VFMSUB231,  OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c)
FMA_SSE(VFMSUB213,  OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c)
FMA_SSE(VFMSUB132,  OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c)

FMA_SSE(VFNMSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c|float_muladd_negate_product)

/* Packed-only forms whose `even` and `odd` flag values differ. */
FMA_SSE_PACKED(VFMADDSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c, 0)

FMA_SSE_PACKED(VFMSUBADD231, OP_PTR1, OP_PTR2, OP_PTR0, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD213, OP_PTR1, OP_PTR0, OP_PTR2, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0, float_muladd_negate_c)
573
/*
 * Define gen_<uname>() for the floating-point unpack instructions by
 * reusing the integer unpack helpers through gen_fp_sse().  The scalar
 * helper slots are NULL, so gen_fp_sse() emits an illegal opcode for the
 * F3/F2-prefixed forms.
 */
#define FP_UNPACK_SSE(uname, lname)                                                \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    /* PS maps to the DQ integer instruction, PD maps to QDQ.  */                  \
    gen_fp_sse(s, env, decode,                                                     \
               gen_helper_##lname##qdq_xmm,                                        \
               gen_helper_##lname##dq_xmm,                                         \
               gen_helper_##lname##qdq_ymm,                                        \
               gen_helper_##lname##dq_ymm,                                         \
               NULL, NULL);                                                        \
}
FP_UNPACK_SSE(VUNPCKLPx, punpckl)
FP_UNPACK_SSE(VUNPCKHPx, punpckh)
587
588/*
589 * 00 = v*ps Vps, Wpd
590 * f3 = v*ss Vss, Wps
591 */
592static inline void gen_unary_fp32_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
593                                      SSEFunc_0_epp ps_xmm,
594                                      SSEFunc_0_epp ps_ymm,
595                                      SSEFunc_0_eppp ss)
596{
597    if ((s->prefix & (PREFIX_DATA | PREFIX_REPNZ)) != 0) {
598        goto illegal_op;
599    } else if (s->prefix & PREFIX_REPZ) {
600        if (!ss) {
601            goto illegal_op;
602        }
603        ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
604    } else {
605        SSEFunc_0_epp fn = s->vex_l ? ps_ymm : ps_xmm;
606        if (!fn) {
607            goto illegal_op;
608        }
609        fn(tcg_env, OP_PTR0, OP_PTR2);
610    }
611    return;
612
613illegal_op:
614    gen_illegal_opcode(s);
615}
/*
 * Define gen_<uname>() as a call to gen_unary_fp32_sse() with the helpers
 * gen_helper_<lname>ps_{xmm,ymm} and gen_helper_<lname>ss.
 */
#define UNARY_FP32_SSE(uname, lname)                                               \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    gen_unary_fp32_sse(s, env, decode,                                             \
                       gen_helper_##lname##ps_xmm,                                 \
                       gen_helper_##lname##ps_ymm,                                 \
                       gen_helper_##lname##ss);                                    \
}
UNARY_FP32_SSE(VRSQRT, rsqrt)
UNARY_FP32_SSE(VRCP, rcp)
626
627/*
628 * 66 = v*pd Vpd, Hpd, Wpd
629 * f2 = v*ps Vps, Hps, Wps
630 */
631static inline void gen_horizontal_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
632                                         SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
633                                         SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm)
634{
635    SSEFunc_0_eppp ps, pd, fn;
636    ps = s->vex_l ? ps_ymm : ps_xmm;
637    pd = s->vex_l ? pd_ymm : pd_xmm;
638    fn = s->prefix & PREFIX_DATA ? pd : ps;
639    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
640}
/*
 * Define gen_<uname>() as a call to gen_horizontal_fp_sse() with the
 * helpers gen_helper_<lname>{pd,ps}_{xmm,ymm}.
 */
#define HORIZONTAL_FP_SSE(uname, lname)                                            \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    gen_horizontal_fp_sse(s, env, decode,                                          \
                          gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm,  \
                          gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \
}
HORIZONTAL_FP_SSE(VHADD, hadd)
HORIZONTAL_FP_SSE(VHSUB, hsub)
HORIZONTAL_FP_SSE(VADDSUB, addsub)
651
652static inline void gen_ternary_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
653                                   int op3, SSEFunc_0_epppp xmm, SSEFunc_0_epppp ymm)
654{
655    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;
656    TCGv_ptr ptr3 = tcg_temp_new_ptr();
657
658    /* The format of the fourth input is Lx */
659    tcg_gen_addi_ptr(ptr3, tcg_env, ZMM_OFFSET(op3));
660    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, ptr3);
661}
/*
 * Define two generators sharing the helpers gen_helper_<lname>_{xmm,ymm}:
 *   gen_<uvname>() takes the fourth register number from bits 7:4 of the
 *                  low byte of decode->immediate;
 *   gen_<uname>()  always uses vector register 0 as the fourth input.
 */
#define TERNARY_SSE(uname, uvname, lname)                                          \
static void gen_##uvname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    gen_ternary_sse(s, env, decode, (uint8_t)decode->immediate >> 4,               \
                    gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);           \
}                                                                                  \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    gen_ternary_sse(s, env, decode, 0,                                             \
                  gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);             \
}
TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps)
TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd)
TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb)
676
677static inline void gen_binary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
678                                      SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm)
679{
680    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
681    if (!s->vex_l) {
682        xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
683    } else {
684        ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
685    }
686}
687
/*
 * Define gen_<uname>() as a call to gen_binary_imm_sse() with the helpers
 * gen_helper_<lname>_{xmm,ymm}.
 */
#define BINARY_IMM_SSE(uname, lname)                                               \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    gen_binary_imm_sse(s, env, decode,                                             \
                       gen_helper_##lname##_xmm,                                   \
                       gen_helper_##lname##_ymm);                                  \
}

BINARY_IMM_SSE(VBLENDPD,   blendpd)
BINARY_IMM_SSE(VBLENDPS,   blendps)
BINARY_IMM_SSE(VPBLENDW,   pblendw)
BINARY_IMM_SSE(VDDPS,      dpps)
/*
 * The dppd instantiation below gets NULL as its 256-bit helper.
 * NOTE(review): presumably no gen_helper_dppd_ymm exists; confirm against
 * the helper declarations.
 */
#define gen_helper_dppd_ymm NULL
BINARY_IMM_SSE(VDDPD,      dppd)
BINARY_IMM_SSE(VMPSADBW,   mpsadbw)
BINARY_IMM_SSE(PCLMULQDQ,  pclmulqdq)
704
705
/*
 * Define gen_<uname>() as a single gvec call
 *   func(<extra args>, op[0].offset, op[2].offset, vec_len, vec_len)
 * where vec_len comes from vector_len() and the extra arguments (the
 * element size here) are passed first.
 */
#define UNARY_INT_GVEC(uname, func, ...)                                           \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    int vec_len = vector_len(s, decode);                                          \
                                                                                   \
    func(__VA_ARGS__, decode->op[0].offset,                                        \
         decode->op[2].offset, vec_len, vec_len);                                  \
}
UNARY_INT_GVEC(PABSB,          tcg_gen_gvec_abs, MO_8)
UNARY_INT_GVEC(PABSW,          tcg_gen_gvec_abs, MO_16)
UNARY_INT_GVEC(PABSD,          tcg_gen_gvec_abs, MO_32)
UNARY_INT_GVEC(VBROADCASTx128, tcg_gen_gvec_dup_mem, MO_128)
UNARY_INT_GVEC(VPBROADCASTB,   tcg_gen_gvec_dup_mem, MO_8)
UNARY_INT_GVEC(VPBROADCASTW,   tcg_gen_gvec_dup_mem, MO_16)
UNARY_INT_GVEC(VPBROADCASTD,   tcg_gen_gvec_dup_mem, MO_32)
UNARY_INT_GVEC(VPBROADCASTQ,   tcg_gen_gvec_dup_mem, MO_64)
722
723
/*
 * Define gen_<uname>() as a single gvec call
 *   func(<extra args>, op[0].offset, op[1].offset, op[2].offset,
 *        vec_len, vec_len)
 * where vec_len comes from vector_len() and the extra arguments (element
 * size, preceded by the condition for the compare operations) are passed
 * first.
 */
#define BINARY_INT_GVEC(uname, func, ...)                                          \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{                                                                                  \
    int vec_len = vector_len(s, decode);                                          \
                                                                                   \
    func(__VA_ARGS__,                                                              \
         decode->op[0].offset, decode->op[1].offset,                               \
         decode->op[2].offset, vec_len, vec_len);                                  \
}

BINARY_INT_GVEC(PADDB,   tcg_gen_gvec_add, MO_8)
BINARY_INT_GVEC(PADDW,   tcg_gen_gvec_add, MO_16)
BINARY_INT_GVEC(PADDD,   tcg_gen_gvec_add, MO_32)
BINARY_INT_GVEC(PADDQ,   tcg_gen_gvec_add, MO_64)
BINARY_INT_GVEC(PADDSB,  tcg_gen_gvec_ssadd, MO_8)
BINARY_INT_GVEC(PADDSW,  tcg_gen_gvec_ssadd, MO_16)
BINARY_INT_GVEC(PADDUSB, tcg_gen_gvec_usadd, MO_8)
BINARY_INT_GVEC(PADDUSW, tcg_gen_gvec_usadd, MO_16)
BINARY_INT_GVEC(PAND,    tcg_gen_gvec_and, MO_64)
BINARY_INT_GVEC(PCMPEQB, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_8)
BINARY_INT_GVEC(PCMPEQD, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_32)
BINARY_INT_GVEC(PCMPEQW, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_16)
BINARY_INT_GVEC(PCMPEQQ, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_64)
BINARY_INT_GVEC(PCMPGTB, tcg_gen_gvec_cmp, TCG_COND_GT, MO_8)
BINARY_INT_GVEC(PCMPGTW, tcg_gen_gvec_cmp, TCG_COND_GT, MO_16)
BINARY_INT_GVEC(PCMPGTD, tcg_gen_gvec_cmp, TCG_COND_GT, MO_32)
BINARY_INT_GVEC(PCMPGTQ, tcg_gen_gvec_cmp, TCG_COND_GT, MO_64)
BINARY_INT_GVEC(PMAXSB,  tcg_gen_gvec_smax, MO_8)
BINARY_INT_GVEC(PMAXSW,  tcg_gen_gvec_smax, MO_16)
BINARY_INT_GVEC(PMAXSD,  tcg_gen_gvec_smax, MO_32)
BINARY_INT_GVEC(PMAXUB,  tcg_gen_gvec_umax, MO_8)
BINARY_INT_GVEC(PMAXUW,  tcg_gen_gvec_umax, MO_16)
BINARY_INT_GVEC(PMAXUD,  tcg_gen_gvec_umax, MO_32)
BINARY_INT_GVEC(PMINSB,  tcg_gen_gvec_smin, MO_8)
BINARY_INT_GVEC(PMINSW,  tcg_gen_gvec_smin, MO_16)
BINARY_INT_GVEC(PMINSD,  tcg_gen_gvec_smin, MO_32)
BINARY_INT_GVEC(PMINUB,  tcg_gen_gvec_umin, MO_8)
BINARY_INT_GVEC(PMINUW,  tcg_gen_gvec_umin, MO_16)
BINARY_INT_GVEC(PMINUD,  tcg_gen_gvec_umin, MO_32)
BINARY_INT_GVEC(PMULLW,  tcg_gen_gvec_mul, MO_16)
BINARY_INT_GVEC(PMULLD,  tcg_gen_gvec_mul, MO_32)
BINARY_INT_GVEC(POR,     tcg_gen_gvec_or, MO_64)
BINARY_INT_GVEC(PSUBB,   tcg_gen_gvec_sub, MO_8)
BINARY_INT_GVEC(PSUBW,   tcg_gen_gvec_sub, MO_16)
BINARY_INT_GVEC(PSUBD,   tcg_gen_gvec_sub, MO_32)
BINARY_INT_GVEC(PSUBQ,   tcg_gen_gvec_sub, MO_64)
BINARY_INT_GVEC(PSUBSB,  tcg_gen_gvec_sssub, MO_8)
BINARY_INT_GVEC(PSUBSW,  tcg_gen_gvec_sssub, MO_16)
BINARY_INT_GVEC(PSUBUSB, tcg_gen_gvec_ussub, MO_8)
BINARY_INT_GVEC(PSUBUSW, tcg_gen_gvec_ussub, MO_16)
BINARY_INT_GVEC(PXOR,    tcg_gen_gvec_xor, MO_64)
775
776
777/*
778 * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
779 * 66 = vp* Vx, Hx, Wx
780 *
781 * These are really the same encoding, because 1) V is the same as P when VEX.V
782 * is not present 2) P and Q are the same as H and W apart from MM/XMM
783 */
784static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
785                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
786{
787    assert(!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
788
789    if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
790        /* VEX encoding is not applicable to MMX instructions.  */
791        gen_illegal_opcode(s);
792        return;
793    }
794    if (!(s->prefix & PREFIX_DATA)) {
795        mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
796    } else if (!s->vex_l) {
797        xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
798    } else {
799        ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
800    }
801}
802
803
804#define BINARY_INT_MMX(uname, lname)                                               \
805static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
806{                                                                                  \
807    gen_binary_int_sse(s, env, decode,                                             \
808                          gen_helper_##lname##_mmx,                                \
809                          gen_helper_##lname##_xmm,                                \
810                          gen_helper_##lname##_ymm);                               \
811}
812BINARY_INT_MMX(PUNPCKLBW,  punpcklbw)
813BINARY_INT_MMX(PUNPCKLWD,  punpcklwd)
814BINARY_INT_MMX(PUNPCKLDQ,  punpckldq)
815BINARY_INT_MMX(PACKSSWB,   packsswb)
816BINARY_INT_MMX(PACKUSWB,   packuswb)
817BINARY_INT_MMX(PUNPCKHBW,  punpckhbw)
818BINARY_INT_MMX(PUNPCKHWD,  punpckhwd)
819BINARY_INT_MMX(PUNPCKHDQ,  punpckhdq)
820BINARY_INT_MMX(PACKSSDW,   packssdw)
821
822BINARY_INT_MMX(PAVGB,   pavgb)
823BINARY_INT_MMX(PAVGW,   pavgw)
824BINARY_INT_MMX(PMADDWD, pmaddwd)
825BINARY_INT_MMX(PMULHUW, pmulhuw)
826BINARY_INT_MMX(PMULHW,  pmulhw)
827BINARY_INT_MMX(PMULUDQ, pmuludq)
828BINARY_INT_MMX(PSADBW,  psadbw)
829
830BINARY_INT_MMX(PSLLW_r, psllw)
831BINARY_INT_MMX(PSLLD_r, pslld)
832BINARY_INT_MMX(PSLLQ_r, psllq)
833BINARY_INT_MMX(PSRLW_r, psrlw)
834BINARY_INT_MMX(PSRLD_r, psrld)
835BINARY_INT_MMX(PSRLQ_r, psrlq)
836BINARY_INT_MMX(PSRAW_r, psraw)
837BINARY_INT_MMX(PSRAD_r, psrad)
838
839BINARY_INT_MMX(PHADDW,    phaddw)
840BINARY_INT_MMX(PHADDSW,   phaddsw)
841BINARY_INT_MMX(PHADDD,    phaddd)
842BINARY_INT_MMX(PHSUBW,    phsubw)
843BINARY_INT_MMX(PHSUBSW,   phsubsw)
844BINARY_INT_MMX(PHSUBD,    phsubd)
845BINARY_INT_MMX(PMADDUBSW, pmaddubsw)
846BINARY_INT_MMX(PSHUFB,    pshufb)
847BINARY_INT_MMX(PSIGNB,    psignb)
848BINARY_INT_MMX(PSIGNW,    psignw)
849BINARY_INT_MMX(PSIGND,    psignd)
850BINARY_INT_MMX(PMULHRSW,  pmulhrsw)
851
852/* Instructions with no MMX equivalent.  */
853#define BINARY_INT_SSE(uname, lname)                                               \
854static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
855{                                                                                  \
856    gen_binary_int_sse(s, env, decode,                                             \
857                          NULL,                                                    \
858                          gen_helper_##lname##_xmm,                                \
859                          gen_helper_##lname##_ymm);                               \
860}
861
862/* Instructions with no MMX equivalent.  */
863BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
864BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
865BINARY_INT_SSE(VPACKUSDW,  packusdw)
866BINARY_INT_SSE(VPERMILPS,  vpermilps)
867BINARY_INT_SSE(VPERMILPD,  vpermilpd)
868BINARY_INT_SSE(VMASKMOVPS, vpmaskmovd)
869BINARY_INT_SSE(VMASKMOVPD, vpmaskmovq)
870
871BINARY_INT_SSE(PMULDQ,    pmuldq)
872
873BINARY_INT_SSE(VAESDEC, aesdec)
874BINARY_INT_SSE(VAESDECLAST, aesdeclast)
875BINARY_INT_SSE(VAESENC, aesenc)
876BINARY_INT_SSE(VAESENCLAST, aesenclast)
877
878#define UNARY_CMP_SSE(uname, lname)                                                \
879static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
880{                                                                                  \
881    if (!s->vex_l) {                                                               \
882        gen_helper_##lname##_xmm(tcg_env, OP_PTR1, OP_PTR2);                       \
883    } else {                                                                       \
884        gen_helper_##lname##_ymm(tcg_env, OP_PTR1, OP_PTR2);                       \
885    }                                                                              \
886    set_cc_op(s, CC_OP_EFLAGS);                                                    \
887}
888UNARY_CMP_SSE(VPTEST,     ptest)
889UNARY_CMP_SSE(VTESTPS,    vtestps)
890UNARY_CMP_SSE(VTESTPD,    vtestpd)
891
892static inline void gen_unary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
893                                     SSEFunc_0_epp xmm, SSEFunc_0_epp ymm)
894{
895    if (!s->vex_l) {
896        xmm(tcg_env, OP_PTR0, OP_PTR2);
897    } else {
898        ymm(tcg_env, OP_PTR0, OP_PTR2);
899    }
900}
901
902#define UNARY_INT_SSE(uname, lname)                                                \
903static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
904{                                                                                  \
905    gen_unary_int_sse(s, env, decode,                                              \
906                      gen_helper_##lname##_xmm,                                    \
907                      gen_helper_##lname##_ymm);                                   \
908}
909
910UNARY_INT_SSE(VPMOVSXBW,    pmovsxbw)
911UNARY_INT_SSE(VPMOVSXBD,    pmovsxbd)
912UNARY_INT_SSE(VPMOVSXBQ,    pmovsxbq)
913UNARY_INT_SSE(VPMOVSXWD,    pmovsxwd)
914UNARY_INT_SSE(VPMOVSXWQ,    pmovsxwq)
915UNARY_INT_SSE(VPMOVSXDQ,    pmovsxdq)
916
917UNARY_INT_SSE(VPMOVZXBW,    pmovzxbw)
918UNARY_INT_SSE(VPMOVZXBD,    pmovzxbd)
919UNARY_INT_SSE(VPMOVZXBQ,    pmovzxbq)
920UNARY_INT_SSE(VPMOVZXWD,    pmovzxwd)
921UNARY_INT_SSE(VPMOVZXWQ,    pmovzxwq)
922UNARY_INT_SSE(VPMOVZXDQ,    pmovzxdq)
923
924UNARY_INT_SSE(VMOVSLDUP,    pmovsldup)
925UNARY_INT_SSE(VMOVSHDUP,    pmovshdup)
926UNARY_INT_SSE(VMOVDDUP,     pmovdldup)
927
928UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
929UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
930UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
931UNARY_INT_SSE(VCVTDQ2PS, cvtdq2ps)
932UNARY_INT_SSE(VCVTPS2DQ, cvtps2dq)
933UNARY_INT_SSE(VCVTTPS2DQ, cvttps2dq)
934UNARY_INT_SSE(VCVTPH2PS, cvtph2ps)
935
936
937static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
938                                     SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
939{
940    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
941    if (!s->vex_l) {
942        xmm(OP_PTR0, OP_PTR1, imm);
943    } else {
944        ymm(OP_PTR0, OP_PTR1, imm);
945    }
946}
947
948#define UNARY_IMM_SSE(uname, lname)                                                \
949static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
950{                                                                                  \
951    gen_unary_imm_sse(s, env, decode,                                              \
952                      gen_helper_##lname##_xmm,                                    \
953                      gen_helper_##lname##_ymm);                                   \
954}
955
956UNARY_IMM_SSE(PSHUFD,     pshufd)
957UNARY_IMM_SSE(PSHUFHW,    pshufhw)
958UNARY_IMM_SSE(PSHUFLW,    pshuflw)
959#define gen_helper_vpermq_xmm NULL
960UNARY_IMM_SSE(VPERMQ,      vpermq)
961UNARY_IMM_SSE(VPERMILPS_i, vpermilps_imm)
962UNARY_IMM_SSE(VPERMILPD_i, vpermilpd_imm)
963
964static inline void gen_unary_imm_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
965                                        SSEFunc_0_eppi xmm, SSEFunc_0_eppi ymm)
966{
967    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
968    if (!s->vex_l) {
969        xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
970    } else {
971        ymm(tcg_env, OP_PTR0, OP_PTR1, imm);
972    }
973}
974
975#define UNARY_IMM_FP_SSE(uname, lname)                                             \
976static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
977{                                                                                  \
978    gen_unary_imm_fp_sse(s, env, decode,                                           \
979                      gen_helper_##lname##_xmm,                                    \
980                      gen_helper_##lname##_ymm);                                   \
981}
982
983UNARY_IMM_FP_SSE(VROUNDPS,    roundps)
984UNARY_IMM_FP_SSE(VROUNDPD,    roundpd)
985
986static inline void gen_vexw_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
987                                SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm,
988                                SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm)
989{
990    SSEFunc_0_eppp d = s->vex_l ? d_ymm : d_xmm;
991    SSEFunc_0_eppp q = s->vex_l ? q_ymm : q_xmm;
992    SSEFunc_0_eppp fn = s->vex_w ? q : d;
993    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
994}
995
996/* VEX.W affects whether to operate on 32- or 64-bit elements.  */
997#define VEXW_AVX(uname, lname)                                                     \
998static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
999{                                                                                  \
1000    gen_vexw_avx(s, env, decode,                                                   \
1001                 gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
1002                 gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
1003}
1004VEXW_AVX(VPSLLV,    vpsllv)
1005VEXW_AVX(VPSRLV,    vpsrlv)
1006VEXW_AVX(VPSRAV,    vpsrav)
1007VEXW_AVX(VPMASKMOV, vpmaskmov)
1008
1009/* Same as above, but with extra arguments to the helper.  */
1010static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
1011                                SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
1012                                SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
1013{
1014    SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm;
1015    SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm;
1016    SSEFunc_0_epppti fn = s->vex_w ? q : d;
1017    TCGv_i32 scale = tcg_constant_i32(decode->mem.scale);
1018    TCGv_ptr index = tcg_temp_new_ptr();
1019
1020    /* Pass third input as (index, base, scale) */
1021    tcg_gen_addi_ptr(index, tcg_env, ZMM_OFFSET(decode->mem.index));
1022    fn(tcg_env, OP_PTR0, OP_PTR1, index, s->A0, scale);
1023
1024    /*
1025     * There are two output operands, so zero OP1's high 128 bits
1026     * in the VEX.128 case.
1027     */
1028    if (!s->vex_l) {
1029        int ymmh_ofs = vector_elem_offset(&decode->op[1], MO_128, 1);
1030        tcg_gen_gvec_dup_imm(MO_64, ymmh_ofs, 16, 16, 0);
1031    }
1032}
1033#define VSIB_AVX(uname, lname)                                                     \
1034static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
1035{                                                                                  \
1036    gen_vsib_avx(s, env, decode,                                                   \
1037                 gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
1038                 gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
1039}
1040VSIB_AVX(VPGATHERD, vpgatherd)
1041VSIB_AVX(VPGATHERQ, vpgatherq)
1042
1043/* ADCX/ADOX do not have memory operands and can use set_cc_op.  */
1044static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
1045{
1046    int opposite_cc_op;
1047    TCGv carry_in = NULL;
1048    TCGv carry_out = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2);
1049    TCGv zero;
1050
1051    if (cc_op == s->cc_op || s->cc_op == CC_OP_ADCOX) {
1052        /* Re-use the carry-out from a previous round.  */
1053        carry_in = carry_out;
1054    } else {
1055        /* We don't have a carry-in, get it out of EFLAGS.  */
1056        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
1057            gen_compute_eflags(s);
1058        }
1059        carry_in = s->tmp0;
1060        tcg_gen_extract_tl(carry_in, cpu_cc_src,
1061            ctz32(cc_op == CC_OP_ADCX ? CC_C : CC_O), 1);
1062    }
1063
1064    switch (ot) {
1065#ifdef TARGET_X86_64
1066    case MO_32:
1067        /* If TL is 64-bit just do everything in 64-bit arithmetic.  */
1068        tcg_gen_ext32u_tl(s->T0, s->T0);
1069        tcg_gen_ext32u_tl(s->T1, s->T1);
1070        tcg_gen_add_i64(s->T0, s->T0, s->T1);
1071        tcg_gen_add_i64(s->T0, s->T0, carry_in);
1072        tcg_gen_shri_i64(carry_out, s->T0, 32);
1073        break;
1074#endif
1075    default:
1076        zero = tcg_constant_tl(0);
1077        tcg_gen_add2_tl(s->T0, carry_out, s->T0, zero, carry_in, zero);
1078        tcg_gen_add2_tl(s->T0, carry_out, s->T0, carry_out, s->T1, zero);
1079        break;
1080    }
1081
1082    opposite_cc_op = cc_op == CC_OP_ADCX ? CC_OP_ADOX : CC_OP_ADCX;
1083    if (s->cc_op == CC_OP_ADCOX || s->cc_op == opposite_cc_op) {
1084        /* Merge with the carry-out from the opposite instruction.  */
1085        set_cc_op(s, CC_OP_ADCOX);
1086    } else {
1087        set_cc_op(s, cc_op);
1088    }
1089}
1090
1091static void gen_ADCX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1092{
1093    gen_ADCOX(s, env, decode->op[0].ot, CC_OP_ADCX);
1094}
1095
1096static void gen_ADOX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1097{
1098    gen_ADCOX(s, env, decode->op[0].ot, CC_OP_ADOX);
1099}
1100
1101static void gen_ANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1102{
1103    MemOp ot = decode->op[0].ot;
1104
1105    tcg_gen_andc_tl(s->T0, s->T1, s->T0);
1106    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
1107}
1108
1109static void gen_BEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1110{
1111    MemOp ot = decode->op[0].ot;
1112    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
1113    TCGv zero = tcg_constant_tl(0);
1114    TCGv mone = tcg_constant_tl(-1);
1115
1116    /*
1117     * Extract START, and shift the operand.
1118     * Shifts larger than operand size get zeros.
1119     */
1120    tcg_gen_ext8u_tl(s->A0, s->T1);
1121    tcg_gen_shr_tl(s->T0, s->T0, s->A0);
1122
1123    tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound, s->T0, zero);
1124
1125    /*
1126     * Extract the LEN into an inverse mask.  Lengths larger than
1127     * operand size get all zeros, length 0 gets all ones.
1128     */
1129    tcg_gen_extract_tl(s->A0, s->T1, 8, 8);
1130    tcg_gen_shl_tl(s->T1, mone, s->A0);
1131    tcg_gen_movcond_tl(TCG_COND_LEU, s->T1, s->A0, bound, s->T1, zero);
1132    tcg_gen_andc_tl(s->T0, s->T0, s->T1);
1133
1134    prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
1135}
1136
1137/* BLSI do not have memory operands and can use set_cc_op.  */
1138static void gen_BLSI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1139{
1140    MemOp ot = decode->op[0].ot;
1141
1142    tcg_gen_mov_tl(cpu_cc_src, s->T0);
1143    tcg_gen_neg_tl(s->T1, s->T0);
1144    tcg_gen_and_tl(s->T0, s->T0, s->T1);
1145    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
1146    set_cc_op(s, CC_OP_BMILGB + ot);
1147}
1148
1149/* BLSMSK do not have memory operands and can use set_cc_op.  */
1150static void gen_BLSMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1151{
1152    MemOp ot = decode->op[0].ot;
1153
1154    tcg_gen_mov_tl(cpu_cc_src, s->T0);
1155    tcg_gen_subi_tl(s->T1, s->T0, 1);
1156    tcg_gen_xor_tl(s->T0, s->T0, s->T1);
1157    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
1158    set_cc_op(s, CC_OP_BMILGB + ot);
1159}
1160
1161/* BLSR do not have memory operands and can use set_cc_op.  */
1162static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1163{
1164    MemOp ot = decode->op[0].ot;
1165
1166    tcg_gen_mov_tl(cpu_cc_src, s->T0);
1167    tcg_gen_subi_tl(s->T1, s->T0, 1);
1168    tcg_gen_and_tl(s->T0, s->T0, s->T1);
1169    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
1170    set_cc_op(s, CC_OP_BMILGB + ot);
1171}
1172
1173static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1174{
1175    MemOp ot = decode->op[0].ot;
1176    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
1177    TCGv zero = tcg_constant_tl(0);
1178    TCGv mone = tcg_constant_tl(-1);
1179
1180    tcg_gen_ext8u_tl(s->T1, s->T1);
1181
1182    tcg_gen_shl_tl(s->A0, mone, s->T1);
1183    tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->T1, bound, s->A0, zero);
1184    tcg_gen_andc_tl(s->T0, s->T0, s->A0);
1185    /*
1186     * Note that since we're using BMILG (in order to get O
1187     * cleared) we need to store the inverse into C.
1188     */
1189    tcg_gen_setcond_tl(TCG_COND_LEU, s->T1, s->T1, bound);
1190    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
1191}
1192
1193static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1194{
1195    TCGLabel *label_top = gen_new_label();
1196    TCGLabel *label_bottom = gen_new_label();
1197    TCGv oldv = tcg_temp_new();
1198    TCGv newv = tcg_temp_new();
1199    TCGv cmpv = tcg_temp_new();
1200    TCGCond cond;
1201
1202    TCGv cmp_lhs, cmp_rhs;
1203    MemOp ot, ot_full;
1204
1205    int jcc_op = (decode->b >> 1) & 7;
1206    static const TCGCond cond_table[8] = {
1207        [JCC_O] = TCG_COND_LT,  /* test sign bit by comparing against 0 */
1208        [JCC_B] = TCG_COND_LTU,
1209        [JCC_Z] = TCG_COND_EQ,
1210        [JCC_BE] = TCG_COND_LEU,
1211        [JCC_S] = TCG_COND_LT,  /* test sign bit by comparing against 0 */
1212        [JCC_P] = TCG_COND_EQ,  /* even parity - tests low bit of popcount */
1213        [JCC_L] = TCG_COND_LT,
1214        [JCC_LE] = TCG_COND_LE,
1215    };
1216
1217    cond = cond_table[jcc_op];
1218    if (decode->b & 1) {
1219        cond = tcg_invert_cond(cond);
1220    }
1221
1222    ot = decode->op[0].ot;
1223    ot_full = ot | MO_LE;
1224    if (jcc_op >= JCC_S) {
1225        /*
1226         * Sign-extend values before subtracting for S, P (zero/sign extension
1227         * does not matter there) L, LE and their inverses.
1228         */
1229        ot_full |= MO_SIGN;
1230    }
1231
1232    /*
1233     * cmpv will be moved to cc_src *after* cpu_regs[] is written back, so use
1234     * tcg_gen_ext_tl instead of gen_ext_tl.
1235     */
1236    tcg_gen_ext_tl(cmpv, cpu_regs[decode->op[1].n], ot_full);
1237
1238    /*
1239     * Cmpxchg loop starts here.
1240     * - s->T1: addition operand (from decoder)
1241     * - s->A0: dest address (from decoder)
1242     * - s->cc_srcT: memory operand (lhs for comparison)
1243     * - cmpv: rhs for comparison
1244     */
1245    gen_set_label(label_top);
1246    gen_op_ld_v(s, ot_full, s->cc_srcT, s->A0);
1247    tcg_gen_sub_tl(s->T0, s->cc_srcT, cmpv);
1248
1249    /* Compute the comparison result by hand, to avoid clobbering cc_*.  */
1250    switch (jcc_op) {
1251    case JCC_O:
1252        /* (src1 ^ src2) & (src1 ^ dst). newv is only used here for a moment */
1253        tcg_gen_xor_tl(newv, s->cc_srcT, s->T0);
1254        tcg_gen_xor_tl(s->tmp0, s->cc_srcT, cmpv);
1255        tcg_gen_and_tl(s->tmp0, s->tmp0, newv);
1256        tcg_gen_sextract_tl(s->tmp0, s->tmp0, 0, 8 << ot);
1257        cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0);
1258        break;
1259
1260    case JCC_P:
1261        tcg_gen_ext8u_tl(s->tmp0, s->T0);
1262        tcg_gen_ctpop_tl(s->tmp0, s->tmp0);
1263        tcg_gen_andi_tl(s->tmp0, s->tmp0, 1);
1264        cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0);
1265        break;
1266
1267    case JCC_S:
1268        tcg_gen_sextract_tl(s->tmp0, s->T0, 0, 8 << ot);
1269        cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0);
1270        break;
1271
1272    default:
1273        cmp_lhs = s->cc_srcT, cmp_rhs = cmpv;
1274        break;
1275    }
1276
1277    /* Compute new value: if condition does not hold, just store back s->cc_srcT */
1278    tcg_gen_add_tl(newv, s->cc_srcT, s->T1);
1279    tcg_gen_movcond_tl(cond, newv, cmp_lhs, cmp_rhs, newv, s->cc_srcT);
1280    tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, s->cc_srcT, newv, s->mem_index, ot_full);
1281
1282    /* Exit unconditionally if cmpxchg succeeded.  */
1283    tcg_gen_brcond_tl(TCG_COND_EQ, oldv, s->cc_srcT, label_bottom);
1284
1285    /* Try again if there was actually a store to make.  */
1286    tcg_gen_brcond_tl(cond, cmp_lhs, cmp_rhs, label_top);
1287    gen_set_label(label_bottom);
1288
1289    /* Store old value to registers only after a successful store.  */
1290    gen_writeback(s, decode, 1, s->cc_srcT);
1291
1292    decode->cc_dst = s->T0;
1293    decode->cc_src = cmpv;
1294    decode->cc_op = CC_OP_SUBB + ot;
1295}
1296
1297static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1298{
1299    MemOp ot = decode->op[2].ot;
1300
1301    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
1302    gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, tcg_constant_i32(8 << ot));
1303}
1304
1305static void gen_CVTPI2Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1306{
1307    gen_helper_enter_mmx(tcg_env);
1308    if (s->prefix & PREFIX_DATA) {
1309        gen_helper_cvtpi2pd(tcg_env, OP_PTR0, OP_PTR2);
1310    } else {
1311        gen_helper_cvtpi2ps(tcg_env, OP_PTR0, OP_PTR2);
1312    }
1313}
1314
1315static void gen_CVTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1316{
1317    gen_helper_enter_mmx(tcg_env);
1318    if (s->prefix & PREFIX_DATA) {
1319        gen_helper_cvtpd2pi(tcg_env, OP_PTR0, OP_PTR2);
1320    } else {
1321        gen_helper_cvtps2pi(tcg_env, OP_PTR0, OP_PTR2);
1322    }
1323}
1324
1325static void gen_CVTTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1326{
1327    gen_helper_enter_mmx(tcg_env);
1328    if (s->prefix & PREFIX_DATA) {
1329        gen_helper_cvttpd2pi(tcg_env, OP_PTR0, OP_PTR2);
1330    } else {
1331        gen_helper_cvttps2pi(tcg_env, OP_PTR0, OP_PTR2);
1332    }
1333}
1334
1335static void gen_EMMS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1336{
1337    gen_helper_emms(tcg_env);
1338}
1339
1340static void gen_EXTRQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1341{
1342    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
1343    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
1344
1345    gen_helper_extrq_i(tcg_env, OP_PTR0, index, length);
1346}
1347
1348static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1349{
1350    gen_helper_extrq_r(tcg_env, OP_PTR0, OP_PTR2);
1351}
1352
1353static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1354{
1355    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
1356    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
1357
1358    gen_helper_insertq_i(tcg_env, OP_PTR0, OP_PTR1, index, length);
1359}
1360
1361static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1362{
1363    gen_helper_insertq_r(tcg_env, OP_PTR0, OP_PTR2);
1364}
1365
1366static void gen_LDMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1367{
1368    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T1);
1369    gen_helper_ldmxcsr(tcg_env, s->tmp2_i32);
1370}
1371
1372static void gen_MASKMOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1373{
1374    gen_lea_v_seg(s, s->aflag, cpu_regs[R_EDI], R_DS, s->override);
1375
1376    if (s->prefix & PREFIX_DATA) {
1377        gen_helper_maskmov_xmm(tcg_env, OP_PTR1, OP_PTR2, s->A0);
1378    } else {
1379        gen_helper_maskmov_mmx(tcg_env, OP_PTR1, OP_PTR2, s->A0);
1380    }
1381}
1382
1383static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1384{
1385    MemOp ot = decode->op[0].ot;
1386
1387    /* M operand type does not load/store */
1388    if (decode->e.op0 == X86_TYPE_M) {
1389        tcg_gen_qemu_st_tl(s->T0, s->A0, s->mem_index, ot | MO_BE);
1390    } else {
1391        tcg_gen_qemu_ld_tl(s->T0, s->A0, s->mem_index, ot | MO_BE);
1392    }
1393}
1394
1395static void gen_MOVD_from(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1396{
1397    MemOp ot = decode->op[2].ot;
1398
1399    switch (ot) {
1400    case MO_32:
1401#ifdef TARGET_X86_64
1402        tcg_gen_ld32u_tl(s->T0, tcg_env, decode->op[2].offset);
1403        break;
1404    case MO_64:
1405#endif
1406        tcg_gen_ld_tl(s->T0, tcg_env, decode->op[2].offset);
1407        break;
1408    default:
1409        abort();
1410    }
1411}
1412
1413static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1414{
1415    MemOp ot = decode->op[2].ot;
1416    int vec_len = vector_len(s, decode);
1417    int lo_ofs = vector_elem_offset(&decode->op[0], ot, 0);
1418
1419    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1420
1421    switch (ot) {
1422    case MO_32:
1423#ifdef TARGET_X86_64
1424        tcg_gen_st32_tl(s->T1, tcg_env, lo_ofs);
1425        break;
1426    case MO_64:
1427#endif
1428        tcg_gen_st_tl(s->T1, tcg_env, lo_ofs);
1429        break;
1430    default:
1431        g_assert_not_reached();
1432    }
1433}
1434
1435static void gen_MOVDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1436{
1437    gen_store_sse(s, decode, decode->op[2].offset);
1438}
1439
1440static void gen_MOVMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1441{
1442    typeof(gen_helper_movmskps_ymm) *ps, *pd, *fn;
1443    ps = s->vex_l ? gen_helper_movmskps_ymm : gen_helper_movmskps_xmm;
1444    pd = s->vex_l ? gen_helper_movmskpd_ymm : gen_helper_movmskpd_xmm;
1445    fn = s->prefix & PREFIX_DATA ? pd : ps;
1446    fn(s->tmp2_i32, tcg_env, OP_PTR2);
1447    tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
1448}
1449
1450static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1451{
1452    int vec_len = vector_len(s, decode);
1453    int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);
1454
1455    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);
1456    if (decode->op[0].has_ea) {
1457        tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
1458    } else {
1459        /*
1460         * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_64) would
1461         * seem to work, but it does not on big-endian platforms; the cleared parts
1462         * are always at higher addresses, but cross-endian emulation inverts the
1463         * byte order so that the cleared parts need to be at *lower* addresses.
1464         * Because oprsz is 8, we see this here even for SSE; but more in general,
1465         * it disqualifies using oprsz < maxsz to emulate VEX128.
1466         */
1467        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1468        tcg_gen_st_i64(s->tmp1_i64, tcg_env, lo_ofs);
1469    }
1470}
1471
1472static void gen_MOVq_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1473{
1474    gen_helper_enter_mmx(tcg_env);
1475    /* Otherwise the same as any other movq.  */
1476    return gen_MOVQ(s, env, decode);
1477}
1478
1479static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1480{
1481    MemOp ot = decode->op[0].ot;
1482
1483    /* low part of result in VEX.vvvv, high in MODRM */
1484    switch (ot) {
1485    case MO_32:
1486#ifdef TARGET_X86_64
1487        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
1488        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
1489        tcg_gen_mulu2_i32(s->tmp2_i32, s->tmp3_i32,
1490                          s->tmp2_i32, s->tmp3_i32);
1491        tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], s->tmp2_i32);
1492        tcg_gen_extu_i32_tl(s->T0, s->tmp3_i32);
1493        break;
1494
1495    case MO_64:
1496#endif
1497        tcg_gen_mulu2_tl(cpu_regs[s->vex_v], s->T0, s->T0, s->T1);
1498        break;
1499
1500    default:
1501        g_assert_not_reached();
1502    }
1503}
1504
1505static void gen_PALIGNR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1506{
1507    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1508    if (!(s->prefix & PREFIX_DATA)) {
1509        gen_helper_palignr_mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
1510    } else if (!s->vex_l) {
1511        gen_helper_palignr_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
1512    } else {
1513        gen_helper_palignr_ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
1514    }
1515}
1516
1517static void gen_PANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1518{
1519    int vec_len = vector_len(s, decode);
1520
1521    /* Careful, operand order is reversed!  */
1522    tcg_gen_gvec_andc(MO_64,
1523                      decode->op[0].offset, decode->op[2].offset,
1524                      decode->op[1].offset, vec_len, vec_len);
1525}
1526
1527static void gen_PCMPESTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1528{
1529    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1530    gen_helper_pcmpestri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
1531    set_cc_op(s, CC_OP_EFLAGS);
1532}
1533
1534static void gen_PCMPESTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1535{
1536    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1537    gen_helper_pcmpestrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
1538    set_cc_op(s, CC_OP_EFLAGS);
1539    if ((s->prefix & PREFIX_VEX) && !s->vex_l) {
1540        tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
1541                             16, 16, 0);
1542    }
1543}
1544
1545static void gen_PCMPISTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1546{
1547    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1548    gen_helper_pcmpistri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
1549    set_cc_op(s, CC_OP_EFLAGS);
1550}
1551
1552static void gen_PCMPISTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1553{
1554    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1555    gen_helper_pcmpistrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
1556    set_cc_op(s, CC_OP_EFLAGS);
1557    if ((s->prefix & PREFIX_VEX) && !s->vex_l) {
1558        tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
1559                             16, 16, 0);
1560    }
1561}
1562
1563static void gen_PDEP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1564{
1565    gen_helper_pdep(s->T0, s->T0, s->T1);
1566}
1567
1568static void gen_PEXT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1569{
1570    gen_helper_pext(s->T0, s->T0, s->T1);
1571}
1572
1573static inline void gen_pextr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
1574{
1575    int vec_len = vector_len(s, decode);
1576    int mask = (vec_len >> ot) - 1;
1577    int val = decode->immediate & mask;
1578
1579    switch (ot) {
1580    case MO_8:
1581        tcg_gen_ld8u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
1582        break;
1583    case MO_16:
1584        tcg_gen_ld16u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
1585        break;
1586    case MO_32:
1587#ifdef TARGET_X86_64
1588        tcg_gen_ld32u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
1589        break;
1590    case MO_64:
1591#endif
1592        tcg_gen_ld_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
1593        break;
1594    default:
1595        abort();
1596    }
1597}
1598
1599static void gen_PEXTRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1600{
1601    gen_pextr(s, env, decode, MO_8);
1602}
1603
1604static void gen_PEXTRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1605{
1606    gen_pextr(s, env, decode, MO_16);
1607}
1608
1609static void gen_PEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1610{
1611    MemOp ot = decode->op[0].ot;
1612    gen_pextr(s, env, decode, ot);
1613}
1614
1615static inline void gen_pinsr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
1616{
1617    int vec_len = vector_len(s, decode);
1618    int mask = (vec_len >> ot) - 1;
1619    int val = decode->immediate & mask;
1620
1621    if (decode->op[1].offset != decode->op[0].offset) {
1622        assert(vec_len == 16);
1623        gen_store_sse(s, decode, decode->op[1].offset);
1624    }
1625
1626    switch (ot) {
1627    case MO_8:
1628        tcg_gen_st8_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
1629        break;
1630    case MO_16:
1631        tcg_gen_st16_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
1632        break;
1633    case MO_32:
1634#ifdef TARGET_X86_64
1635        tcg_gen_st32_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
1636        break;
1637    case MO_64:
1638#endif
1639        tcg_gen_st_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
1640        break;
1641    default:
1642        abort();
1643    }
1644}
1645
1646static void gen_PINSRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1647{
1648    gen_pinsr(s, env, decode, MO_8);
1649}
1650
1651static void gen_PINSRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1652{
1653    gen_pinsr(s, env, decode, MO_16);
1654}
1655
1656static void gen_PINSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1657{
1658    gen_pinsr(s, env, decode, decode->op[2].ot);
1659}
1660
1661static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s)
1662{
1663    TCGv_i64 t = tcg_temp_new_i64();
1664
1665    tcg_gen_andi_i64(d, s, 0x8080808080808080ull);
1666
1667    /*
1668     * After each shift+or pair:
1669     * 0:  a.......b.......c.......d.......e.......f.......g.......h.......
1670     * 7:  ab......bc......cd......de......ef......fg......gh......h.......
1671     * 14: abcd....bcde....cdef....defg....efgh....fgh.....gh......h.......
1672     * 28: abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h.......
1673     * The result is left in the high bits of the word.
1674     */
1675    tcg_gen_shli_i64(t, d, 7);
1676    tcg_gen_or_i64(d, d, t);
1677    tcg_gen_shli_i64(t, d, 14);
1678    tcg_gen_or_i64(d, d, t);
1679    tcg_gen_shli_i64(t, d, 28);
1680    tcg_gen_or_i64(d, d, t);
1681}
1682
1683static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s)
1684{
1685    TCGv_vec t = tcg_temp_new_vec_matching(d);
1686    TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80);
1687
1688    /* See above */
1689    tcg_gen_and_vec(vece, d, s, m);
1690    tcg_gen_shli_vec(vece, t, d, 7);
1691    tcg_gen_or_vec(vece, d, d, t);
1692    tcg_gen_shli_vec(vece, t, d, 14);
1693    tcg_gen_or_vec(vece, d, d, t);
1694    tcg_gen_shli_vec(vece, t, d, 28);
1695    tcg_gen_or_vec(vece, d, d, t);
1696}
1697
/*
 * Alias the extract2 availability flag for "_tl" operations to the i64
 * flag on x86-64 builds and to the i32 flag otherwise.
 */
#if defined(TARGET_X86_64)
#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i64
#else
#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i32
#endif
1703
1704static void gen_PMOVMSKB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1705{
1706    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
1707    static const GVecGen2 g = {
1708        .fni8 = gen_pmovmskb_i64,
1709        .fniv = gen_pmovmskb_vec,
1710        .opt_opc = vecop_list,
1711        .vece = MO_64,
1712        .prefer_i64 = TCG_TARGET_REG_BITS == 64
1713    };
1714    MemOp ot = decode->op[2].ot;
1715    int vec_len = vector_len(s, decode);
1716    TCGv t = tcg_temp_new();
1717
1718    tcg_gen_gvec_2(offsetof(CPUX86State, xmm_t0) + xmm_offset(ot), decode->op[2].offset,
1719                   vec_len, vec_len, &g);
1720    tcg_gen_ld8u_tl(s->T0, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
1721    while (vec_len > 8) {
1722        vec_len -= 8;
1723        if (TCG_TARGET_HAS_extract2_tl) {
1724            /*
1725             * Load the next byte of the result into the high byte of T.
1726             * TCG does a similar expansion of deposit to shl+extract2; by
1727             * loading the whole word, the shift left is avoided.
1728             */
1729#ifdef TARGET_X86_64
1730            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_Q((vec_len - 1) / 8)));
1731#else
1732            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L((vec_len - 1) / 4)));
1733#endif
1734
1735            tcg_gen_extract2_tl(s->T0, t, s->T0, TARGET_LONG_BITS - 8);
1736        } else {
1737            /*
1738             * The _previous_ value is deposited into bits 8 and higher of t.  Because
1739             * those bits are known to be zero after ld8u, this becomes a shift+or
1740             * if deposit is not available.
1741             */
1742            tcg_gen_ld8u_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
1743            tcg_gen_deposit_tl(s->T0, t, s->T0, 8, TARGET_LONG_BITS - 8);
1744        }
1745    }
1746}
1747
1748static void gen_PSHUFW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1749{
1750    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
1751    gen_helper_pshufw_mmx(OP_PTR0, OP_PTR1, imm);
1752}
1753
1754static void gen_PSRLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1755{
1756    int vec_len = vector_len(s, decode);
1757
1758    if (decode->immediate >= 16) {
1759        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1760    } else {
1761        tcg_gen_gvec_shri(MO_16,
1762                          decode->op[0].offset, decode->op[1].offset,
1763                          decode->immediate, vec_len, vec_len);
1764    }
1765}
1766
1767static void gen_PSLLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1768{
1769    int vec_len = vector_len(s, decode);
1770
1771    if (decode->immediate >= 16) {
1772        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1773    } else {
1774        tcg_gen_gvec_shli(MO_16,
1775                          decode->op[0].offset, decode->op[1].offset,
1776                          decode->immediate, vec_len, vec_len);
1777    }
1778}
1779
1780static void gen_PSRAW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1781{
1782    int vec_len = vector_len(s, decode);
1783
1784    if (decode->immediate >= 16) {
1785        decode->immediate = 15;
1786    }
1787    tcg_gen_gvec_sari(MO_16,
1788                      decode->op[0].offset, decode->op[1].offset,
1789                      decode->immediate, vec_len, vec_len);
1790}
1791
1792static void gen_PSRLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1793{
1794    int vec_len = vector_len(s, decode);
1795
1796    if (decode->immediate >= 32) {
1797        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1798    } else {
1799        tcg_gen_gvec_shri(MO_32,
1800                          decode->op[0].offset, decode->op[1].offset,
1801                          decode->immediate, vec_len, vec_len);
1802    }
1803}
1804
1805static void gen_PSLLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1806{
1807    int vec_len = vector_len(s, decode);
1808
1809    if (decode->immediate >= 32) {
1810        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1811    } else {
1812        tcg_gen_gvec_shli(MO_32,
1813                          decode->op[0].offset, decode->op[1].offset,
1814                          decode->immediate, vec_len, vec_len);
1815    }
1816}
1817
1818static void gen_PSRAD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1819{
1820    int vec_len = vector_len(s, decode);
1821
1822    if (decode->immediate >= 32) {
1823        decode->immediate = 31;
1824    }
1825    tcg_gen_gvec_sari(MO_32,
1826                      decode->op[0].offset, decode->op[1].offset,
1827                      decode->immediate, vec_len, vec_len);
1828}
1829
1830static void gen_PSRLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1831{
1832    int vec_len = vector_len(s, decode);
1833
1834    if (decode->immediate >= 64) {
1835        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1836    } else {
1837        tcg_gen_gvec_shri(MO_64,
1838                          decode->op[0].offset, decode->op[1].offset,
1839                          decode->immediate, vec_len, vec_len);
1840    }
1841}
1842
1843static void gen_PSLLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1844{
1845    int vec_len = vector_len(s, decode);
1846
1847    if (decode->immediate >= 64) {
1848        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
1849    } else {
1850        tcg_gen_gvec_shli(MO_64,
1851                          decode->op[0].offset, decode->op[1].offset,
1852                          decode->immediate, vec_len, vec_len);
1853    }
1854}
1855
1856static TCGv_ptr make_imm8u_xmm_vec(uint8_t imm, int vec_len)
1857{
1858    MemOp ot = vec_len == 16 ? MO_128 : MO_256;
1859    TCGv_i32 imm_v = tcg_constant8u_i32(imm);
1860    TCGv_ptr ptr = tcg_temp_new_ptr();
1861
1862    tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_t0) + xmm_offset(ot),
1863                         vec_len, vec_len, 0);
1864
1865    tcg_gen_addi_ptr(ptr, tcg_env, offsetof(CPUX86State, xmm_t0));
1866    tcg_gen_st_i32(imm_v, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
1867    return ptr;
1868}
1869
1870static void gen_PSRLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1871{
1872    int vec_len = vector_len(s, decode);
1873    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
1874
1875    if (s->vex_l) {
1876        gen_helper_psrldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
1877    } else {
1878        gen_helper_psrldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
1879    }
1880}
1881
1882static void gen_PSLLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1883{
1884    int vec_len = vector_len(s, decode);
1885    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
1886
1887    if (s->vex_l) {
1888        gen_helper_pslldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
1889    } else {
1890        gen_helper_pslldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
1891    }
1892}
1893
1894static void gen_RORX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1895{
1896    MemOp ot = decode->op[0].ot;
1897    int mask = ot == MO_64 ? 63 : 31;
1898    int b = decode->immediate & mask;
1899
1900    switch (ot) {
1901    case MO_32:
1902#ifdef TARGET_X86_64
1903        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
1904        tcg_gen_rotri_i32(s->tmp2_i32, s->tmp2_i32, b);
1905        tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
1906        break;
1907
1908    case MO_64:
1909#endif
1910        tcg_gen_rotri_tl(s->T0, s->T0, b);
1911        break;
1912
1913    default:
1914        g_assert_not_reached();
1915    }
1916}
1917
1918static void gen_SARX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1919{
1920    MemOp ot = decode->op[0].ot;
1921    int mask;
1922
1923    mask = ot == MO_64 ? 63 : 31;
1924    tcg_gen_andi_tl(s->T1, s->T1, mask);
1925    tcg_gen_sar_tl(s->T0, s->T0, s->T1);
1926}
1927
1928static void gen_SHA1NEXTE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1929{
1930    gen_helper_sha1nexte(OP_PTR0, OP_PTR1, OP_PTR2);
1931}
1932
1933static void gen_SHA1MSG1(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1934{
1935    gen_helper_sha1msg1(OP_PTR0, OP_PTR1, OP_PTR2);
1936}
1937
1938static void gen_SHA1MSG2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1939{
1940    gen_helper_sha1msg2(OP_PTR0, OP_PTR1, OP_PTR2);
1941}
1942
1943static void gen_SHA1RNDS4(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1944{
1945    switch(decode->immediate & 3) {
1946    case 0:
1947        gen_helper_sha1rnds4_f0(OP_PTR0, OP_PTR0, OP_PTR1);
1948        break;
1949    case 1:
1950        gen_helper_sha1rnds4_f1(OP_PTR0, OP_PTR0, OP_PTR1);
1951        break;
1952    case 2:
1953        gen_helper_sha1rnds4_f2(OP_PTR0, OP_PTR0, OP_PTR1);
1954        break;
1955    case 3:
1956        gen_helper_sha1rnds4_f3(OP_PTR0, OP_PTR0, OP_PTR1);
1957        break;
1958    }
1959}
1960
1961static void gen_SHA256MSG1(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1962{
1963    gen_helper_sha256msg1(OP_PTR0, OP_PTR1, OP_PTR2);
1964}
1965
1966static void gen_SHA256MSG2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1967{
1968    gen_helper_sha256msg2(OP_PTR0, OP_PTR1, OP_PTR2);
1969}
1970
1971static void gen_SHA256RNDS2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1972{
1973    TCGv_i32 wk0 = tcg_temp_new_i32();
1974    TCGv_i32 wk1 = tcg_temp_new_i32();
1975
1976    tcg_gen_ld_i32(wk0, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(0)));
1977    tcg_gen_ld_i32(wk1, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(1)));
1978
1979    gen_helper_sha256rnds2(OP_PTR0, OP_PTR1, OP_PTR2, wk0, wk1);
1980}
1981
1982static void gen_SHLX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1983{
1984    MemOp ot = decode->op[0].ot;
1985    int mask;
1986
1987    mask = ot == MO_64 ? 63 : 31;
1988    tcg_gen_andi_tl(s->T1, s->T1, mask);
1989    tcg_gen_shl_tl(s->T0, s->T0, s->T1);
1990}
1991
1992static void gen_SHRX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
1993{
1994    MemOp ot = decode->op[0].ot;
1995    int mask;
1996
1997    mask = ot == MO_64 ? 63 : 31;
1998    tcg_gen_andi_tl(s->T1, s->T1, mask);
1999    tcg_gen_shr_tl(s->T0, s->T0, s->T1);
2000}
2001
2002static void gen_VAESKEYGEN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2003{
2004    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2005    assert(!s->vex_l);
2006    gen_helper_aeskeygenassist_xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
2007}
2008
2009static void gen_STMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2010{
2011    gen_helper_update_mxcsr(tcg_env);
2012    tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
2013}
2014
2015static void gen_VAESIMC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2016{
2017    assert(!s->vex_l);
2018    gen_helper_aesimc_xmm(tcg_env, OP_PTR0, OP_PTR2);
2019}
2020
/*
 * 00 = v*ps Vps, Hps, Wps
 * 66 = v*pd Vpd, Hpd, Wpd
 * f3 = v*ss Vss, Hss, Wss
 * f2 = v*sd Vsd, Hsd, Wsd
 */
2027#define SSE_CMP(x) { \
2028    gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \
2029    gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, \
2030    gen_helper_ ## x ## ps ## _ymm, gen_helper_ ## x ## pd ## _ymm}
2031static const SSEFunc_0_eppp gen_helper_cmp_funcs[32][6] = {
2032    SSE_CMP(cmpeq),
2033    SSE_CMP(cmplt),
2034    SSE_CMP(cmple),
2035    SSE_CMP(cmpunord),
2036    SSE_CMP(cmpneq),
2037    SSE_CMP(cmpnlt),
2038    SSE_CMP(cmpnle),
2039    SSE_CMP(cmpord),
2040
2041    SSE_CMP(cmpequ),
2042    SSE_CMP(cmpnge),
2043    SSE_CMP(cmpngt),
2044    SSE_CMP(cmpfalse),
2045    SSE_CMP(cmpnequ),
2046    SSE_CMP(cmpge),
2047    SSE_CMP(cmpgt),
2048    SSE_CMP(cmptrue),
2049
2050    SSE_CMP(cmpeqs),
2051    SSE_CMP(cmpltq),
2052    SSE_CMP(cmpleq),
2053    SSE_CMP(cmpunords),
2054    SSE_CMP(cmpneqq),
2055    SSE_CMP(cmpnltq),
2056    SSE_CMP(cmpnleq),
2057    SSE_CMP(cmpords),
2058
2059    SSE_CMP(cmpequs),
2060    SSE_CMP(cmpngeq),
2061    SSE_CMP(cmpngtq),
2062    SSE_CMP(cmpfalses),
2063    SSE_CMP(cmpnequs),
2064    SSE_CMP(cmpgeq),
2065    SSE_CMP(cmpgtq),
2066    SSE_CMP(cmptrues),
2067};
2068#undef SSE_CMP
2069
2070static void gen_VCMP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2071{
2072    int index = decode->immediate & (s->prefix & PREFIX_VEX ? 31 : 7);
2073    int b =
2074        s->prefix & PREFIX_REPZ  ? 2 /* ss */ :
2075        s->prefix & PREFIX_REPNZ ? 3 /* sd */ :
2076        !!(s->prefix & PREFIX_DATA) /* pd */ + (s->vex_l << 2);
2077
2078    gen_helper_cmp_funcs[index][b](tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
2079}
2080
2081static void gen_VCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2082{
2083    SSEFunc_0_epp fn;
2084    fn = s->prefix & PREFIX_DATA ? gen_helper_comisd : gen_helper_comiss;
2085    fn(tcg_env, OP_PTR1, OP_PTR2);
2086    set_cc_op(s, CC_OP_EFLAGS);
2087}
2088
2089static void gen_VCVTPD2PS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2090{
2091    if (s->vex_l) {
2092        gen_helper_cvtpd2ps_ymm(tcg_env, OP_PTR0, OP_PTR2);
2093    } else {
2094        gen_helper_cvtpd2ps_xmm(tcg_env, OP_PTR0, OP_PTR2);
2095    }
2096}
2097
2098static void gen_VCVTPS2PD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2099{
2100    if (s->vex_l) {
2101        gen_helper_cvtps2pd_ymm(tcg_env, OP_PTR0, OP_PTR2);
2102    } else {
2103        gen_helper_cvtps2pd_xmm(tcg_env, OP_PTR0, OP_PTR2);
2104    }
2105}
2106
2107static void gen_VCVTPS2PH(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2108{
2109    gen_unary_imm_fp_sse(s, env, decode,
2110                      gen_helper_cvtps2ph_xmm,
2111                      gen_helper_cvtps2ph_ymm);
2112    /*
2113     * VCVTPS2PH is the only instruction that performs an operation on a
2114     * register source and then *stores* into memory.
2115     */
2116    if (decode->op[0].has_ea) {
2117        gen_store_sse(s, decode, decode->op[0].offset);
2118    }
2119}
2120
2121static void gen_VCVTSD2SS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2122{
2123    gen_helper_cvtsd2ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
2124}
2125
2126static void gen_VCVTSS2SD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2127{
2128    gen_helper_cvtss2sd(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
2129}
2130
2131static void gen_VCVTSI2Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2132{
2133    int vec_len = vector_len(s, decode);
2134    TCGv_i32 in;
2135
2136    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
2137
2138#ifdef TARGET_X86_64
2139    MemOp ot = decode->op[2].ot;
2140    if (ot == MO_64) {
2141        if (s->prefix & PREFIX_REPNZ) {
2142            gen_helper_cvtsq2sd(tcg_env, OP_PTR0, s->T1);
2143        } else {
2144            gen_helper_cvtsq2ss(tcg_env, OP_PTR0, s->T1);
2145        }
2146        return;
2147    }
2148    in = s->tmp2_i32;
2149    tcg_gen_trunc_tl_i32(in, s->T1);
2150#else
2151    in = s->T1;
2152#endif
2153
2154    if (s->prefix & PREFIX_REPNZ) {
2155        gen_helper_cvtsi2sd(tcg_env, OP_PTR0, in);
2156    } else {
2157        gen_helper_cvtsi2ss(tcg_env, OP_PTR0, in);
2158    }
2159}
2160
2161static inline void gen_VCVTtSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
2162                                  SSEFunc_i_ep ss2si, SSEFunc_l_ep ss2sq,
2163                                  SSEFunc_i_ep sd2si, SSEFunc_l_ep sd2sq)
2164{
2165    TCGv_i32 out;
2166
2167#ifdef TARGET_X86_64
2168    MemOp ot = decode->op[0].ot;
2169    if (ot == MO_64) {
2170        if (s->prefix & PREFIX_REPNZ) {
2171            sd2sq(s->T0, tcg_env, OP_PTR2);
2172        } else {
2173            ss2sq(s->T0, tcg_env, OP_PTR2);
2174        }
2175        return;
2176    }
2177
2178    out = s->tmp2_i32;
2179#else
2180    out = s->T0;
2181#endif
2182    if (s->prefix & PREFIX_REPNZ) {
2183        sd2si(out, tcg_env, OP_PTR2);
2184    } else {
2185        ss2si(out, tcg_env, OP_PTR2);
2186    }
2187#ifdef TARGET_X86_64
2188    tcg_gen_extu_i32_tl(s->T0, out);
2189#endif
2190}
2191
/*
 * On builds without TARGET_X86_64, map the names of the 64-bit result
 * conversion helpers to NULL so that code naming them still compiles.
 */
#if !defined(TARGET_X86_64)
#define gen_helper_cvtss2sq NULL
#define gen_helper_cvtsd2sq NULL
#define gen_helper_cvttss2sq NULL
#define gen_helper_cvttsd2sq NULL
#endif
2198
2199static void gen_VCVTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2200{
2201    gen_VCVTtSx2SI(s, env, decode,
2202                   gen_helper_cvtss2si, gen_helper_cvtss2sq,
2203                   gen_helper_cvtsd2si, gen_helper_cvtsd2sq);
2204}
2205
2206static void gen_VCVTTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2207{
2208    gen_VCVTtSx2SI(s, env, decode,
2209                   gen_helper_cvttss2si, gen_helper_cvttss2sq,
2210                   gen_helper_cvttsd2si, gen_helper_cvttsd2sq);
2211}
2212
2213static void gen_VEXTRACTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2214{
2215    int mask = decode->immediate & 1;
2216    int src_ofs = vector_elem_offset(&decode->op[1], MO_128, mask);
2217    if (decode->op[0].has_ea) {
2218        /* VEX-only instruction, no alignment requirements.  */
2219        gen_sto_env_A0(s, src_ofs, false);
2220    } else {
2221        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, 16, 16);
2222    }
2223}
2224
2225static void gen_VEXTRACTPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2226{
2227    gen_pextr(s, env, decode, MO_32);
2228}
2229
2230static void gen_vinsertps(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2231{
2232    int val = decode->immediate;
2233    int dest_word = (val >> 4) & 3;
2234    int new_mask = (val & 15) | (1 << dest_word);
2235    int vec_len = 16;
2236
2237    assert(!s->vex_l);
2238
2239    if (new_mask == 15) {
2240        /* All zeroes except possibly for the inserted element */
2241        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2242    } else if (decode->op[1].offset != decode->op[0].offset) {
2243        gen_store_sse(s, decode, decode->op[1].offset);
2244    }
2245
2246    if (new_mask != (val & 15)) {
2247        tcg_gen_st_i32(s->tmp2_i32, tcg_env,
2248                       vector_elem_offset(&decode->op[0], MO_32, dest_word));
2249    }
2250
2251    if (new_mask != 15) {
2252        TCGv_i32 zero = tcg_constant_i32(0); /* float32_zero */
2253        int i;
2254        for (i = 0; i < 4; i++) {
2255            if ((val >> i) & 1) {
2256                tcg_gen_st_i32(zero, tcg_env,
2257                               vector_elem_offset(&decode->op[0], MO_32, i));
2258            }
2259        }
2260    }
2261}
2262
2263static void gen_VINSERTPS_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2264{
2265    int val = decode->immediate;
2266    tcg_gen_ld_i32(s->tmp2_i32, tcg_env,
2267                   vector_elem_offset(&decode->op[2], MO_32, (val >> 6) & 3));
2268    gen_vinsertps(s, env, decode);
2269}
2270
2271static void gen_VINSERTPS_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2272{
2273    tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
2274    gen_vinsertps(s, env, decode);
2275}
2276
2277static void gen_VINSERTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2278{
2279    int mask = decode->immediate & 1;
2280    tcg_gen_gvec_mov(MO_64,
2281                     decode->op[0].offset + offsetof(YMMReg, YMM_X(mask)),
2282                     decode->op[2].offset + offsetof(YMMReg, YMM_X(0)), 16, 16);
2283    tcg_gen_gvec_mov(MO_64,
2284                     decode->op[0].offset + offsetof(YMMReg, YMM_X(!mask)),
2285                     decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16);
2286}
2287
2288static inline void gen_maskmov(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
2289                               SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm)
2290{
2291    if (!s->vex_l) {
2292        xmm(tcg_env, OP_PTR2, OP_PTR1, s->A0);
2293    } else {
2294        ymm(tcg_env, OP_PTR2, OP_PTR1, s->A0);
2295    }
2296}
2297
2298static void gen_VMASKMOVPD_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2299{
2300    gen_maskmov(s, env, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm);
2301}
2302
2303static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2304{
2305    gen_maskmov(s, env, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
2306}
2307
2308static void gen_VMOVHPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2309{
2310    gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
2311    if (decode->op[0].offset != decode->op[1].offset) {
2312        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
2313        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
2314    }
2315}
2316
2317static void gen_VMOVHPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2318{
2319    gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
2320}
2321
2322static void gen_VMOVHPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2323{
2324    if (decode->op[0].offset != decode->op[2].offset) {
2325        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
2326        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
2327    }
2328    if (decode->op[0].offset != decode->op[1].offset) {
2329        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
2330        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
2331    }
2332}
2333
2334static void gen_VMOVHLPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2335{
2336    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
2337    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
2338    if (decode->op[0].offset != decode->op[1].offset) {
2339        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(1)));
2340        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
2341    }
2342}
2343
2344static void gen_VMOVLHPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2345{
2346    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);
2347    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
2348    if (decode->op[0].offset != decode->op[1].offset) {
2349        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
2350        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
2351    }
2352}
2353
/*
 * Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOVHPx.
 * Use a gvec move to move everything above the bottom 64 bits.
 */
2358
2359static void gen_VMOVLPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2360{
2361    int vec_len = vector_len(s, decode);
2362
2363    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0)));
2364    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
2365    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
2366}
2367
2368static void gen_VMOVLPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2369{
2370    int vec_len = vector_len(s, decode);
2371
2372    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
2373    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
2374    tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
2375}
2376
2377static void gen_VMOVLPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2378{
2379    tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0)));
2380    tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
2381}
2382
2383static void gen_VMOVSD_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2384{
2385    TCGv_i64 zero = tcg_constant_i64(0);
2386
2387    tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
2388    tcg_gen_st_i64(zero, OP_PTR0, offsetof(ZMMReg, ZMM_Q(1)));
2389    tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
2390}
2391
2392static void gen_VMOVSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2393{
2394    int vec_len = vector_len(s, decode);
2395
2396    tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
2397    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
2398    tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
2399}
2400
2401static void gen_VMOVSS_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2402{
2403    int vec_len = vector_len(s, decode);
2404
2405    tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
2406    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
2407    tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
2408}
2409
2410static void gen_VMOVSS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2411{
2412    tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
2413    tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
2414}
2415
2416static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2417{
2418    if (s->vex_w) {
2419        gen_VMASKMOVPD_st(s, env, decode);
2420    } else {
2421        gen_VMASKMOVPS_st(s, env, decode);
2422    }
2423}
2424
2425static void gen_VPERMD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2426{
2427    assert(s->vex_l);
2428    gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2);
2429}
2430
2431static void gen_VPERM2x128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2432{
2433    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2434    assert(s->vex_l);
2435    gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, imm);
2436}
2437
2438static void gen_VPHMINPOSUW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2439{
2440    assert(!s->vex_l);
2441    gen_helper_phminposuw_xmm(tcg_env, OP_PTR0, OP_PTR2);
2442}
2443
2444static void gen_VROUNDSD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2445{
2446    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2447    assert(!s->vex_l);
2448    gen_helper_roundsd_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2449}
2450
2451static void gen_VROUNDSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2452{
2453    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
2454    assert(!s->vex_l);
2455    gen_helper_roundss_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
2456}
2457
2458static void gen_VSHUF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2459{
2460    TCGv_i32 imm = tcg_constant_i32(decode->immediate);
2461    SSEFunc_0_pppi ps, pd, fn;
2462    ps = s->vex_l ? gen_helper_shufps_ymm : gen_helper_shufps_xmm;
2463    pd = s->vex_l ? gen_helper_shufpd_ymm : gen_helper_shufpd_xmm;
2464    fn = s->prefix & PREFIX_DATA ? pd : ps;
2465    fn(OP_PTR0, OP_PTR1, OP_PTR2, imm);
2466}
2467
2468static void gen_VUCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2469{
2470    SSEFunc_0_epp fn;
2471    fn = s->prefix & PREFIX_DATA ? gen_helper_ucomisd : gen_helper_ucomiss;
2472    fn(tcg_env, OP_PTR1, OP_PTR2);
2473    set_cc_op(s, CC_OP_EFLAGS);
2474}
2475
2476static void gen_VZEROALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2477{
2478    TCGv_ptr ptr = tcg_temp_new_ptr();
2479
2480    tcg_gen_addi_ptr(ptr, tcg_env, offsetof(CPUX86State, xmm_regs));
2481    gen_helper_memset(ptr, ptr, tcg_constant_i32(0),
2482                      tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg)));
2483}
2484
2485static void gen_VZEROUPPER(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
2486{
2487    int i;
2488
2489    for (i = 0; i < CPU_NB_REGS; i++) {
2490        int offset = offsetof(CPUX86State, xmm_regs[i].ZMM_X(1));
2491        tcg_gen_gvec_dup_imm(MO_64, offset, 16, 16, 0);
2492    }
2493}
2494