xref: /openbmc/qemu/target/i386/ops_sse.h (revision fcf5ef2ab52c621a4617ebbef36bf43b4003f4c0)
1*fcf5ef2aSThomas Huth /*
2*fcf5ef2aSThomas Huth  *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3*fcf5ef2aSThomas Huth  *
4*fcf5ef2aSThomas Huth  *  Copyright (c) 2005 Fabrice Bellard
5*fcf5ef2aSThomas Huth  *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
6*fcf5ef2aSThomas Huth  *
7*fcf5ef2aSThomas Huth  * This library is free software; you can redistribute it and/or
8*fcf5ef2aSThomas Huth  * modify it under the terms of the GNU Lesser General Public
9*fcf5ef2aSThomas Huth  * License as published by the Free Software Foundation; either
10*fcf5ef2aSThomas Huth  * version 2 of the License, or (at your option) any later version.
11*fcf5ef2aSThomas Huth  *
12*fcf5ef2aSThomas Huth  * This library is distributed in the hope that it will be useful,
13*fcf5ef2aSThomas Huth  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14*fcf5ef2aSThomas Huth  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15*fcf5ef2aSThomas Huth  * Lesser General Public License for more details.
16*fcf5ef2aSThomas Huth  *
17*fcf5ef2aSThomas Huth  * You should have received a copy of the GNU Lesser General Public
18*fcf5ef2aSThomas Huth  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19*fcf5ef2aSThomas Huth  */
20*fcf5ef2aSThomas Huth 
21*fcf5ef2aSThomas Huth #include "crypto/aes.h"
22*fcf5ef2aSThomas Huth 
/*
 * Register-width configuration.
 *
 * SHIFT is expected to be defined before this point (it is not defined in
 * the visible part of this file).  It selects the register type, the lane
 * accessors and the suffix appended to helper names:
 *
 *   SHIFT == 0: MMXReg, MMX_* accessors, "_mmx" suffix, and XMM_ONLY()
 *               discards its arguments.
 *   otherwise:  ZMMReg, ZMM_* accessors, "_xmm" suffix, and XMM_ONLY()
 *               passes its arguments through unchanged.
 *
 * B/W/L/Q access the byte, word, long and quad lanes of a Reg.
 */
#if SHIFT == 0
#define Reg MMXReg
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) MMX_Q(n)
#define SUFFIX _mmx
#else
#define Reg ZMMReg
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
#define SUFFIX _xmm
#endif
40*fcf5ef2aSThomas Huth 
41*fcf5ef2aSThomas Huth void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
42*fcf5ef2aSThomas Huth {
43*fcf5ef2aSThomas Huth     int shift;
44*fcf5ef2aSThomas Huth 
45*fcf5ef2aSThomas Huth     if (s->Q(0) > 15) {
46*fcf5ef2aSThomas Huth         d->Q(0) = 0;
47*fcf5ef2aSThomas Huth #if SHIFT == 1
48*fcf5ef2aSThomas Huth         d->Q(1) = 0;
49*fcf5ef2aSThomas Huth #endif
50*fcf5ef2aSThomas Huth     } else {
51*fcf5ef2aSThomas Huth         shift = s->B(0);
52*fcf5ef2aSThomas Huth         d->W(0) >>= shift;
53*fcf5ef2aSThomas Huth         d->W(1) >>= shift;
54*fcf5ef2aSThomas Huth         d->W(2) >>= shift;
55*fcf5ef2aSThomas Huth         d->W(3) >>= shift;
56*fcf5ef2aSThomas Huth #if SHIFT == 1
57*fcf5ef2aSThomas Huth         d->W(4) >>= shift;
58*fcf5ef2aSThomas Huth         d->W(5) >>= shift;
59*fcf5ef2aSThomas Huth         d->W(6) >>= shift;
60*fcf5ef2aSThomas Huth         d->W(7) >>= shift;
61*fcf5ef2aSThomas Huth #endif
62*fcf5ef2aSThomas Huth     }
63*fcf5ef2aSThomas Huth }
64*fcf5ef2aSThomas Huth 
65*fcf5ef2aSThomas Huth void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
66*fcf5ef2aSThomas Huth {
67*fcf5ef2aSThomas Huth     int shift;
68*fcf5ef2aSThomas Huth 
69*fcf5ef2aSThomas Huth     if (s->Q(0) > 15) {
70*fcf5ef2aSThomas Huth         shift = 15;
71*fcf5ef2aSThomas Huth     } else {
72*fcf5ef2aSThomas Huth         shift = s->B(0);
73*fcf5ef2aSThomas Huth     }
74*fcf5ef2aSThomas Huth     d->W(0) = (int16_t)d->W(0) >> shift;
75*fcf5ef2aSThomas Huth     d->W(1) = (int16_t)d->W(1) >> shift;
76*fcf5ef2aSThomas Huth     d->W(2) = (int16_t)d->W(2) >> shift;
77*fcf5ef2aSThomas Huth     d->W(3) = (int16_t)d->W(3) >> shift;
78*fcf5ef2aSThomas Huth #if SHIFT == 1
79*fcf5ef2aSThomas Huth     d->W(4) = (int16_t)d->W(4) >> shift;
80*fcf5ef2aSThomas Huth     d->W(5) = (int16_t)d->W(5) >> shift;
81*fcf5ef2aSThomas Huth     d->W(6) = (int16_t)d->W(6) >> shift;
82*fcf5ef2aSThomas Huth     d->W(7) = (int16_t)d->W(7) >> shift;
83*fcf5ef2aSThomas Huth #endif
84*fcf5ef2aSThomas Huth }
85*fcf5ef2aSThomas Huth 
86*fcf5ef2aSThomas Huth void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
87*fcf5ef2aSThomas Huth {
88*fcf5ef2aSThomas Huth     int shift;
89*fcf5ef2aSThomas Huth 
90*fcf5ef2aSThomas Huth     if (s->Q(0) > 15) {
91*fcf5ef2aSThomas Huth         d->Q(0) = 0;
92*fcf5ef2aSThomas Huth #if SHIFT == 1
93*fcf5ef2aSThomas Huth         d->Q(1) = 0;
94*fcf5ef2aSThomas Huth #endif
95*fcf5ef2aSThomas Huth     } else {
96*fcf5ef2aSThomas Huth         shift = s->B(0);
97*fcf5ef2aSThomas Huth         d->W(0) <<= shift;
98*fcf5ef2aSThomas Huth         d->W(1) <<= shift;
99*fcf5ef2aSThomas Huth         d->W(2) <<= shift;
100*fcf5ef2aSThomas Huth         d->W(3) <<= shift;
101*fcf5ef2aSThomas Huth #if SHIFT == 1
102*fcf5ef2aSThomas Huth         d->W(4) <<= shift;
103*fcf5ef2aSThomas Huth         d->W(5) <<= shift;
104*fcf5ef2aSThomas Huth         d->W(6) <<= shift;
105*fcf5ef2aSThomas Huth         d->W(7) <<= shift;
106*fcf5ef2aSThomas Huth #endif
107*fcf5ef2aSThomas Huth     }
108*fcf5ef2aSThomas Huth }
109*fcf5ef2aSThomas Huth 
110*fcf5ef2aSThomas Huth void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
111*fcf5ef2aSThomas Huth {
112*fcf5ef2aSThomas Huth     int shift;
113*fcf5ef2aSThomas Huth 
114*fcf5ef2aSThomas Huth     if (s->Q(0) > 31) {
115*fcf5ef2aSThomas Huth         d->Q(0) = 0;
116*fcf5ef2aSThomas Huth #if SHIFT == 1
117*fcf5ef2aSThomas Huth         d->Q(1) = 0;
118*fcf5ef2aSThomas Huth #endif
119*fcf5ef2aSThomas Huth     } else {
120*fcf5ef2aSThomas Huth         shift = s->B(0);
121*fcf5ef2aSThomas Huth         d->L(0) >>= shift;
122*fcf5ef2aSThomas Huth         d->L(1) >>= shift;
123*fcf5ef2aSThomas Huth #if SHIFT == 1
124*fcf5ef2aSThomas Huth         d->L(2) >>= shift;
125*fcf5ef2aSThomas Huth         d->L(3) >>= shift;
126*fcf5ef2aSThomas Huth #endif
127*fcf5ef2aSThomas Huth     }
128*fcf5ef2aSThomas Huth }
129*fcf5ef2aSThomas Huth 
130*fcf5ef2aSThomas Huth void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
131*fcf5ef2aSThomas Huth {
132*fcf5ef2aSThomas Huth     int shift;
133*fcf5ef2aSThomas Huth 
134*fcf5ef2aSThomas Huth     if (s->Q(0) > 31) {
135*fcf5ef2aSThomas Huth         shift = 31;
136*fcf5ef2aSThomas Huth     } else {
137*fcf5ef2aSThomas Huth         shift = s->B(0);
138*fcf5ef2aSThomas Huth     }
139*fcf5ef2aSThomas Huth     d->L(0) = (int32_t)d->L(0) >> shift;
140*fcf5ef2aSThomas Huth     d->L(1) = (int32_t)d->L(1) >> shift;
141*fcf5ef2aSThomas Huth #if SHIFT == 1
142*fcf5ef2aSThomas Huth     d->L(2) = (int32_t)d->L(2) >> shift;
143*fcf5ef2aSThomas Huth     d->L(3) = (int32_t)d->L(3) >> shift;
144*fcf5ef2aSThomas Huth #endif
145*fcf5ef2aSThomas Huth }
146*fcf5ef2aSThomas Huth 
147*fcf5ef2aSThomas Huth void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
148*fcf5ef2aSThomas Huth {
149*fcf5ef2aSThomas Huth     int shift;
150*fcf5ef2aSThomas Huth 
151*fcf5ef2aSThomas Huth     if (s->Q(0) > 31) {
152*fcf5ef2aSThomas Huth         d->Q(0) = 0;
153*fcf5ef2aSThomas Huth #if SHIFT == 1
154*fcf5ef2aSThomas Huth         d->Q(1) = 0;
155*fcf5ef2aSThomas Huth #endif
156*fcf5ef2aSThomas Huth     } else {
157*fcf5ef2aSThomas Huth         shift = s->B(0);
158*fcf5ef2aSThomas Huth         d->L(0) <<= shift;
159*fcf5ef2aSThomas Huth         d->L(1) <<= shift;
160*fcf5ef2aSThomas Huth #if SHIFT == 1
161*fcf5ef2aSThomas Huth         d->L(2) <<= shift;
162*fcf5ef2aSThomas Huth         d->L(3) <<= shift;
163*fcf5ef2aSThomas Huth #endif
164*fcf5ef2aSThomas Huth     }
165*fcf5ef2aSThomas Huth }
166*fcf5ef2aSThomas Huth 
167*fcf5ef2aSThomas Huth void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
168*fcf5ef2aSThomas Huth {
169*fcf5ef2aSThomas Huth     int shift;
170*fcf5ef2aSThomas Huth 
171*fcf5ef2aSThomas Huth     if (s->Q(0) > 63) {
172*fcf5ef2aSThomas Huth         d->Q(0) = 0;
173*fcf5ef2aSThomas Huth #if SHIFT == 1
174*fcf5ef2aSThomas Huth         d->Q(1) = 0;
175*fcf5ef2aSThomas Huth #endif
176*fcf5ef2aSThomas Huth     } else {
177*fcf5ef2aSThomas Huth         shift = s->B(0);
178*fcf5ef2aSThomas Huth         d->Q(0) >>= shift;
179*fcf5ef2aSThomas Huth #if SHIFT == 1
180*fcf5ef2aSThomas Huth         d->Q(1) >>= shift;
181*fcf5ef2aSThomas Huth #endif
182*fcf5ef2aSThomas Huth     }
183*fcf5ef2aSThomas Huth }
184*fcf5ef2aSThomas Huth 
185*fcf5ef2aSThomas Huth void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
186*fcf5ef2aSThomas Huth {
187*fcf5ef2aSThomas Huth     int shift;
188*fcf5ef2aSThomas Huth 
189*fcf5ef2aSThomas Huth     if (s->Q(0) > 63) {
190*fcf5ef2aSThomas Huth         d->Q(0) = 0;
191*fcf5ef2aSThomas Huth #if SHIFT == 1
192*fcf5ef2aSThomas Huth         d->Q(1) = 0;
193*fcf5ef2aSThomas Huth #endif
194*fcf5ef2aSThomas Huth     } else {
195*fcf5ef2aSThomas Huth         shift = s->B(0);
196*fcf5ef2aSThomas Huth         d->Q(0) <<= shift;
197*fcf5ef2aSThomas Huth #if SHIFT == 1
198*fcf5ef2aSThomas Huth         d->Q(1) <<= shift;
199*fcf5ef2aSThomas Huth #endif
200*fcf5ef2aSThomas Huth     }
201*fcf5ef2aSThomas Huth }
202*fcf5ef2aSThomas Huth 
203*fcf5ef2aSThomas Huth #if SHIFT == 1
204*fcf5ef2aSThomas Huth void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
205*fcf5ef2aSThomas Huth {
206*fcf5ef2aSThomas Huth     int shift, i;
207*fcf5ef2aSThomas Huth 
208*fcf5ef2aSThomas Huth     shift = s->L(0);
209*fcf5ef2aSThomas Huth     if (shift > 16) {
210*fcf5ef2aSThomas Huth         shift = 16;
211*fcf5ef2aSThomas Huth     }
212*fcf5ef2aSThomas Huth     for (i = 0; i < 16 - shift; i++) {
213*fcf5ef2aSThomas Huth         d->B(i) = d->B(i + shift);
214*fcf5ef2aSThomas Huth     }
215*fcf5ef2aSThomas Huth     for (i = 16 - shift; i < 16; i++) {
216*fcf5ef2aSThomas Huth         d->B(i) = 0;
217*fcf5ef2aSThomas Huth     }
218*fcf5ef2aSThomas Huth }
219*fcf5ef2aSThomas Huth 
220*fcf5ef2aSThomas Huth void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
221*fcf5ef2aSThomas Huth {
222*fcf5ef2aSThomas Huth     int shift, i;
223*fcf5ef2aSThomas Huth 
224*fcf5ef2aSThomas Huth     shift = s->L(0);
225*fcf5ef2aSThomas Huth     if (shift > 16) {
226*fcf5ef2aSThomas Huth         shift = 16;
227*fcf5ef2aSThomas Huth     }
228*fcf5ef2aSThomas Huth     for (i = 15; i >= shift; i--) {
229*fcf5ef2aSThomas Huth         d->B(i) = d->B(i - shift);
230*fcf5ef2aSThomas Huth     }
231*fcf5ef2aSThomas Huth     for (i = 0; i < shift; i++) {
232*fcf5ef2aSThomas Huth         d->B(i) = 0;
233*fcf5ef2aSThomas Huth     }
234*fcf5ef2aSThomas Huth }
235*fcf5ef2aSThomas Huth #endif
236*fcf5ef2aSThomas Huth 
/*
 * Define helper <name><SUFFIX> that applies the two-operand macro F to each
 * pair of byte lanes: d.B[i] = F(d.B[i], s.B[i]).  8 lanes are processed
 * when SHIFT is 0 and 16 when SHIFT is 1.
 */
#define SSE_HELPER_B(name, F)                                   \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int i;                                                  \
                                                                \
        for (i = 0; i < (8 << SHIFT); i++) {                    \
            d->B(i) = F(d->B(i), s->B(i));                      \
        }                                                       \
    }
259*fcf5ef2aSThomas Huth 
/*
 * Define helper <name><SUFFIX> that applies the two-operand macro F to each
 * pair of word lanes: d.W[i] = F(d.W[i], s.W[i]).  4 lanes are processed
 * when SHIFT is 0 and 8 when SHIFT is 1.
 */
#define SSE_HELPER_W(name, F)                                   \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int i;                                                  \
                                                                \
        for (i = 0; i < (4 << SHIFT); i++) {                    \
            d->W(i) = F(d->W(i), s->W(i));                      \
        }                                                       \
    }
274*fcf5ef2aSThomas Huth 
/*
 * Define helper <name><SUFFIX> that applies the two-operand macro F to each
 * pair of long lanes: d.L[i] = F(d.L[i], s.L[i]).  2 lanes are processed
 * when SHIFT is 0 and 4 when SHIFT is 1.
 */
#define SSE_HELPER_L(name, F)                                   \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int i;                                                  \
                                                                \
        for (i = 0; i < (2 << SHIFT); i++) {                    \
            d->L(i) = F(d->L(i), s->L(i));                      \
        }                                                       \
    }
285*fcf5ef2aSThomas Huth 
/*
 * Define helper <name><SUFFIX> that applies the two-operand macro F to each
 * pair of quad lanes: d.Q[i] = F(d.Q[i], s.Q[i]).  1 lane is processed
 * when SHIFT is 0 and 2 when SHIFT is 1.
 */
#define SSE_HELPER_Q(name, F)                                   \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int i;                                                  \
                                                                \
        for (i = 0; i < (1 << SHIFT); i++) {                    \
            d->Q(i) = F(d->Q(i), s->Q(i));                      \
        }                                                       \
    }
294*fcf5ef2aSThomas Huth 
295*fcf5ef2aSThomas Huth #if SHIFT == 0
/* Clamp x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    return x < 0 ? 0 : (x > 255 ? 255 : x);
}
306*fcf5ef2aSThomas Huth 
/* Clamp x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    return x < 0 ? 0 : (x > 65535 ? 65535 : x);
}
317*fcf5ef2aSThomas Huth 
/* Clamp x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    return x < -128 ? -128 : (x > 127 ? 127 : x);
}
328*fcf5ef2aSThomas Huth 
/* Clamp x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    return x < -32768 ? -32768 : (x > 32767 ? 32767 : x);
}
339*fcf5ef2aSThomas Huth 
/*
 * Per-lane primitives meant to be passed as the F argument of the
 * SSE_HELPER_* generators.  Each takes the destination lane (a) and the
 * source lane (b) and yields the new destination lane value.
 */

/* Wrapping and saturating addition. */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* Wrapping and saturating subtraction. */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/*
 * Minimum / maximum.  The whole conditional is parenthesized so that the
 * macros remain correct when used inside a larger expression.
 */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* Bitwise logic; FANDN complements the destination operand. */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Comparisons produce an all-ones lane (-1) when true, 0 otherwise. */
#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned variants widen to uint32_t explicitly:
 * 16-bit operands would otherwise be promoted to signed int, and
 * 0xffff * 0xffff does not fit in a 32-bit int.
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) ((uint32_t)(a) * (uint32_t)(b) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)

/* Average, rounding halves up. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
372*fcf5ef2aSThomas Huth #endif
373*fcf5ef2aSThomas Huth 
/*
 * Instantiate the integer helpers.  Each line defines one function named
 * <first argument><SUFFIX> that applies the given lane macro across the
 * register at the lane width named by the generator (B/W/L/Q).
 */

/* Wrapping addition. */
SSE_HELPER_B(helper_paddb, FADD)
SSE_HELPER_W(helper_paddw, FADD)
SSE_HELPER_L(helper_paddl, FADD)
SSE_HELPER_Q(helper_paddq, FADD)

/* Wrapping subtraction. */
SSE_HELPER_B(helper_psubb, FSUB)
SSE_HELPER_W(helper_psubw, FSUB)
SSE_HELPER_L(helper_psubl, FSUB)
SSE_HELPER_Q(helper_psubq, FSUB)

/* Saturating byte addition and subtraction (unsigned, then signed). */
SSE_HELPER_B(helper_paddusb, FADDUB)
SSE_HELPER_B(helper_paddsb, FADDSB)
SSE_HELPER_B(helper_psubusb, FSUBUB)
SSE_HELPER_B(helper_psubsb, FSUBSB)

/* Saturating word addition and subtraction (unsigned, then signed). */
SSE_HELPER_W(helper_paddusw, FADDUW)
SSE_HELPER_W(helper_paddsw, FADDSW)
SSE_HELPER_W(helper_psubusw, FSUBUW)
SSE_HELPER_W(helper_psubsw, FSUBSW)

/* Unsigned byte minimum / maximum. */
SSE_HELPER_B(helper_pminub, FMINUB)
SSE_HELPER_B(helper_pmaxub, FMAXUB)

/* Signed word minimum / maximum. */
SSE_HELPER_W(helper_pminsw, FMINSW)
SSE_HELPER_W(helper_pmaxsw, FMAXSW)

/* Bitwise logic, done a quad at a time. */
SSE_HELPER_Q(helper_pand, FAND)
SSE_HELPER_Q(helper_pandn, FANDN)
SSE_HELPER_Q(helper_por, FOR)
SSE_HELPER_Q(helper_pxor, FXOR)

/* Signed greater-than comparison. */
SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)

/* Equality comparison. */
SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
SSE_HELPER_L(helper_pcmpeql, FCMPEQ)

/* Word multiplies; pmulhrw is only generated for the SHIFT == 0 variant. */
SSE_HELPER_W(helper_pmullw, FMULLW)
#if SHIFT == 0
SSE_HELPER_W(helper_pmulhrw, FMULHRW)
#endif
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)

/* Rounded average. */
SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)
422*fcf5ef2aSThomas Huth 
423*fcf5ef2aSThomas Huth void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
424*fcf5ef2aSThomas Huth {
425*fcf5ef2aSThomas Huth     d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
426*fcf5ef2aSThomas Huth #if SHIFT == 1
427*fcf5ef2aSThomas Huth     d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
428*fcf5ef2aSThomas Huth #endif
429*fcf5ef2aSThomas Huth }
430*fcf5ef2aSThomas Huth 
431*fcf5ef2aSThomas Huth void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
432*fcf5ef2aSThomas Huth {
433*fcf5ef2aSThomas Huth     int i;
434*fcf5ef2aSThomas Huth 
435*fcf5ef2aSThomas Huth     for (i = 0; i < (2 << SHIFT); i++) {
436*fcf5ef2aSThomas Huth         d->L(i) = (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) +
437*fcf5ef2aSThomas Huth             (int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1);
438*fcf5ef2aSThomas Huth     }
439*fcf5ef2aSThomas Huth }
440*fcf5ef2aSThomas Huth 
441*fcf5ef2aSThomas Huth #if SHIFT == 0
/* Return the magnitude of a (a itself when non-negative, otherwise -a). */
static inline int abs1(int a)
{
    return a < 0 ? -a : a;
}
450*fcf5ef2aSThomas Huth #endif
451*fcf5ef2aSThomas Huth void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
452*fcf5ef2aSThomas Huth {
453*fcf5ef2aSThomas Huth     unsigned int val;
454*fcf5ef2aSThomas Huth 
455*fcf5ef2aSThomas Huth     val = 0;
456*fcf5ef2aSThomas Huth     val += abs1(d->B(0) - s->B(0));
457*fcf5ef2aSThomas Huth     val += abs1(d->B(1) - s->B(1));
458*fcf5ef2aSThomas Huth     val += abs1(d->B(2) - s->B(2));
459*fcf5ef2aSThomas Huth     val += abs1(d->B(3) - s->B(3));
460*fcf5ef2aSThomas Huth     val += abs1(d->B(4) - s->B(4));
461*fcf5ef2aSThomas Huth     val += abs1(d->B(5) - s->B(5));
462*fcf5ef2aSThomas Huth     val += abs1(d->B(6) - s->B(6));
463*fcf5ef2aSThomas Huth     val += abs1(d->B(7) - s->B(7));
464*fcf5ef2aSThomas Huth     d->Q(0) = val;
465*fcf5ef2aSThomas Huth #if SHIFT == 1
466*fcf5ef2aSThomas Huth     val = 0;
467*fcf5ef2aSThomas Huth     val += abs1(d->B(8) - s->B(8));
468*fcf5ef2aSThomas Huth     val += abs1(d->B(9) - s->B(9));
469*fcf5ef2aSThomas Huth     val += abs1(d->B(10) - s->B(10));
470*fcf5ef2aSThomas Huth     val += abs1(d->B(11) - s->B(11));
471*fcf5ef2aSThomas Huth     val += abs1(d->B(12) - s->B(12));
472*fcf5ef2aSThomas Huth     val += abs1(d->B(13) - s->B(13));
473*fcf5ef2aSThomas Huth     val += abs1(d->B(14) - s->B(14));
474*fcf5ef2aSThomas Huth     val += abs1(d->B(15) - s->B(15));
475*fcf5ef2aSThomas Huth     d->Q(1) = val;
476*fcf5ef2aSThomas Huth #endif
477*fcf5ef2aSThomas Huth }
478*fcf5ef2aSThomas Huth 
479*fcf5ef2aSThomas Huth void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
480*fcf5ef2aSThomas Huth                                   target_ulong a0)
481*fcf5ef2aSThomas Huth {
482*fcf5ef2aSThomas Huth     int i;
483*fcf5ef2aSThomas Huth 
484*fcf5ef2aSThomas Huth     for (i = 0; i < (8 << SHIFT); i++) {
485*fcf5ef2aSThomas Huth         if (s->B(i) & 0x80) {
486*fcf5ef2aSThomas Huth             cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
487*fcf5ef2aSThomas Huth         }
488*fcf5ef2aSThomas Huth     }
489*fcf5ef2aSThomas Huth }
490*fcf5ef2aSThomas Huth 
491*fcf5ef2aSThomas Huth void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
492*fcf5ef2aSThomas Huth {
493*fcf5ef2aSThomas Huth     d->L(0) = val;
494*fcf5ef2aSThomas Huth     d->L(1) = 0;
495*fcf5ef2aSThomas Huth #if SHIFT == 1
496*fcf5ef2aSThomas Huth     d->Q(1) = 0;
497*fcf5ef2aSThomas Huth #endif
498*fcf5ef2aSThomas Huth }
499*fcf5ef2aSThomas Huth 
500*fcf5ef2aSThomas Huth #ifdef TARGET_X86_64
501*fcf5ef2aSThomas Huth void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
502*fcf5ef2aSThomas Huth {
503*fcf5ef2aSThomas Huth     d->Q(0) = val;
504*fcf5ef2aSThomas Huth #if SHIFT == 1
505*fcf5ef2aSThomas Huth     d->Q(1) = 0;
506*fcf5ef2aSThomas Huth #endif
507*fcf5ef2aSThomas Huth }
508*fcf5ef2aSThomas Huth #endif
509*fcf5ef2aSThomas Huth 
510*fcf5ef2aSThomas Huth #if SHIFT == 0
511*fcf5ef2aSThomas Huth void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
512*fcf5ef2aSThomas Huth {
513*fcf5ef2aSThomas Huth     Reg r;
514*fcf5ef2aSThomas Huth 
515*fcf5ef2aSThomas Huth     r.W(0) = s->W(order & 3);
516*fcf5ef2aSThomas Huth     r.W(1) = s->W((order >> 2) & 3);
517*fcf5ef2aSThomas Huth     r.W(2) = s->W((order >> 4) & 3);
518*fcf5ef2aSThomas Huth     r.W(3) = s->W((order >> 6) & 3);
519*fcf5ef2aSThomas Huth     *d = r;
520*fcf5ef2aSThomas Huth }
521*fcf5ef2aSThomas Huth #else
522*fcf5ef2aSThomas Huth void helper_shufps(Reg *d, Reg *s, int order)
523*fcf5ef2aSThomas Huth {
524*fcf5ef2aSThomas Huth     Reg r;
525*fcf5ef2aSThomas Huth 
526*fcf5ef2aSThomas Huth     r.L(0) = d->L(order & 3);
527*fcf5ef2aSThomas Huth     r.L(1) = d->L((order >> 2) & 3);
528*fcf5ef2aSThomas Huth     r.L(2) = s->L((order >> 4) & 3);
529*fcf5ef2aSThomas Huth     r.L(3) = s->L((order >> 6) & 3);
530*fcf5ef2aSThomas Huth     *d = r;
531*fcf5ef2aSThomas Huth }
532*fcf5ef2aSThomas Huth 
533*fcf5ef2aSThomas Huth void helper_shufpd(Reg *d, Reg *s, int order)
534*fcf5ef2aSThomas Huth {
535*fcf5ef2aSThomas Huth     Reg r;
536*fcf5ef2aSThomas Huth 
537*fcf5ef2aSThomas Huth     r.Q(0) = d->Q(order & 1);
538*fcf5ef2aSThomas Huth     r.Q(1) = s->Q((order >> 1) & 1);
539*fcf5ef2aSThomas Huth     *d = r;
540*fcf5ef2aSThomas Huth }
541*fcf5ef2aSThomas Huth 
542*fcf5ef2aSThomas Huth void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
543*fcf5ef2aSThomas Huth {
544*fcf5ef2aSThomas Huth     Reg r;
545*fcf5ef2aSThomas Huth 
546*fcf5ef2aSThomas Huth     r.L(0) = s->L(order & 3);
547*fcf5ef2aSThomas Huth     r.L(1) = s->L((order >> 2) & 3);
548*fcf5ef2aSThomas Huth     r.L(2) = s->L((order >> 4) & 3);
549*fcf5ef2aSThomas Huth     r.L(3) = s->L((order >> 6) & 3);
550*fcf5ef2aSThomas Huth     *d = r;
551*fcf5ef2aSThomas Huth }
552*fcf5ef2aSThomas Huth 
553*fcf5ef2aSThomas Huth void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
554*fcf5ef2aSThomas Huth {
555*fcf5ef2aSThomas Huth     Reg r;
556*fcf5ef2aSThomas Huth 
557*fcf5ef2aSThomas Huth     r.W(0) = s->W(order & 3);
558*fcf5ef2aSThomas Huth     r.W(1) = s->W((order >> 2) & 3);
559*fcf5ef2aSThomas Huth     r.W(2) = s->W((order >> 4) & 3);
560*fcf5ef2aSThomas Huth     r.W(3) = s->W((order >> 6) & 3);
561*fcf5ef2aSThomas Huth     r.Q(1) = s->Q(1);
562*fcf5ef2aSThomas Huth     *d = r;
563*fcf5ef2aSThomas Huth }
564*fcf5ef2aSThomas Huth 
565*fcf5ef2aSThomas Huth void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
566*fcf5ef2aSThomas Huth {
567*fcf5ef2aSThomas Huth     Reg r;
568*fcf5ef2aSThomas Huth 
569*fcf5ef2aSThomas Huth     r.Q(0) = s->Q(0);
570*fcf5ef2aSThomas Huth     r.W(4) = s->W(4 + (order & 3));
571*fcf5ef2aSThomas Huth     r.W(5) = s->W(4 + ((order >> 2) & 3));
572*fcf5ef2aSThomas Huth     r.W(6) = s->W(4 + ((order >> 4) & 3));
573*fcf5ef2aSThomas Huth     r.W(7) = s->W(4 + ((order >> 6) & 3));
574*fcf5ef2aSThomas Huth     *d = r;
575*fcf5ef2aSThomas Huth }
576*fcf5ef2aSThomas Huth #endif
577*fcf5ef2aSThomas Huth 
578*fcf5ef2aSThomas Huth #if SHIFT == 1
579*fcf5ef2aSThomas Huth /* FPU ops */
580*fcf5ef2aSThomas Huth /* XXX: not accurate */
581*fcf5ef2aSThomas Huth 
/*
 * Define the four floating-point helpers for one operation:
 *
 *   helper_<name>ps - F applied to all four single-precision lanes
 *   helper_<name>ss - F applied to single-precision lane 0 only
 *   helper_<name>pd - F applied to both double-precision lanes
 *   helper_<name>sd - F applied to double-precision lane 0 only
 *
 * F is invoked as F(size, dst_lane, src_lane) with size 32 or 64, and the
 * lanes are processed in ascending order.
 */
#define SSE_HELPER_S(name, F)                                           \
    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        int i;                                                          \
                                                                        \
        for (i = 0; i < 4; i++) {                                       \
            d->ZMM_S(i) = F(32, d->ZMM_S(i), s->ZMM_S(i));              \
        }                                                               \
    }                                                                   \
                                                                        \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    }                                                                   \
                                                                        \
    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        int i;                                                          \
                                                                        \
        for (i = 0; i < 2; i++) {                                       \
            d->ZMM_D(i) = F(64, d->ZMM_D(i), s->ZMM_D(i));              \
        }                                                               \
    }                                                                   \
                                                                        \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
    }
606*fcf5ef2aSThomas Huth 
/*
 * Per-lane floating-point operations for SSE_HELPER_S.  'size' is pasted
 * into the callee name (float32_... or float64_...), and every call passes
 * &env->sse_status, so a variable named 'env' must be in scope wherever
 * these macros are expanded.
 */
#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
/* Unary: the first operand (a) is ignored and only b is passed on. */
#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)

/*
 * Note that the choice of comparison op here is important to get the
 * special cases right: for min and max Intel specifies that (-0,0),
 * (NaN, anything) and (anything, NaN) return the second argument.
 *
 * Both macros therefore yield (a) only when the less-than comparison
 * succeeds and fall back to (b) in every other case.
 */
#define FPU_MIN(size, a, b)                                     \
    (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
#define FPU_MAX(size, a, b)                                     \
    (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))
621*fcf5ef2aSThomas Huth 
622*fcf5ef2aSThomas Huth SSE_HELPER_S(add, FPU_ADD)
623*fcf5ef2aSThomas Huth SSE_HELPER_S(sub, FPU_SUB)
624*fcf5ef2aSThomas Huth SSE_HELPER_S(mul, FPU_MUL)
625*fcf5ef2aSThomas Huth SSE_HELPER_S(div, FPU_DIV)
626*fcf5ef2aSThomas Huth SSE_HELPER_S(min, FPU_MIN)
627*fcf5ef2aSThomas Huth SSE_HELPER_S(max, FPU_MAX)
628*fcf5ef2aSThomas Huth SSE_HELPER_S(sqrt, FPU_SQRT)
629*fcf5ef2aSThomas Huth 
630*fcf5ef2aSThomas Huth 
631*fcf5ef2aSThomas Huth /* float to float conversions */
632*fcf5ef2aSThomas Huth void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s)
633*fcf5ef2aSThomas Huth {
634*fcf5ef2aSThomas Huth     float32 s0, s1;
635*fcf5ef2aSThomas Huth 
636*fcf5ef2aSThomas Huth     s0 = s->ZMM_S(0);
637*fcf5ef2aSThomas Huth     s1 = s->ZMM_S(1);
638*fcf5ef2aSThomas Huth     d->ZMM_D(0) = float32_to_float64(s0, &env->sse_status);
639*fcf5ef2aSThomas Huth     d->ZMM_D(1) = float32_to_float64(s1, &env->sse_status);
640*fcf5ef2aSThomas Huth }
641*fcf5ef2aSThomas Huth 
642*fcf5ef2aSThomas Huth void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s)
643*fcf5ef2aSThomas Huth {
644*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
645*fcf5ef2aSThomas Huth     d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), &env->sse_status);
646*fcf5ef2aSThomas Huth     d->Q(1) = 0;
647*fcf5ef2aSThomas Huth }
648*fcf5ef2aSThomas Huth 
649*fcf5ef2aSThomas Huth void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
650*fcf5ef2aSThomas Huth {
651*fcf5ef2aSThomas Huth     d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
652*fcf5ef2aSThomas Huth }
653*fcf5ef2aSThomas Huth 
654*fcf5ef2aSThomas Huth void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
655*fcf5ef2aSThomas Huth {
656*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
657*fcf5ef2aSThomas Huth }
658*fcf5ef2aSThomas Huth 
659*fcf5ef2aSThomas Huth /* integer to float */
660*fcf5ef2aSThomas Huth void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s)
661*fcf5ef2aSThomas Huth {
662*fcf5ef2aSThomas Huth     d->ZMM_S(0) = int32_to_float32(s->ZMM_L(0), &env->sse_status);
663*fcf5ef2aSThomas Huth     d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), &env->sse_status);
664*fcf5ef2aSThomas Huth     d->ZMM_S(2) = int32_to_float32(s->ZMM_L(2), &env->sse_status);
665*fcf5ef2aSThomas Huth     d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), &env->sse_status);
666*fcf5ef2aSThomas Huth }
667*fcf5ef2aSThomas Huth 
668*fcf5ef2aSThomas Huth void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s)
669*fcf5ef2aSThomas Huth {
670*fcf5ef2aSThomas Huth     int32_t l0, l1;
671*fcf5ef2aSThomas Huth 
672*fcf5ef2aSThomas Huth     l0 = (int32_t)s->ZMM_L(0);
673*fcf5ef2aSThomas Huth     l1 = (int32_t)s->ZMM_L(1);
674*fcf5ef2aSThomas Huth     d->ZMM_D(0) = int32_to_float64(l0, &env->sse_status);
675*fcf5ef2aSThomas Huth     d->ZMM_D(1) = int32_to_float64(l1, &env->sse_status);
676*fcf5ef2aSThomas Huth }
677*fcf5ef2aSThomas Huth 
678*fcf5ef2aSThomas Huth void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
679*fcf5ef2aSThomas Huth {
680*fcf5ef2aSThomas Huth     d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
681*fcf5ef2aSThomas Huth     d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
682*fcf5ef2aSThomas Huth }
683*fcf5ef2aSThomas Huth 
684*fcf5ef2aSThomas Huth void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
685*fcf5ef2aSThomas Huth {
686*fcf5ef2aSThomas Huth     d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
687*fcf5ef2aSThomas Huth     d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
688*fcf5ef2aSThomas Huth }
689*fcf5ef2aSThomas Huth 
690*fcf5ef2aSThomas Huth void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
691*fcf5ef2aSThomas Huth {
692*fcf5ef2aSThomas Huth     d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
693*fcf5ef2aSThomas Huth }
694*fcf5ef2aSThomas Huth 
695*fcf5ef2aSThomas Huth void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
696*fcf5ef2aSThomas Huth {
697*fcf5ef2aSThomas Huth     d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
698*fcf5ef2aSThomas Huth }
699*fcf5ef2aSThomas Huth 
700*fcf5ef2aSThomas Huth #ifdef TARGET_X86_64
701*fcf5ef2aSThomas Huth void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
702*fcf5ef2aSThomas Huth {
703*fcf5ef2aSThomas Huth     d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
704*fcf5ef2aSThomas Huth }
705*fcf5ef2aSThomas Huth 
706*fcf5ef2aSThomas Huth void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
707*fcf5ef2aSThomas Huth {
708*fcf5ef2aSThomas Huth     d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
709*fcf5ef2aSThomas Huth }
710*fcf5ef2aSThomas Huth #endif
711*fcf5ef2aSThomas Huth 
712*fcf5ef2aSThomas Huth /* float to integer */
713*fcf5ef2aSThomas Huth void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
714*fcf5ef2aSThomas Huth {
715*fcf5ef2aSThomas Huth     d->ZMM_L(0) = float32_to_int32(s->ZMM_S(0), &env->sse_status);
716*fcf5ef2aSThomas Huth     d->ZMM_L(1) = float32_to_int32(s->ZMM_S(1), &env->sse_status);
717*fcf5ef2aSThomas Huth     d->ZMM_L(2) = float32_to_int32(s->ZMM_S(2), &env->sse_status);
718*fcf5ef2aSThomas Huth     d->ZMM_L(3) = float32_to_int32(s->ZMM_S(3), &env->sse_status);
719*fcf5ef2aSThomas Huth }
720*fcf5ef2aSThomas Huth 
721*fcf5ef2aSThomas Huth void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
722*fcf5ef2aSThomas Huth {
723*fcf5ef2aSThomas Huth     d->ZMM_L(0) = float64_to_int32(s->ZMM_D(0), &env->sse_status);
724*fcf5ef2aSThomas Huth     d->ZMM_L(1) = float64_to_int32(s->ZMM_D(1), &env->sse_status);
725*fcf5ef2aSThomas Huth     d->ZMM_Q(1) = 0;
726*fcf5ef2aSThomas Huth }
727*fcf5ef2aSThomas Huth 
728*fcf5ef2aSThomas Huth void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
729*fcf5ef2aSThomas Huth {
730*fcf5ef2aSThomas Huth     d->MMX_L(0) = float32_to_int32(s->ZMM_S(0), &env->sse_status);
731*fcf5ef2aSThomas Huth     d->MMX_L(1) = float32_to_int32(s->ZMM_S(1), &env->sse_status);
732*fcf5ef2aSThomas Huth }
733*fcf5ef2aSThomas Huth 
734*fcf5ef2aSThomas Huth void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
735*fcf5ef2aSThomas Huth {
736*fcf5ef2aSThomas Huth     d->MMX_L(0) = float64_to_int32(s->ZMM_D(0), &env->sse_status);
737*fcf5ef2aSThomas Huth     d->MMX_L(1) = float64_to_int32(s->ZMM_D(1), &env->sse_status);
738*fcf5ef2aSThomas Huth }
739*fcf5ef2aSThomas Huth 
740*fcf5ef2aSThomas Huth int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
741*fcf5ef2aSThomas Huth {
742*fcf5ef2aSThomas Huth     return float32_to_int32(s->ZMM_S(0), &env->sse_status);
743*fcf5ef2aSThomas Huth }
744*fcf5ef2aSThomas Huth 
745*fcf5ef2aSThomas Huth int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
746*fcf5ef2aSThomas Huth {
747*fcf5ef2aSThomas Huth     return float64_to_int32(s->ZMM_D(0), &env->sse_status);
748*fcf5ef2aSThomas Huth }
749*fcf5ef2aSThomas Huth 
750*fcf5ef2aSThomas Huth #ifdef TARGET_X86_64
751*fcf5ef2aSThomas Huth int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
752*fcf5ef2aSThomas Huth {
753*fcf5ef2aSThomas Huth     return float32_to_int64(s->ZMM_S(0), &env->sse_status);
754*fcf5ef2aSThomas Huth }
755*fcf5ef2aSThomas Huth 
756*fcf5ef2aSThomas Huth int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
757*fcf5ef2aSThomas Huth {
758*fcf5ef2aSThomas Huth     return float64_to_int64(s->ZMM_D(0), &env->sse_status);
759*fcf5ef2aSThomas Huth }
760*fcf5ef2aSThomas Huth #endif
761*fcf5ef2aSThomas Huth 
762*fcf5ef2aSThomas Huth /* float to integer truncated */
763*fcf5ef2aSThomas Huth void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
764*fcf5ef2aSThomas Huth {
765*fcf5ef2aSThomas Huth     d->ZMM_L(0) = float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
766*fcf5ef2aSThomas Huth     d->ZMM_L(1) = float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
767*fcf5ef2aSThomas Huth     d->ZMM_L(2) = float32_to_int32_round_to_zero(s->ZMM_S(2), &env->sse_status);
768*fcf5ef2aSThomas Huth     d->ZMM_L(3) = float32_to_int32_round_to_zero(s->ZMM_S(3), &env->sse_status);
769*fcf5ef2aSThomas Huth }
770*fcf5ef2aSThomas Huth 
771*fcf5ef2aSThomas Huth void helper_cvttpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
772*fcf5ef2aSThomas Huth {
773*fcf5ef2aSThomas Huth     d->ZMM_L(0) = float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
774*fcf5ef2aSThomas Huth     d->ZMM_L(1) = float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
775*fcf5ef2aSThomas Huth     d->ZMM_Q(1) = 0;
776*fcf5ef2aSThomas Huth }
777*fcf5ef2aSThomas Huth 
778*fcf5ef2aSThomas Huth void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
779*fcf5ef2aSThomas Huth {
780*fcf5ef2aSThomas Huth     d->MMX_L(0) = float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
781*fcf5ef2aSThomas Huth     d->MMX_L(1) = float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
782*fcf5ef2aSThomas Huth }
783*fcf5ef2aSThomas Huth 
784*fcf5ef2aSThomas Huth void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
785*fcf5ef2aSThomas Huth {
786*fcf5ef2aSThomas Huth     d->MMX_L(0) = float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
787*fcf5ef2aSThomas Huth     d->MMX_L(1) = float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
788*fcf5ef2aSThomas Huth }
789*fcf5ef2aSThomas Huth 
790*fcf5ef2aSThomas Huth int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
791*fcf5ef2aSThomas Huth {
792*fcf5ef2aSThomas Huth     return float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
793*fcf5ef2aSThomas Huth }
794*fcf5ef2aSThomas Huth 
795*fcf5ef2aSThomas Huth int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
796*fcf5ef2aSThomas Huth {
797*fcf5ef2aSThomas Huth     return float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
798*fcf5ef2aSThomas Huth }
799*fcf5ef2aSThomas Huth 
800*fcf5ef2aSThomas Huth #ifdef TARGET_X86_64
801*fcf5ef2aSThomas Huth int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
802*fcf5ef2aSThomas Huth {
803*fcf5ef2aSThomas Huth     return float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
804*fcf5ef2aSThomas Huth }
805*fcf5ef2aSThomas Huth 
806*fcf5ef2aSThomas Huth int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
807*fcf5ef2aSThomas Huth {
808*fcf5ef2aSThomas Huth     return float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
809*fcf5ef2aSThomas Huth }
810*fcf5ef2aSThomas Huth #endif
811*fcf5ef2aSThomas Huth 
812*fcf5ef2aSThomas Huth void helper_rsqrtps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
813*fcf5ef2aSThomas Huth {
814*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float32_div(float32_one,
815*fcf5ef2aSThomas Huth                               float32_sqrt(s->ZMM_S(0), &env->sse_status),
816*fcf5ef2aSThomas Huth                               &env->sse_status);
817*fcf5ef2aSThomas Huth     d->ZMM_S(1) = float32_div(float32_one,
818*fcf5ef2aSThomas Huth                               float32_sqrt(s->ZMM_S(1), &env->sse_status),
819*fcf5ef2aSThomas Huth                               &env->sse_status);
820*fcf5ef2aSThomas Huth     d->ZMM_S(2) = float32_div(float32_one,
821*fcf5ef2aSThomas Huth                               float32_sqrt(s->ZMM_S(2), &env->sse_status),
822*fcf5ef2aSThomas Huth                               &env->sse_status);
823*fcf5ef2aSThomas Huth     d->ZMM_S(3) = float32_div(float32_one,
824*fcf5ef2aSThomas Huth                               float32_sqrt(s->ZMM_S(3), &env->sse_status),
825*fcf5ef2aSThomas Huth                               &env->sse_status);
826*fcf5ef2aSThomas Huth }
827*fcf5ef2aSThomas Huth 
828*fcf5ef2aSThomas Huth void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
829*fcf5ef2aSThomas Huth {
830*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float32_div(float32_one,
831*fcf5ef2aSThomas Huth                               float32_sqrt(s->ZMM_S(0), &env->sse_status),
832*fcf5ef2aSThomas Huth                               &env->sse_status);
833*fcf5ef2aSThomas Huth }
834*fcf5ef2aSThomas Huth 
835*fcf5ef2aSThomas Huth void helper_rcpps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
836*fcf5ef2aSThomas Huth {
837*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
838*fcf5ef2aSThomas Huth     d->ZMM_S(1) = float32_div(float32_one, s->ZMM_S(1), &env->sse_status);
839*fcf5ef2aSThomas Huth     d->ZMM_S(2) = float32_div(float32_one, s->ZMM_S(2), &env->sse_status);
840*fcf5ef2aSThomas Huth     d->ZMM_S(3) = float32_div(float32_one, s->ZMM_S(3), &env->sse_status);
841*fcf5ef2aSThomas Huth }
842*fcf5ef2aSThomas Huth 
843*fcf5ef2aSThomas Huth void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
844*fcf5ef2aSThomas Huth {
845*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
846*fcf5ef2aSThomas Huth }
847*fcf5ef2aSThomas Huth 
/*
 * Extract a bit field from src.
 *
 * @src:   64-bit value to extract from
 * @shift: bit index of the field's least significant bit
 * @len:   field width in bits; 0 selects a full 64-bit field
 *
 * Only the low 6 bits of @shift and @len are honoured, matching the
 * 6-bit index/length fields of the instruction encoding.  Reducing them
 * here also keeps every shift count below 64; larger counts would be
 * undefined behaviour in C.
 *
 * Returns the field, right-aligned and zero-extended.
 */
static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
{
    uint64_t mask;

    shift &= 63;
    len &= 63;

    if (len == 0) {
        mask = ~0ULL;
    } else {
        mask = (1ULL << len) - 1;
    }
    return (src >> shift) & mask;
}
859*fcf5ef2aSThomas Huth 
860*fcf5ef2aSThomas Huth void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
861*fcf5ef2aSThomas Huth {
862*fcf5ef2aSThomas Huth     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1), s->ZMM_B(0));
863*fcf5ef2aSThomas Huth }
864*fcf5ef2aSThomas Huth 
865*fcf5ef2aSThomas Huth void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
866*fcf5ef2aSThomas Huth {
867*fcf5ef2aSThomas Huth     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
868*fcf5ef2aSThomas Huth }
869*fcf5ef2aSThomas Huth 
/*
 * Copy the low @len bits of @src into the bit field of @src that starts
 * at bit @shift, leaving every other bit of @src unchanged.
 *
 * @src:   64-bit value that supplies both the field and the background
 * @shift: bit index at which the field is placed
 * @len:   field width in bits; 0 selects a full 64-bit field
 *
 * Only the low 6 bits of @shift and @len are honoured, matching the
 * 6-bit index/length fields of the instruction encoding.  Reducing them
 * here keeps every shift count below 64; larger counts would be
 * undefined behaviour in C.
 */
static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
{
    uint64_t mask;

    shift &= 63;
    len &= 63;

    if (len == 0) {
        mask = ~0ULL;
    } else {
        mask = (1ULL << len) - 1;
    }
    return (src & ~(mask << shift)) | ((src & mask) << shift);
}
881*fcf5ef2aSThomas Huth 
882*fcf5ef2aSThomas Huth void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
883*fcf5ef2aSThomas Huth {
884*fcf5ef2aSThomas Huth     d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8));
885*fcf5ef2aSThomas Huth }
886*fcf5ef2aSThomas Huth 
887*fcf5ef2aSThomas Huth void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
888*fcf5ef2aSThomas Huth {
889*fcf5ef2aSThomas Huth     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
890*fcf5ef2aSThomas Huth }
891*fcf5ef2aSThomas Huth 
892*fcf5ef2aSThomas Huth void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
893*fcf5ef2aSThomas Huth {
894*fcf5ef2aSThomas Huth     ZMMReg r;
895*fcf5ef2aSThomas Huth 
896*fcf5ef2aSThomas Huth     r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
897*fcf5ef2aSThomas Huth     r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
898*fcf5ef2aSThomas Huth     r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
899*fcf5ef2aSThomas Huth     r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
900*fcf5ef2aSThomas Huth     *d = r;
901*fcf5ef2aSThomas Huth }
902*fcf5ef2aSThomas Huth 
903*fcf5ef2aSThomas Huth void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
904*fcf5ef2aSThomas Huth {
905*fcf5ef2aSThomas Huth     ZMMReg r;
906*fcf5ef2aSThomas Huth 
907*fcf5ef2aSThomas Huth     r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
908*fcf5ef2aSThomas Huth     r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
909*fcf5ef2aSThomas Huth     *d = r;
910*fcf5ef2aSThomas Huth }
911*fcf5ef2aSThomas Huth 
912*fcf5ef2aSThomas Huth void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
913*fcf5ef2aSThomas Huth {
914*fcf5ef2aSThomas Huth     ZMMReg r;
915*fcf5ef2aSThomas Huth 
916*fcf5ef2aSThomas Huth     r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
917*fcf5ef2aSThomas Huth     r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
918*fcf5ef2aSThomas Huth     r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
919*fcf5ef2aSThomas Huth     r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
920*fcf5ef2aSThomas Huth     *d = r;
921*fcf5ef2aSThomas Huth }
922*fcf5ef2aSThomas Huth 
923*fcf5ef2aSThomas Huth void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
924*fcf5ef2aSThomas Huth {
925*fcf5ef2aSThomas Huth     ZMMReg r;
926*fcf5ef2aSThomas Huth 
927*fcf5ef2aSThomas Huth     r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
928*fcf5ef2aSThomas Huth     r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
929*fcf5ef2aSThomas Huth     *d = r;
930*fcf5ef2aSThomas Huth }
931*fcf5ef2aSThomas Huth 
932*fcf5ef2aSThomas Huth void helper_addsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
933*fcf5ef2aSThomas Huth {
934*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float32_sub(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
935*fcf5ef2aSThomas Huth     d->ZMM_S(1) = float32_add(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status);
936*fcf5ef2aSThomas Huth     d->ZMM_S(2) = float32_sub(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status);
937*fcf5ef2aSThomas Huth     d->ZMM_S(3) = float32_add(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status);
938*fcf5ef2aSThomas Huth }
939*fcf5ef2aSThomas Huth 
940*fcf5ef2aSThomas Huth void helper_addsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
941*fcf5ef2aSThomas Huth {
942*fcf5ef2aSThomas Huth     d->ZMM_D(0) = float64_sub(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
943*fcf5ef2aSThomas Huth     d->ZMM_D(1) = float64_add(d->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
944*fcf5ef2aSThomas Huth }
945*fcf5ef2aSThomas Huth 
946*fcf5ef2aSThomas Huth /* XXX: unordered */
/*
 * Generate the four comparison helpers helper_<name>{ps,ss,pd,sd}.
 * F(size, a, b) is evaluated once per lane with a taken from *d and b
 * from *s; its integer result is stored into the same lane of *d (as a
 * 32-bit value for the single-precision forms and a 64-bit value for
 * the double-precision forms).  The scalar forms touch lane 0 only.
 */
#define SSE_HELPER_CMP(name, F)                                         \
    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        int i;                                                          \
                                                                        \
        for (i = 0; i < 4; i++) {                                       \
            d->ZMM_L(i) = F(32, d->ZMM_S(i), s->ZMM_S(i));              \
        }                                                               \
    }                                                                   \
                                                                        \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    }                                                                   \
                                                                        \
    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        int i;                                                          \
                                                                        \
        for (i = 0; i < 2; i++) {                                       \
            d->ZMM_Q(i) = F(64, d->ZMM_D(i), s->ZMM_D(i));              \
        }                                                               \
    }                                                                   \
                                                                        \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
    }
971*fcf5ef2aSThomas Huth 
972*fcf5ef2aSThomas Huth #define FPU_CMPEQ(size, a, b)                                           \
973*fcf5ef2aSThomas Huth     (float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0)
974*fcf5ef2aSThomas Huth #define FPU_CMPLT(size, a, b)                                           \
975*fcf5ef2aSThomas Huth     (float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0)
976*fcf5ef2aSThomas Huth #define FPU_CMPLE(size, a, b)                                           \
977*fcf5ef2aSThomas Huth     (float ## size ## _le(a, b, &env->sse_status) ? -1 : 0)
978*fcf5ef2aSThomas Huth #define FPU_CMPUNORD(size, a, b)                                        \
979*fcf5ef2aSThomas Huth     (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? -1 : 0)
980*fcf5ef2aSThomas Huth #define FPU_CMPNEQ(size, a, b)                                          \
981*fcf5ef2aSThomas Huth     (float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1)
982*fcf5ef2aSThomas Huth #define FPU_CMPNLT(size, a, b)                                          \
983*fcf5ef2aSThomas Huth     (float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1)
984*fcf5ef2aSThomas Huth #define FPU_CMPNLE(size, a, b)                                          \
985*fcf5ef2aSThomas Huth     (float ## size ## _le(a, b, &env->sse_status) ? 0 : -1)
986*fcf5ef2aSThomas Huth #define FPU_CMPORD(size, a, b)                                          \
987*fcf5ef2aSThomas Huth     (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 0 : -1)
988*fcf5ef2aSThomas Huth 
989*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
990*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmplt, FPU_CMPLT)
991*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmple, FPU_CMPLE)
992*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
993*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
994*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
995*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
996*fcf5ef2aSThomas Huth SSE_HELPER_CMP(cmpord, FPU_CMPORD)
997*fcf5ef2aSThomas Huth 
998*fcf5ef2aSThomas Huth static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
999*fcf5ef2aSThomas Huth 
1000*fcf5ef2aSThomas Huth void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
1001*fcf5ef2aSThomas Huth {
1002*fcf5ef2aSThomas Huth     int ret;
1003*fcf5ef2aSThomas Huth     float32 s0, s1;
1004*fcf5ef2aSThomas Huth 
1005*fcf5ef2aSThomas Huth     s0 = d->ZMM_S(0);
1006*fcf5ef2aSThomas Huth     s1 = s->ZMM_S(0);
1007*fcf5ef2aSThomas Huth     ret = float32_compare_quiet(s0, s1, &env->sse_status);
1008*fcf5ef2aSThomas Huth     CC_SRC = comis_eflags[ret + 1];
1009*fcf5ef2aSThomas Huth }
1010*fcf5ef2aSThomas Huth 
1011*fcf5ef2aSThomas Huth void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
1012*fcf5ef2aSThomas Huth {
1013*fcf5ef2aSThomas Huth     int ret;
1014*fcf5ef2aSThomas Huth     float32 s0, s1;
1015*fcf5ef2aSThomas Huth 
1016*fcf5ef2aSThomas Huth     s0 = d->ZMM_S(0);
1017*fcf5ef2aSThomas Huth     s1 = s->ZMM_S(0);
1018*fcf5ef2aSThomas Huth     ret = float32_compare(s0, s1, &env->sse_status);
1019*fcf5ef2aSThomas Huth     CC_SRC = comis_eflags[ret + 1];
1020*fcf5ef2aSThomas Huth }
1021*fcf5ef2aSThomas Huth 
1022*fcf5ef2aSThomas Huth void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
1023*fcf5ef2aSThomas Huth {
1024*fcf5ef2aSThomas Huth     int ret;
1025*fcf5ef2aSThomas Huth     float64 d0, d1;
1026*fcf5ef2aSThomas Huth 
1027*fcf5ef2aSThomas Huth     d0 = d->ZMM_D(0);
1028*fcf5ef2aSThomas Huth     d1 = s->ZMM_D(0);
1029*fcf5ef2aSThomas Huth     ret = float64_compare_quiet(d0, d1, &env->sse_status);
1030*fcf5ef2aSThomas Huth     CC_SRC = comis_eflags[ret + 1];
1031*fcf5ef2aSThomas Huth }
1032*fcf5ef2aSThomas Huth 
1033*fcf5ef2aSThomas Huth void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
1034*fcf5ef2aSThomas Huth {
1035*fcf5ef2aSThomas Huth     int ret;
1036*fcf5ef2aSThomas Huth     float64 d0, d1;
1037*fcf5ef2aSThomas Huth 
1038*fcf5ef2aSThomas Huth     d0 = d->ZMM_D(0);
1039*fcf5ef2aSThomas Huth     d1 = s->ZMM_D(0);
1040*fcf5ef2aSThomas Huth     ret = float64_compare(d0, d1, &env->sse_status);
1041*fcf5ef2aSThomas Huth     CC_SRC = comis_eflags[ret + 1];
1042*fcf5ef2aSThomas Huth }
1043*fcf5ef2aSThomas Huth 
1044*fcf5ef2aSThomas Huth uint32_t helper_movmskps(CPUX86State *env, Reg *s)
1045*fcf5ef2aSThomas Huth {
1046*fcf5ef2aSThomas Huth     int b0, b1, b2, b3;
1047*fcf5ef2aSThomas Huth 
1048*fcf5ef2aSThomas Huth     b0 = s->ZMM_L(0) >> 31;
1049*fcf5ef2aSThomas Huth     b1 = s->ZMM_L(1) >> 31;
1050*fcf5ef2aSThomas Huth     b2 = s->ZMM_L(2) >> 31;
1051*fcf5ef2aSThomas Huth     b3 = s->ZMM_L(3) >> 31;
1052*fcf5ef2aSThomas Huth     return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
1053*fcf5ef2aSThomas Huth }
1054*fcf5ef2aSThomas Huth 
1055*fcf5ef2aSThomas Huth uint32_t helper_movmskpd(CPUX86State *env, Reg *s)
1056*fcf5ef2aSThomas Huth {
1057*fcf5ef2aSThomas Huth     int b0, b1;
1058*fcf5ef2aSThomas Huth 
1059*fcf5ef2aSThomas Huth     b0 = s->ZMM_L(1) >> 31;
1060*fcf5ef2aSThomas Huth     b1 = s->ZMM_L(3) >> 31;
1061*fcf5ef2aSThomas Huth     return b0 | (b1 << 1);
1062*fcf5ef2aSThomas Huth }
1063*fcf5ef2aSThomas Huth 
1064*fcf5ef2aSThomas Huth #endif
1065*fcf5ef2aSThomas Huth 
1066*fcf5ef2aSThomas Huth uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
1067*fcf5ef2aSThomas Huth {
1068*fcf5ef2aSThomas Huth     uint32_t val;
1069*fcf5ef2aSThomas Huth 
1070*fcf5ef2aSThomas Huth     val = 0;
1071*fcf5ef2aSThomas Huth     val |= (s->B(0) >> 7);
1072*fcf5ef2aSThomas Huth     val |= (s->B(1) >> 6) & 0x02;
1073*fcf5ef2aSThomas Huth     val |= (s->B(2) >> 5) & 0x04;
1074*fcf5ef2aSThomas Huth     val |= (s->B(3) >> 4) & 0x08;
1075*fcf5ef2aSThomas Huth     val |= (s->B(4) >> 3) & 0x10;
1076*fcf5ef2aSThomas Huth     val |= (s->B(5) >> 2) & 0x20;
1077*fcf5ef2aSThomas Huth     val |= (s->B(6) >> 1) & 0x40;
1078*fcf5ef2aSThomas Huth     val |= (s->B(7)) & 0x80;
1079*fcf5ef2aSThomas Huth #if SHIFT == 1
1080*fcf5ef2aSThomas Huth     val |= (s->B(8) << 1) & 0x0100;
1081*fcf5ef2aSThomas Huth     val |= (s->B(9) << 2) & 0x0200;
1082*fcf5ef2aSThomas Huth     val |= (s->B(10) << 3) & 0x0400;
1083*fcf5ef2aSThomas Huth     val |= (s->B(11) << 4) & 0x0800;
1084*fcf5ef2aSThomas Huth     val |= (s->B(12) << 5) & 0x1000;
1085*fcf5ef2aSThomas Huth     val |= (s->B(13) << 6) & 0x2000;
1086*fcf5ef2aSThomas Huth     val |= (s->B(14) << 7) & 0x4000;
1087*fcf5ef2aSThomas Huth     val |= (s->B(15) << 8) & 0x8000;
1088*fcf5ef2aSThomas Huth #endif
1089*fcf5ef2aSThomas Huth     return val;
1090*fcf5ef2aSThomas Huth }
1091*fcf5ef2aSThomas Huth 
1092*fcf5ef2aSThomas Huth void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1093*fcf5ef2aSThomas Huth {
1094*fcf5ef2aSThomas Huth     Reg r;
1095*fcf5ef2aSThomas Huth 
1096*fcf5ef2aSThomas Huth     r.B(0) = satsb((int16_t)d->W(0));
1097*fcf5ef2aSThomas Huth     r.B(1) = satsb((int16_t)d->W(1));
1098*fcf5ef2aSThomas Huth     r.B(2) = satsb((int16_t)d->W(2));
1099*fcf5ef2aSThomas Huth     r.B(3) = satsb((int16_t)d->W(3));
1100*fcf5ef2aSThomas Huth #if SHIFT == 1
1101*fcf5ef2aSThomas Huth     r.B(4) = satsb((int16_t)d->W(4));
1102*fcf5ef2aSThomas Huth     r.B(5) = satsb((int16_t)d->W(5));
1103*fcf5ef2aSThomas Huth     r.B(6) = satsb((int16_t)d->W(6));
1104*fcf5ef2aSThomas Huth     r.B(7) = satsb((int16_t)d->W(7));
1105*fcf5ef2aSThomas Huth #endif
1106*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
1107*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
1108*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
1109*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
1110*fcf5ef2aSThomas Huth #if SHIFT == 1
1111*fcf5ef2aSThomas Huth     r.B(12) = satsb((int16_t)s->W(4));
1112*fcf5ef2aSThomas Huth     r.B(13) = satsb((int16_t)s->W(5));
1113*fcf5ef2aSThomas Huth     r.B(14) = satsb((int16_t)s->W(6));
1114*fcf5ef2aSThomas Huth     r.B(15) = satsb((int16_t)s->W(7));
1115*fcf5ef2aSThomas Huth #endif
1116*fcf5ef2aSThomas Huth     *d = r;
1117*fcf5ef2aSThomas Huth }
1118*fcf5ef2aSThomas Huth 
1119*fcf5ef2aSThomas Huth void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1120*fcf5ef2aSThomas Huth {
1121*fcf5ef2aSThomas Huth     Reg r;
1122*fcf5ef2aSThomas Huth 
1123*fcf5ef2aSThomas Huth     r.B(0) = satub((int16_t)d->W(0));
1124*fcf5ef2aSThomas Huth     r.B(1) = satub((int16_t)d->W(1));
1125*fcf5ef2aSThomas Huth     r.B(2) = satub((int16_t)d->W(2));
1126*fcf5ef2aSThomas Huth     r.B(3) = satub((int16_t)d->W(3));
1127*fcf5ef2aSThomas Huth #if SHIFT == 1
1128*fcf5ef2aSThomas Huth     r.B(4) = satub((int16_t)d->W(4));
1129*fcf5ef2aSThomas Huth     r.B(5) = satub((int16_t)d->W(5));
1130*fcf5ef2aSThomas Huth     r.B(6) = satub((int16_t)d->W(6));
1131*fcf5ef2aSThomas Huth     r.B(7) = satub((int16_t)d->W(7));
1132*fcf5ef2aSThomas Huth #endif
1133*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
1134*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
1135*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
1136*fcf5ef2aSThomas Huth     r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
1137*fcf5ef2aSThomas Huth #if SHIFT == 1
1138*fcf5ef2aSThomas Huth     r.B(12) = satub((int16_t)s->W(4));
1139*fcf5ef2aSThomas Huth     r.B(13) = satub((int16_t)s->W(5));
1140*fcf5ef2aSThomas Huth     r.B(14) = satub((int16_t)s->W(6));
1141*fcf5ef2aSThomas Huth     r.B(15) = satub((int16_t)s->W(7));
1142*fcf5ef2aSThomas Huth #endif
1143*fcf5ef2aSThomas Huth     *d = r;
1144*fcf5ef2aSThomas Huth }
1145*fcf5ef2aSThomas Huth 
1146*fcf5ef2aSThomas Huth void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1147*fcf5ef2aSThomas Huth {
1148*fcf5ef2aSThomas Huth     Reg r;
1149*fcf5ef2aSThomas Huth 
1150*fcf5ef2aSThomas Huth     r.W(0) = satsw(d->L(0));
1151*fcf5ef2aSThomas Huth     r.W(1) = satsw(d->L(1));
1152*fcf5ef2aSThomas Huth #if SHIFT == 1
1153*fcf5ef2aSThomas Huth     r.W(2) = satsw(d->L(2));
1154*fcf5ef2aSThomas Huth     r.W(3) = satsw(d->L(3));
1155*fcf5ef2aSThomas Huth #endif
1156*fcf5ef2aSThomas Huth     r.W((2 << SHIFT) + 0) = satsw(s->L(0));
1157*fcf5ef2aSThomas Huth     r.W((2 << SHIFT) + 1) = satsw(s->L(1));
1158*fcf5ef2aSThomas Huth #if SHIFT == 1
1159*fcf5ef2aSThomas Huth     r.W(6) = satsw(s->L(2));
1160*fcf5ef2aSThomas Huth     r.W(7) = satsw(s->L(3));
1161*fcf5ef2aSThomas Huth #endif
1162*fcf5ef2aSThomas Huth     *d = r;
1163*fcf5ef2aSThomas Huth }
1164*fcf5ef2aSThomas Huth 
1165*fcf5ef2aSThomas Huth #define UNPCK_OP(base_name, base)                                       \
1166*fcf5ef2aSThomas Huth                                                                         \
1167*fcf5ef2aSThomas Huth     void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
1168*fcf5ef2aSThomas Huth                                                         Reg *d, Reg *s) \
1169*fcf5ef2aSThomas Huth     {                                                                   \
1170*fcf5ef2aSThomas Huth         Reg r;                                                          \
1171*fcf5ef2aSThomas Huth                                                                         \
1172*fcf5ef2aSThomas Huth         r.B(0) = d->B((base << (SHIFT + 2)) + 0);                       \
1173*fcf5ef2aSThomas Huth         r.B(1) = s->B((base << (SHIFT + 2)) + 0);                       \
1174*fcf5ef2aSThomas Huth         r.B(2) = d->B((base << (SHIFT + 2)) + 1);                       \
1175*fcf5ef2aSThomas Huth         r.B(3) = s->B((base << (SHIFT + 2)) + 1);                       \
1176*fcf5ef2aSThomas Huth         r.B(4) = d->B((base << (SHIFT + 2)) + 2);                       \
1177*fcf5ef2aSThomas Huth         r.B(5) = s->B((base << (SHIFT + 2)) + 2);                       \
1178*fcf5ef2aSThomas Huth         r.B(6) = d->B((base << (SHIFT + 2)) + 3);                       \
1179*fcf5ef2aSThomas Huth         r.B(7) = s->B((base << (SHIFT + 2)) + 3);                       \
1180*fcf5ef2aSThomas Huth         XMM_ONLY(                                                       \
1181*fcf5ef2aSThomas Huth                  r.B(8) = d->B((base << (SHIFT + 2)) + 4);              \
1182*fcf5ef2aSThomas Huth                  r.B(9) = s->B((base << (SHIFT + 2)) + 4);              \
1183*fcf5ef2aSThomas Huth                  r.B(10) = d->B((base << (SHIFT + 2)) + 5);             \
1184*fcf5ef2aSThomas Huth                  r.B(11) = s->B((base << (SHIFT + 2)) + 5);             \
1185*fcf5ef2aSThomas Huth                  r.B(12) = d->B((base << (SHIFT + 2)) + 6);             \
1186*fcf5ef2aSThomas Huth                  r.B(13) = s->B((base << (SHIFT + 2)) + 6);             \
1187*fcf5ef2aSThomas Huth                  r.B(14) = d->B((base << (SHIFT + 2)) + 7);             \
1188*fcf5ef2aSThomas Huth                  r.B(15) = s->B((base << (SHIFT + 2)) + 7);             \
1189*fcf5ef2aSThomas Huth                                                                       ) \
1190*fcf5ef2aSThomas Huth             *d = r;                                                     \
1191*fcf5ef2aSThomas Huth     }                                                                   \
1192*fcf5ef2aSThomas Huth                                                                         \
1193*fcf5ef2aSThomas Huth     void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
1194*fcf5ef2aSThomas Huth                                                         Reg *d, Reg *s) \
1195*fcf5ef2aSThomas Huth     {                                                                   \
1196*fcf5ef2aSThomas Huth         Reg r;                                                          \
1197*fcf5ef2aSThomas Huth                                                                         \
1198*fcf5ef2aSThomas Huth         r.W(0) = d->W((base << (SHIFT + 1)) + 0);                       \
1199*fcf5ef2aSThomas Huth         r.W(1) = s->W((base << (SHIFT + 1)) + 0);                       \
1200*fcf5ef2aSThomas Huth         r.W(2) = d->W((base << (SHIFT + 1)) + 1);                       \
1201*fcf5ef2aSThomas Huth         r.W(3) = s->W((base << (SHIFT + 1)) + 1);                       \
1202*fcf5ef2aSThomas Huth         XMM_ONLY(                                                       \
1203*fcf5ef2aSThomas Huth                  r.W(4) = d->W((base << (SHIFT + 1)) + 2);              \
1204*fcf5ef2aSThomas Huth                  r.W(5) = s->W((base << (SHIFT + 1)) + 2);              \
1205*fcf5ef2aSThomas Huth                  r.W(6) = d->W((base << (SHIFT + 1)) + 3);              \
1206*fcf5ef2aSThomas Huth                  r.W(7) = s->W((base << (SHIFT + 1)) + 3);              \
1207*fcf5ef2aSThomas Huth                                                                       ) \
1208*fcf5ef2aSThomas Huth             *d = r;                                                     \
1209*fcf5ef2aSThomas Huth     }                                                                   \
1210*fcf5ef2aSThomas Huth                                                                         \
1211*fcf5ef2aSThomas Huth     void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
1212*fcf5ef2aSThomas Huth                                                         Reg *d, Reg *s) \
1213*fcf5ef2aSThomas Huth     {                                                                   \
1214*fcf5ef2aSThomas Huth         Reg r;                                                          \
1215*fcf5ef2aSThomas Huth                                                                         \
1216*fcf5ef2aSThomas Huth         r.L(0) = d->L((base << SHIFT) + 0);                             \
1217*fcf5ef2aSThomas Huth         r.L(1) = s->L((base << SHIFT) + 0);                             \
1218*fcf5ef2aSThomas Huth         XMM_ONLY(                                                       \
1219*fcf5ef2aSThomas Huth                  r.L(2) = d->L((base << SHIFT) + 1);                    \
1220*fcf5ef2aSThomas Huth                  r.L(3) = s->L((base << SHIFT) + 1);                    \
1221*fcf5ef2aSThomas Huth                                                                       ) \
1222*fcf5ef2aSThomas Huth             *d = r;                                                     \
1223*fcf5ef2aSThomas Huth     }                                                                   \
1224*fcf5ef2aSThomas Huth                                                                         \
1225*fcf5ef2aSThomas Huth     XMM_ONLY(                                                           \
1226*fcf5ef2aSThomas Huth              void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86State \
1227*fcf5ef2aSThomas Huth                                                                   *env, \
1228*fcf5ef2aSThomas Huth                                                                   Reg *d, \
1229*fcf5ef2aSThomas Huth                                                                   Reg *s) \
1230*fcf5ef2aSThomas Huth              {                                                          \
1231*fcf5ef2aSThomas Huth                  Reg r;                                                 \
1232*fcf5ef2aSThomas Huth                                                                         \
1233*fcf5ef2aSThomas Huth                  r.Q(0) = d->Q(base);                                   \
1234*fcf5ef2aSThomas Huth                  r.Q(1) = s->Q(base);                                   \
1235*fcf5ef2aSThomas Huth                  *d = r;                                                \
1236*fcf5ef2aSThomas Huth              }                                                          \
1237*fcf5ef2aSThomas Huth                                                                         )
1238*fcf5ef2aSThomas Huth 
1239*fcf5ef2aSThomas Huth UNPCK_OP(l, 0)
1240*fcf5ef2aSThomas Huth UNPCK_OP(h, 1)
1241*fcf5ef2aSThomas Huth 
1242*fcf5ef2aSThomas Huth /* 3DNow! float ops */
1243*fcf5ef2aSThomas Huth #if SHIFT == 0
1244*fcf5ef2aSThomas Huth void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
1245*fcf5ef2aSThomas Huth {
1246*fcf5ef2aSThomas Huth     d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1247*fcf5ef2aSThomas Huth     d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1248*fcf5ef2aSThomas Huth }
1249*fcf5ef2aSThomas Huth 
1250*fcf5ef2aSThomas Huth void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
1251*fcf5ef2aSThomas Huth {
1252*fcf5ef2aSThomas Huth     d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1253*fcf5ef2aSThomas Huth     d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1254*fcf5ef2aSThomas Huth }
1255*fcf5ef2aSThomas Huth 
1256*fcf5ef2aSThomas Huth void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
1257*fcf5ef2aSThomas Huth {
1258*fcf5ef2aSThomas Huth     d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1259*fcf5ef2aSThomas Huth     d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1260*fcf5ef2aSThomas Huth }
1261*fcf5ef2aSThomas Huth 
1262*fcf5ef2aSThomas Huth void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
1263*fcf5ef2aSThomas Huth {
1264*fcf5ef2aSThomas Huth     d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
1265*fcf5ef2aSThomas Huth                                                        &env->mmx_status));
1266*fcf5ef2aSThomas Huth     d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
1267*fcf5ef2aSThomas Huth                                                        &env->mmx_status));
1268*fcf5ef2aSThomas Huth }
1269*fcf5ef2aSThomas Huth 
1270*fcf5ef2aSThomas Huth void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1271*fcf5ef2aSThomas Huth {
1272*fcf5ef2aSThomas Huth     MMXReg r;
1273*fcf5ef2aSThomas Huth 
1274*fcf5ef2aSThomas Huth     r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1275*fcf5ef2aSThomas Huth     r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1276*fcf5ef2aSThomas Huth     *d = r;
1277*fcf5ef2aSThomas Huth }
1278*fcf5ef2aSThomas Huth 
1279*fcf5ef2aSThomas Huth void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
1280*fcf5ef2aSThomas Huth {
1281*fcf5ef2aSThomas Huth     d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1282*fcf5ef2aSThomas Huth     d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1283*fcf5ef2aSThomas Huth }
1284*fcf5ef2aSThomas Huth 
1285*fcf5ef2aSThomas Huth void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
1286*fcf5ef2aSThomas Huth {
1287*fcf5ef2aSThomas Huth     d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
1288*fcf5ef2aSThomas Huth                                    &env->mmx_status) ? -1 : 0;
1289*fcf5ef2aSThomas Huth     d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
1290*fcf5ef2aSThomas Huth                                    &env->mmx_status) ? -1 : 0;
1291*fcf5ef2aSThomas Huth }
1292*fcf5ef2aSThomas Huth 
1293*fcf5ef2aSThomas Huth void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
1294*fcf5ef2aSThomas Huth {
1295*fcf5ef2aSThomas Huth     d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
1296*fcf5ef2aSThomas Huth                              &env->mmx_status) ? -1 : 0;
1297*fcf5ef2aSThomas Huth     d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
1298*fcf5ef2aSThomas Huth                              &env->mmx_status) ? -1 : 0;
1299*fcf5ef2aSThomas Huth }
1300*fcf5ef2aSThomas Huth 
1301*fcf5ef2aSThomas Huth void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
1302*fcf5ef2aSThomas Huth {
1303*fcf5ef2aSThomas Huth     d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
1304*fcf5ef2aSThomas Huth                              &env->mmx_status) ? -1 : 0;
1305*fcf5ef2aSThomas Huth     d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
1306*fcf5ef2aSThomas Huth                              &env->mmx_status) ? -1 : 0;
1307*fcf5ef2aSThomas Huth }
1308*fcf5ef2aSThomas Huth 
1309*fcf5ef2aSThomas Huth void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
1310*fcf5ef2aSThomas Huth {
1311*fcf5ef2aSThomas Huth     if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
1312*fcf5ef2aSThomas Huth         d->MMX_S(0) = s->MMX_S(0);
1313*fcf5ef2aSThomas Huth     }
1314*fcf5ef2aSThomas Huth     if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
1315*fcf5ef2aSThomas Huth         d->MMX_S(1) = s->MMX_S(1);
1316*fcf5ef2aSThomas Huth     }
1317*fcf5ef2aSThomas Huth }
1318*fcf5ef2aSThomas Huth 
1319*fcf5ef2aSThomas Huth void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
1320*fcf5ef2aSThomas Huth {
1321*fcf5ef2aSThomas Huth     if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
1322*fcf5ef2aSThomas Huth         d->MMX_S(0) = s->MMX_S(0);
1323*fcf5ef2aSThomas Huth     }
1324*fcf5ef2aSThomas Huth     if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
1325*fcf5ef2aSThomas Huth         d->MMX_S(1) = s->MMX_S(1);
1326*fcf5ef2aSThomas Huth     }
1327*fcf5ef2aSThomas Huth }
1328*fcf5ef2aSThomas Huth 
1329*fcf5ef2aSThomas Huth void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
1330*fcf5ef2aSThomas Huth {
1331*fcf5ef2aSThomas Huth     d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1332*fcf5ef2aSThomas Huth     d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1333*fcf5ef2aSThomas Huth }
1334*fcf5ef2aSThomas Huth 
1335*fcf5ef2aSThomas Huth void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1336*fcf5ef2aSThomas Huth {
1337*fcf5ef2aSThomas Huth     MMXReg r;
1338*fcf5ef2aSThomas Huth 
1339*fcf5ef2aSThomas Huth     r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1340*fcf5ef2aSThomas Huth     r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1341*fcf5ef2aSThomas Huth     *d = r;
1342*fcf5ef2aSThomas Huth }
1343*fcf5ef2aSThomas Huth 
1344*fcf5ef2aSThomas Huth void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1345*fcf5ef2aSThomas Huth {
1346*fcf5ef2aSThomas Huth     MMXReg r;
1347*fcf5ef2aSThomas Huth 
1348*fcf5ef2aSThomas Huth     r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1349*fcf5ef2aSThomas Huth     r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1350*fcf5ef2aSThomas Huth     *d = r;
1351*fcf5ef2aSThomas Huth }
1352*fcf5ef2aSThomas Huth 
1353*fcf5ef2aSThomas Huth void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
1354*fcf5ef2aSThomas Huth {
1355*fcf5ef2aSThomas Huth     d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
1356*fcf5ef2aSThomas Huth     d->MMX_S(1) = d->MMX_S(0);
1357*fcf5ef2aSThomas Huth }
1358*fcf5ef2aSThomas Huth 
1359*fcf5ef2aSThomas Huth void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
1360*fcf5ef2aSThomas Huth {
1361*fcf5ef2aSThomas Huth     d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
1362*fcf5ef2aSThomas Huth     d->MMX_S(1) = float32_div(float32_one,
1363*fcf5ef2aSThomas Huth                               float32_sqrt(d->MMX_S(1), &env->mmx_status),
1364*fcf5ef2aSThomas Huth                               &env->mmx_status);
1365*fcf5ef2aSThomas Huth     d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
1366*fcf5ef2aSThomas Huth     d->MMX_L(0) = d->MMX_L(1);
1367*fcf5ef2aSThomas Huth }
1368*fcf5ef2aSThomas Huth 
1369*fcf5ef2aSThomas Huth void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
1370*fcf5ef2aSThomas Huth {
1371*fcf5ef2aSThomas Huth     d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1372*fcf5ef2aSThomas Huth     d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1373*fcf5ef2aSThomas Huth }
1374*fcf5ef2aSThomas Huth 
1375*fcf5ef2aSThomas Huth void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
1376*fcf5ef2aSThomas Huth {
1377*fcf5ef2aSThomas Huth     d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1378*fcf5ef2aSThomas Huth     d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1379*fcf5ef2aSThomas Huth }
1380*fcf5ef2aSThomas Huth 
1381*fcf5ef2aSThomas Huth void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
1382*fcf5ef2aSThomas Huth {
1383*fcf5ef2aSThomas Huth     MMXReg r;
1384*fcf5ef2aSThomas Huth 
1385*fcf5ef2aSThomas Huth     r.MMX_L(0) = s->MMX_L(1);
1386*fcf5ef2aSThomas Huth     r.MMX_L(1) = s->MMX_L(0);
1387*fcf5ef2aSThomas Huth     *d = r;
1388*fcf5ef2aSThomas Huth }
1389*fcf5ef2aSThomas Huth #endif
1390*fcf5ef2aSThomas Huth 
1391*fcf5ef2aSThomas Huth /* SSSE3 op helpers */
1392*fcf5ef2aSThomas Huth void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1393*fcf5ef2aSThomas Huth {
1394*fcf5ef2aSThomas Huth     int i;
1395*fcf5ef2aSThomas Huth     Reg r;
1396*fcf5ef2aSThomas Huth 
1397*fcf5ef2aSThomas Huth     for (i = 0; i < (8 << SHIFT); i++) {
1398*fcf5ef2aSThomas Huth         r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
1399*fcf5ef2aSThomas Huth     }
1400*fcf5ef2aSThomas Huth 
1401*fcf5ef2aSThomas Huth     *d = r;
1402*fcf5ef2aSThomas Huth }
1403*fcf5ef2aSThomas Huth 
1404*fcf5ef2aSThomas Huth void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1405*fcf5ef2aSThomas Huth {
1406*fcf5ef2aSThomas Huth     d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
1407*fcf5ef2aSThomas Huth     d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
1408*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
1409*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
1410*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
1411*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
1412*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
1413*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
1414*fcf5ef2aSThomas Huth }
1415*fcf5ef2aSThomas Huth 
1416*fcf5ef2aSThomas Huth void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1417*fcf5ef2aSThomas Huth {
1418*fcf5ef2aSThomas Huth     d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
1419*fcf5ef2aSThomas Huth     XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
1420*fcf5ef2aSThomas Huth     d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
1421*fcf5ef2aSThomas Huth     XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
1422*fcf5ef2aSThomas Huth }
1423*fcf5ef2aSThomas Huth 
1424*fcf5ef2aSThomas Huth void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1425*fcf5ef2aSThomas Huth {
1426*fcf5ef2aSThomas Huth     d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
1427*fcf5ef2aSThomas Huth     d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
1428*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
1429*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
1430*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
1431*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
1432*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
1433*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
1434*fcf5ef2aSThomas Huth }
1435*fcf5ef2aSThomas Huth 
1436*fcf5ef2aSThomas Huth void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1437*fcf5ef2aSThomas Huth {
1438*fcf5ef2aSThomas Huth     d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) +
1439*fcf5ef2aSThomas Huth                     (int8_t)s->B(1) * (uint8_t)d->B(1));
1440*fcf5ef2aSThomas Huth     d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) +
1441*fcf5ef2aSThomas Huth                     (int8_t)s->B(3) * (uint8_t)d->B(3));
1442*fcf5ef2aSThomas Huth     d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) +
1443*fcf5ef2aSThomas Huth                     (int8_t)s->B(5) * (uint8_t)d->B(5));
1444*fcf5ef2aSThomas Huth     d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) +
1445*fcf5ef2aSThomas Huth                     (int8_t)s->B(7) * (uint8_t)d->B(7));
1446*fcf5ef2aSThomas Huth #if SHIFT == 1
1447*fcf5ef2aSThomas Huth     d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) +
1448*fcf5ef2aSThomas Huth                     (int8_t)s->B(9) * (uint8_t)d->B(9));
1449*fcf5ef2aSThomas Huth     d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
1450*fcf5ef2aSThomas Huth                     (int8_t)s->B(11) * (uint8_t)d->B(11));
1451*fcf5ef2aSThomas Huth     d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
1452*fcf5ef2aSThomas Huth                     (int8_t)s->B(13) * (uint8_t)d->B(13));
1453*fcf5ef2aSThomas Huth     d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
1454*fcf5ef2aSThomas Huth                     (int8_t)s->B(15) * (uint8_t)d->B(15));
1455*fcf5ef2aSThomas Huth #endif
1456*fcf5ef2aSThomas Huth }
1457*fcf5ef2aSThomas Huth 
1458*fcf5ef2aSThomas Huth void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1459*fcf5ef2aSThomas Huth {
1460*fcf5ef2aSThomas Huth     d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
1461*fcf5ef2aSThomas Huth     d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
1462*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
1463*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
1464*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
1465*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
1466*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
1467*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
1468*fcf5ef2aSThomas Huth }
1469*fcf5ef2aSThomas Huth 
1470*fcf5ef2aSThomas Huth void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1471*fcf5ef2aSThomas Huth {
1472*fcf5ef2aSThomas Huth     d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
1473*fcf5ef2aSThomas Huth     XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
1474*fcf5ef2aSThomas Huth     d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
1475*fcf5ef2aSThomas Huth     XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
1476*fcf5ef2aSThomas Huth }
1477*fcf5ef2aSThomas Huth 
1478*fcf5ef2aSThomas Huth void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1479*fcf5ef2aSThomas Huth {
1480*fcf5ef2aSThomas Huth     d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
1481*fcf5ef2aSThomas Huth     d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
1482*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
1483*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
1484*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
1485*fcf5ef2aSThomas Huth     d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
1486*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
1487*fcf5ef2aSThomas Huth     XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
1488*fcf5ef2aSThomas Huth }
1489*fcf5ef2aSThomas Huth 
1490*fcf5ef2aSThomas Huth #define FABSB(_, x) (x > INT8_MAX  ? -(int8_t)x : x)
1491*fcf5ef2aSThomas Huth #define FABSW(_, x) (x > INT16_MAX ? -(int16_t)x : x)
1492*fcf5ef2aSThomas Huth #define FABSL(_, x) (x > INT32_MAX ? -(int32_t)x : x)
1493*fcf5ef2aSThomas Huth SSE_HELPER_B(helper_pabsb, FABSB)
1494*fcf5ef2aSThomas Huth SSE_HELPER_W(helper_pabsw, FABSW)
1495*fcf5ef2aSThomas Huth SSE_HELPER_L(helper_pabsd, FABSL)
1496*fcf5ef2aSThomas Huth 
1497*fcf5ef2aSThomas Huth #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
1498*fcf5ef2aSThomas Huth SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1499*fcf5ef2aSThomas Huth 
1500*fcf5ef2aSThomas Huth #define FSIGNB(d, s) (s <= INT8_MAX  ? s ? d : 0 : -(int8_t)d)
1501*fcf5ef2aSThomas Huth #define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d)
1502*fcf5ef2aSThomas Huth #define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d)
1503*fcf5ef2aSThomas Huth SSE_HELPER_B(helper_psignb, FSIGNB)
1504*fcf5ef2aSThomas Huth SSE_HELPER_W(helper_psignw, FSIGNW)
1505*fcf5ef2aSThomas Huth SSE_HELPER_L(helper_psignd, FSIGNL)
1506*fcf5ef2aSThomas Huth 
1507*fcf5ef2aSThomas Huth void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1508*fcf5ef2aSThomas Huth                                   int32_t shift)
1509*fcf5ef2aSThomas Huth {
1510*fcf5ef2aSThomas Huth     Reg r;
1511*fcf5ef2aSThomas Huth 
1512*fcf5ef2aSThomas Huth     /* XXX could be checked during translation */
1513*fcf5ef2aSThomas Huth     if (shift >= (16 << SHIFT)) {
1514*fcf5ef2aSThomas Huth         r.Q(0) = 0;
1515*fcf5ef2aSThomas Huth         XMM_ONLY(r.Q(1) = 0);
1516*fcf5ef2aSThomas Huth     } else {
1517*fcf5ef2aSThomas Huth         shift <<= 3;
1518*fcf5ef2aSThomas Huth #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
1519*fcf5ef2aSThomas Huth #if SHIFT == 0
1520*fcf5ef2aSThomas Huth         r.Q(0) = SHR(s->Q(0), shift - 0) |
1521*fcf5ef2aSThomas Huth             SHR(d->Q(0), shift -  64);
1522*fcf5ef2aSThomas Huth #else
1523*fcf5ef2aSThomas Huth         r.Q(0) = SHR(s->Q(0), shift - 0) |
1524*fcf5ef2aSThomas Huth             SHR(s->Q(1), shift -  64) |
1525*fcf5ef2aSThomas Huth             SHR(d->Q(0), shift - 128) |
1526*fcf5ef2aSThomas Huth             SHR(d->Q(1), shift - 192);
1527*fcf5ef2aSThomas Huth         r.Q(1) = SHR(s->Q(0), shift + 64) |
1528*fcf5ef2aSThomas Huth             SHR(s->Q(1), shift -   0) |
1529*fcf5ef2aSThomas Huth             SHR(d->Q(0), shift -  64) |
1530*fcf5ef2aSThomas Huth             SHR(d->Q(1), shift - 128);
1531*fcf5ef2aSThomas Huth #endif
1532*fcf5ef2aSThomas Huth #undef SHR
1533*fcf5ef2aSThomas Huth     }
1534*fcf5ef2aSThomas Huth 
1535*fcf5ef2aSThomas Huth     *d = r;
1536*fcf5ef2aSThomas Huth }
1537*fcf5ef2aSThomas Huth 
1538*fcf5ef2aSThomas Huth #define XMM0 (env->xmm_regs[0])
1539*fcf5ef2aSThomas Huth 
1540*fcf5ef2aSThomas Huth #if SHIFT == 1
/*
 * SSE_HELPER_V(name, elem, num, F) defines a helper that applies
 * F(d_elem, s_elem, mask_elem) to each element, where the mask element
 * comes from XMM0.  Elements are processed in ascending order.
 *
 * The number of elements processed is 2, 4, 8 or 16: the smallest of
 * those values that is >= num (2 when num <= 2, 16 when num > 8).
 */
#define SSE_HELPER_V(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
    {                                                                   \
        const int count = (num) > 8 ? 16 :                              \
                          (num) > 4 ? 8 :                               \
                          (num) > 2 ? 4 : 2;                            \
        int i;                                                          \
                                                                        \
        for (i = 0; i < count; i++) {                                   \
            d->elem(i) = F(d->elem(i), s->elem(i), XMM0.elem(i));       \
        }                                                               \
    }
1567*fcf5ef2aSThomas Huth 
/*
 * SSE_HELPER_I(name, elem, num, F) defines a helper that applies
 * F(d_elem, s_elem, bit) to each element, where bit is bit i of the
 * immediate operand for element i.  Elements are processed in
 * ascending order.
 *
 * The number of elements processed is 2, 4, 8 or 16: the smallest of
 * those values that is >= num (2 when num <= 2, 16 when num > 8).
 */
#define SSE_HELPER_I(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm) \
    {                                                                   \
        const int count = (num) > 8 ? 16 :                              \
                          (num) > 4 ? 8 :                               \
                          (num) > 2 ? 4 : 2;                            \
        int i;                                                          \
                                                                        \
        for (i = 0; i < count; i++) {                                   \
            d->elem(i) = F(d->elem(i), s->elem(i), ((imm >> i) & 1));   \
        }                                                               \
    }
1600*fcf5ef2aSThomas Huth 
1601*fcf5ef2aSThomas Huth /* SSE4.1 op helpers */
1602*fcf5ef2aSThomas Huth #define FBLENDVB(d, s, m) ((m & 0x80) ? s : d)
1603*fcf5ef2aSThomas Huth #define FBLENDVPS(d, s, m) ((m & 0x80000000) ? s : d)
1604*fcf5ef2aSThomas Huth #define FBLENDVPD(d, s, m) ((m & 0x8000000000000000LL) ? s : d)
1605*fcf5ef2aSThomas Huth SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
1606*fcf5ef2aSThomas Huth SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
1607*fcf5ef2aSThomas Huth SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
1608*fcf5ef2aSThomas Huth 
1609*fcf5ef2aSThomas Huth void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1610*fcf5ef2aSThomas Huth {
1611*fcf5ef2aSThomas Huth     uint64_t zf = (s->Q(0) &  d->Q(0)) | (s->Q(1) &  d->Q(1));
1612*fcf5ef2aSThomas Huth     uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
1613*fcf5ef2aSThomas Huth 
1614*fcf5ef2aSThomas Huth     CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1615*fcf5ef2aSThomas Huth }
1616*fcf5ef2aSThomas Huth 
/*
 * Define a helper that fills the first `num` elements (2, 4 or 8) of d,
 * accessed through `elem`, with F(0) .. F(num - 1).  F is pasted in front of
 * the element index, so it is normally an accessor on s such as "s->B",
 * optionally preceded by a cast.
 *
 * The stores are issued from the highest element down to element 0.  The
 * users of this macro widen narrow source elements into wider destination
 * elements; when d and s are the same register, writing destination
 * element k overwrites source elements with index >= k only.  Going from
 * high to low therefore never destroys a source element that still has to
 * be read, whereas the low-to-high order corrupted the in-place case.
 */
#define SSE_HELPER_F(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        if (num > 2) {                                          \
            if (num > 4) {                                      \
                d->elem(7) = F(7);                              \
                d->elem(6) = F(6);                              \
                d->elem(5) = F(5);                              \
                d->elem(4) = F(4);                              \
            }                                                   \
            d->elem(3) = F(3);                                  \
            d->elem(2) = F(2);                                  \
        }                                                       \
        d->elem(1) = F(1);                                      \
        d->elem(0) = F(0);                                      \
    }
1633*fcf5ef2aSThomas Huth 
1634*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
1635*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
1636*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
1637*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
1638*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
1639*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
1640*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
1641*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
1642*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
1643*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
1644*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
1645*fcf5ef2aSThomas Huth SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
1646*fcf5ef2aSThomas Huth 
1647*fcf5ef2aSThomas Huth void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1648*fcf5ef2aSThomas Huth {
1649*fcf5ef2aSThomas Huth     d->Q(0) = (int64_t)(int32_t) d->L(0) * (int32_t) s->L(0);
1650*fcf5ef2aSThomas Huth     d->Q(1) = (int64_t)(int32_t) d->L(2) * (int32_t) s->L(2);
1651*fcf5ef2aSThomas Huth }
1652*fcf5ef2aSThomas Huth 
1653*fcf5ef2aSThomas Huth #define FCMPEQQ(d, s) (d == s ? -1 : 0)
1654*fcf5ef2aSThomas Huth SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
1655*fcf5ef2aSThomas Huth 
1656*fcf5ef2aSThomas Huth void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1657*fcf5ef2aSThomas Huth {
1658*fcf5ef2aSThomas Huth     d->W(0) = satuw((int32_t) d->L(0));
1659*fcf5ef2aSThomas Huth     d->W(1) = satuw((int32_t) d->L(1));
1660*fcf5ef2aSThomas Huth     d->W(2) = satuw((int32_t) d->L(2));
1661*fcf5ef2aSThomas Huth     d->W(3) = satuw((int32_t) d->L(3));
1662*fcf5ef2aSThomas Huth     d->W(4) = satuw((int32_t) s->L(0));
1663*fcf5ef2aSThomas Huth     d->W(5) = satuw((int32_t) s->L(1));
1664*fcf5ef2aSThomas Huth     d->W(6) = satuw((int32_t) s->L(2));
1665*fcf5ef2aSThomas Huth     d->W(7) = satuw((int32_t) s->L(3));
1666*fcf5ef2aSThomas Huth }
1667*fcf5ef2aSThomas Huth 
1668*fcf5ef2aSThomas Huth #define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
1669*fcf5ef2aSThomas Huth #define FMINSD(d, s) MIN((int32_t)d, (int32_t)s)
1670*fcf5ef2aSThomas Huth #define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s)
1671*fcf5ef2aSThomas Huth #define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s)
1672*fcf5ef2aSThomas Huth SSE_HELPER_B(helper_pminsb, FMINSB)
1673*fcf5ef2aSThomas Huth SSE_HELPER_L(helper_pminsd, FMINSD)
1674*fcf5ef2aSThomas Huth SSE_HELPER_W(helper_pminuw, MIN)
1675*fcf5ef2aSThomas Huth SSE_HELPER_L(helper_pminud, MIN)
1676*fcf5ef2aSThomas Huth SSE_HELPER_B(helper_pmaxsb, FMAXSB)
1677*fcf5ef2aSThomas Huth SSE_HELPER_L(helper_pmaxsd, FMAXSD)
1678*fcf5ef2aSThomas Huth SSE_HELPER_W(helper_pmaxuw, MAX)
1679*fcf5ef2aSThomas Huth SSE_HELPER_L(helper_pmaxud, MAX)
1680*fcf5ef2aSThomas Huth 
1681*fcf5ef2aSThomas Huth #define FMULLD(d, s) ((int32_t)d * (int32_t)s)
1682*fcf5ef2aSThomas Huth SSE_HELPER_L(helper_pmulld, FMULLD)
1683*fcf5ef2aSThomas Huth 
1684*fcf5ef2aSThomas Huth void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1685*fcf5ef2aSThomas Huth {
1686*fcf5ef2aSThomas Huth     int idx = 0;
1687*fcf5ef2aSThomas Huth 
1688*fcf5ef2aSThomas Huth     if (s->W(1) < s->W(idx)) {
1689*fcf5ef2aSThomas Huth         idx = 1;
1690*fcf5ef2aSThomas Huth     }
1691*fcf5ef2aSThomas Huth     if (s->W(2) < s->W(idx)) {
1692*fcf5ef2aSThomas Huth         idx = 2;
1693*fcf5ef2aSThomas Huth     }
1694*fcf5ef2aSThomas Huth     if (s->W(3) < s->W(idx)) {
1695*fcf5ef2aSThomas Huth         idx = 3;
1696*fcf5ef2aSThomas Huth     }
1697*fcf5ef2aSThomas Huth     if (s->W(4) < s->W(idx)) {
1698*fcf5ef2aSThomas Huth         idx = 4;
1699*fcf5ef2aSThomas Huth     }
1700*fcf5ef2aSThomas Huth     if (s->W(5) < s->W(idx)) {
1701*fcf5ef2aSThomas Huth         idx = 5;
1702*fcf5ef2aSThomas Huth     }
1703*fcf5ef2aSThomas Huth     if (s->W(6) < s->W(idx)) {
1704*fcf5ef2aSThomas Huth         idx = 6;
1705*fcf5ef2aSThomas Huth     }
1706*fcf5ef2aSThomas Huth     if (s->W(7) < s->W(idx)) {
1707*fcf5ef2aSThomas Huth         idx = 7;
1708*fcf5ef2aSThomas Huth     }
1709*fcf5ef2aSThomas Huth 
1710*fcf5ef2aSThomas Huth     d->Q(1) = 0;
1711*fcf5ef2aSThomas Huth     d->L(1) = 0;
1712*fcf5ef2aSThomas Huth     d->W(1) = idx;
1713*fcf5ef2aSThomas Huth     d->W(0) = s->W(idx);
1714*fcf5ef2aSThomas Huth }
1715*fcf5ef2aSThomas Huth 
1716*fcf5ef2aSThomas Huth void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1717*fcf5ef2aSThomas Huth                                   uint32_t mode)
1718*fcf5ef2aSThomas Huth {
1719*fcf5ef2aSThomas Huth     signed char prev_rounding_mode;
1720*fcf5ef2aSThomas Huth 
1721*fcf5ef2aSThomas Huth     prev_rounding_mode = env->sse_status.float_rounding_mode;
1722*fcf5ef2aSThomas Huth     if (!(mode & (1 << 2))) {
1723*fcf5ef2aSThomas Huth         switch (mode & 3) {
1724*fcf5ef2aSThomas Huth         case 0:
1725*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1726*fcf5ef2aSThomas Huth             break;
1727*fcf5ef2aSThomas Huth         case 1:
1728*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_down, &env->sse_status);
1729*fcf5ef2aSThomas Huth             break;
1730*fcf5ef2aSThomas Huth         case 2:
1731*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_up, &env->sse_status);
1732*fcf5ef2aSThomas Huth             break;
1733*fcf5ef2aSThomas Huth         case 3:
1734*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1735*fcf5ef2aSThomas Huth             break;
1736*fcf5ef2aSThomas Huth         }
1737*fcf5ef2aSThomas Huth     }
1738*fcf5ef2aSThomas Huth 
1739*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
1740*fcf5ef2aSThomas Huth     d->ZMM_S(1) = float32_round_to_int(s->ZMM_S(1), &env->sse_status);
1741*fcf5ef2aSThomas Huth     d->ZMM_S(2) = float32_round_to_int(s->ZMM_S(2), &env->sse_status);
1742*fcf5ef2aSThomas Huth     d->ZMM_S(3) = float32_round_to_int(s->ZMM_S(3), &env->sse_status);
1743*fcf5ef2aSThomas Huth 
1744*fcf5ef2aSThomas Huth #if 0 /* TODO */
1745*fcf5ef2aSThomas Huth     if (mode & (1 << 3)) {
1746*fcf5ef2aSThomas Huth         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1747*fcf5ef2aSThomas Huth                                   ~float_flag_inexact,
1748*fcf5ef2aSThomas Huth                                   &env->sse_status);
1749*fcf5ef2aSThomas Huth     }
1750*fcf5ef2aSThomas Huth #endif
1751*fcf5ef2aSThomas Huth     env->sse_status.float_rounding_mode = prev_rounding_mode;
1752*fcf5ef2aSThomas Huth }
1753*fcf5ef2aSThomas Huth 
1754*fcf5ef2aSThomas Huth void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1755*fcf5ef2aSThomas Huth                                   uint32_t mode)
1756*fcf5ef2aSThomas Huth {
1757*fcf5ef2aSThomas Huth     signed char prev_rounding_mode;
1758*fcf5ef2aSThomas Huth 
1759*fcf5ef2aSThomas Huth     prev_rounding_mode = env->sse_status.float_rounding_mode;
1760*fcf5ef2aSThomas Huth     if (!(mode & (1 << 2))) {
1761*fcf5ef2aSThomas Huth         switch (mode & 3) {
1762*fcf5ef2aSThomas Huth         case 0:
1763*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1764*fcf5ef2aSThomas Huth             break;
1765*fcf5ef2aSThomas Huth         case 1:
1766*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_down, &env->sse_status);
1767*fcf5ef2aSThomas Huth             break;
1768*fcf5ef2aSThomas Huth         case 2:
1769*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_up, &env->sse_status);
1770*fcf5ef2aSThomas Huth             break;
1771*fcf5ef2aSThomas Huth         case 3:
1772*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1773*fcf5ef2aSThomas Huth             break;
1774*fcf5ef2aSThomas Huth         }
1775*fcf5ef2aSThomas Huth     }
1776*fcf5ef2aSThomas Huth 
1777*fcf5ef2aSThomas Huth     d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
1778*fcf5ef2aSThomas Huth     d->ZMM_D(1) = float64_round_to_int(s->ZMM_D(1), &env->sse_status);
1779*fcf5ef2aSThomas Huth 
1780*fcf5ef2aSThomas Huth #if 0 /* TODO */
1781*fcf5ef2aSThomas Huth     if (mode & (1 << 3)) {
1782*fcf5ef2aSThomas Huth         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1783*fcf5ef2aSThomas Huth                                   ~float_flag_inexact,
1784*fcf5ef2aSThomas Huth                                   &env->sse_status);
1785*fcf5ef2aSThomas Huth     }
1786*fcf5ef2aSThomas Huth #endif
1787*fcf5ef2aSThomas Huth     env->sse_status.float_rounding_mode = prev_rounding_mode;
1788*fcf5ef2aSThomas Huth }
1789*fcf5ef2aSThomas Huth 
1790*fcf5ef2aSThomas Huth void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1791*fcf5ef2aSThomas Huth                                   uint32_t mode)
1792*fcf5ef2aSThomas Huth {
1793*fcf5ef2aSThomas Huth     signed char prev_rounding_mode;
1794*fcf5ef2aSThomas Huth 
1795*fcf5ef2aSThomas Huth     prev_rounding_mode = env->sse_status.float_rounding_mode;
1796*fcf5ef2aSThomas Huth     if (!(mode & (1 << 2))) {
1797*fcf5ef2aSThomas Huth         switch (mode & 3) {
1798*fcf5ef2aSThomas Huth         case 0:
1799*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1800*fcf5ef2aSThomas Huth             break;
1801*fcf5ef2aSThomas Huth         case 1:
1802*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_down, &env->sse_status);
1803*fcf5ef2aSThomas Huth             break;
1804*fcf5ef2aSThomas Huth         case 2:
1805*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_up, &env->sse_status);
1806*fcf5ef2aSThomas Huth             break;
1807*fcf5ef2aSThomas Huth         case 3:
1808*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1809*fcf5ef2aSThomas Huth             break;
1810*fcf5ef2aSThomas Huth         }
1811*fcf5ef2aSThomas Huth     }
1812*fcf5ef2aSThomas Huth 
1813*fcf5ef2aSThomas Huth     d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
1814*fcf5ef2aSThomas Huth 
1815*fcf5ef2aSThomas Huth #if 0 /* TODO */
1816*fcf5ef2aSThomas Huth     if (mode & (1 << 3)) {
1817*fcf5ef2aSThomas Huth         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1818*fcf5ef2aSThomas Huth                                   ~float_flag_inexact,
1819*fcf5ef2aSThomas Huth                                   &env->sse_status);
1820*fcf5ef2aSThomas Huth     }
1821*fcf5ef2aSThomas Huth #endif
1822*fcf5ef2aSThomas Huth     env->sse_status.float_rounding_mode = prev_rounding_mode;
1823*fcf5ef2aSThomas Huth }
1824*fcf5ef2aSThomas Huth 
1825*fcf5ef2aSThomas Huth void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1826*fcf5ef2aSThomas Huth                                   uint32_t mode)
1827*fcf5ef2aSThomas Huth {
1828*fcf5ef2aSThomas Huth     signed char prev_rounding_mode;
1829*fcf5ef2aSThomas Huth 
1830*fcf5ef2aSThomas Huth     prev_rounding_mode = env->sse_status.float_rounding_mode;
1831*fcf5ef2aSThomas Huth     if (!(mode & (1 << 2))) {
1832*fcf5ef2aSThomas Huth         switch (mode & 3) {
1833*fcf5ef2aSThomas Huth         case 0:
1834*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1835*fcf5ef2aSThomas Huth             break;
1836*fcf5ef2aSThomas Huth         case 1:
1837*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_down, &env->sse_status);
1838*fcf5ef2aSThomas Huth             break;
1839*fcf5ef2aSThomas Huth         case 2:
1840*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_up, &env->sse_status);
1841*fcf5ef2aSThomas Huth             break;
1842*fcf5ef2aSThomas Huth         case 3:
1843*fcf5ef2aSThomas Huth             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1844*fcf5ef2aSThomas Huth             break;
1845*fcf5ef2aSThomas Huth         }
1846*fcf5ef2aSThomas Huth     }
1847*fcf5ef2aSThomas Huth 
1848*fcf5ef2aSThomas Huth     d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
1849*fcf5ef2aSThomas Huth 
1850*fcf5ef2aSThomas Huth #if 0 /* TODO */
1851*fcf5ef2aSThomas Huth     if (mode & (1 << 3)) {
1852*fcf5ef2aSThomas Huth         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1853*fcf5ef2aSThomas Huth                                   ~float_flag_inexact,
1854*fcf5ef2aSThomas Huth                                   &env->sse_status);
1855*fcf5ef2aSThomas Huth     }
1856*fcf5ef2aSThomas Huth #endif
1857*fcf5ef2aSThomas Huth     env->sse_status.float_rounding_mode = prev_rounding_mode;
1858*fcf5ef2aSThomas Huth }
1859*fcf5ef2aSThomas Huth 
1860*fcf5ef2aSThomas Huth #define FBLENDP(d, s, m) (m ? s : d)
1861*fcf5ef2aSThomas Huth SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
1862*fcf5ef2aSThomas Huth SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
1863*fcf5ef2aSThomas Huth SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
1864*fcf5ef2aSThomas Huth 
1865*fcf5ef2aSThomas Huth void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
1866*fcf5ef2aSThomas Huth {
1867*fcf5ef2aSThomas Huth     float32 iresult = float32_zero;
1868*fcf5ef2aSThomas Huth 
1869*fcf5ef2aSThomas Huth     if (mask & (1 << 4)) {
1870*fcf5ef2aSThomas Huth         iresult = float32_add(iresult,
1871*fcf5ef2aSThomas Huth                               float32_mul(d->ZMM_S(0), s->ZMM_S(0),
1872*fcf5ef2aSThomas Huth                                           &env->sse_status),
1873*fcf5ef2aSThomas Huth                               &env->sse_status);
1874*fcf5ef2aSThomas Huth     }
1875*fcf5ef2aSThomas Huth     if (mask & (1 << 5)) {
1876*fcf5ef2aSThomas Huth         iresult = float32_add(iresult,
1877*fcf5ef2aSThomas Huth                               float32_mul(d->ZMM_S(1), s->ZMM_S(1),
1878*fcf5ef2aSThomas Huth                                           &env->sse_status),
1879*fcf5ef2aSThomas Huth                               &env->sse_status);
1880*fcf5ef2aSThomas Huth     }
1881*fcf5ef2aSThomas Huth     if (mask & (1 << 6)) {
1882*fcf5ef2aSThomas Huth         iresult = float32_add(iresult,
1883*fcf5ef2aSThomas Huth                               float32_mul(d->ZMM_S(2), s->ZMM_S(2),
1884*fcf5ef2aSThomas Huth                                           &env->sse_status),
1885*fcf5ef2aSThomas Huth                               &env->sse_status);
1886*fcf5ef2aSThomas Huth     }
1887*fcf5ef2aSThomas Huth     if (mask & (1 << 7)) {
1888*fcf5ef2aSThomas Huth         iresult = float32_add(iresult,
1889*fcf5ef2aSThomas Huth                               float32_mul(d->ZMM_S(3), s->ZMM_S(3),
1890*fcf5ef2aSThomas Huth                                           &env->sse_status),
1891*fcf5ef2aSThomas Huth                               &env->sse_status);
1892*fcf5ef2aSThomas Huth     }
1893*fcf5ef2aSThomas Huth     d->ZMM_S(0) = (mask & (1 << 0)) ? iresult : float32_zero;
1894*fcf5ef2aSThomas Huth     d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero;
1895*fcf5ef2aSThomas Huth     d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero;
1896*fcf5ef2aSThomas Huth     d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero;
1897*fcf5ef2aSThomas Huth }
1898*fcf5ef2aSThomas Huth 
1899*fcf5ef2aSThomas Huth void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
1900*fcf5ef2aSThomas Huth {
1901*fcf5ef2aSThomas Huth     float64 iresult = float64_zero;
1902*fcf5ef2aSThomas Huth 
1903*fcf5ef2aSThomas Huth     if (mask & (1 << 4)) {
1904*fcf5ef2aSThomas Huth         iresult = float64_add(iresult,
1905*fcf5ef2aSThomas Huth                               float64_mul(d->ZMM_D(0), s->ZMM_D(0),
1906*fcf5ef2aSThomas Huth                                           &env->sse_status),
1907*fcf5ef2aSThomas Huth                               &env->sse_status);
1908*fcf5ef2aSThomas Huth     }
1909*fcf5ef2aSThomas Huth     if (mask & (1 << 5)) {
1910*fcf5ef2aSThomas Huth         iresult = float64_add(iresult,
1911*fcf5ef2aSThomas Huth                               float64_mul(d->ZMM_D(1), s->ZMM_D(1),
1912*fcf5ef2aSThomas Huth                                           &env->sse_status),
1913*fcf5ef2aSThomas Huth                               &env->sse_status);
1914*fcf5ef2aSThomas Huth     }
1915*fcf5ef2aSThomas Huth     d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero;
1916*fcf5ef2aSThomas Huth     d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero;
1917*fcf5ef2aSThomas Huth }
1918*fcf5ef2aSThomas Huth 
1919*fcf5ef2aSThomas Huth void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1920*fcf5ef2aSThomas Huth                                   uint32_t offset)
1921*fcf5ef2aSThomas Huth {
1922*fcf5ef2aSThomas Huth     int s0 = (offset & 3) << 2;
1923*fcf5ef2aSThomas Huth     int d0 = (offset & 4) << 0;
1924*fcf5ef2aSThomas Huth     int i;
1925*fcf5ef2aSThomas Huth     Reg r;
1926*fcf5ef2aSThomas Huth 
1927*fcf5ef2aSThomas Huth     for (i = 0; i < 8; i++, d0++) {
1928*fcf5ef2aSThomas Huth         r.W(i) = 0;
1929*fcf5ef2aSThomas Huth         r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
1930*fcf5ef2aSThomas Huth         r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
1931*fcf5ef2aSThomas Huth         r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
1932*fcf5ef2aSThomas Huth         r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
1933*fcf5ef2aSThomas Huth     }
1934*fcf5ef2aSThomas Huth 
1935*fcf5ef2aSThomas Huth     *d = r;
1936*fcf5ef2aSThomas Huth }
1937*fcf5ef2aSThomas Huth 
1938*fcf5ef2aSThomas Huth /* SSE4.2 op helpers */
1939*fcf5ef2aSThomas Huth #define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0)
1940*fcf5ef2aSThomas Huth SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
1941*fcf5ef2aSThomas Huth 
1942*fcf5ef2aSThomas Huth static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
1943*fcf5ef2aSThomas Huth {
1944*fcf5ef2aSThomas Huth     int val;
1945*fcf5ef2aSThomas Huth 
1946*fcf5ef2aSThomas Huth     /* Presence of REX.W is indicated by a bit higher than 7 set */
1947*fcf5ef2aSThomas Huth     if (ctrl >> 8) {
1948*fcf5ef2aSThomas Huth         val = abs1((int64_t)env->regs[reg]);
1949*fcf5ef2aSThomas Huth     } else {
1950*fcf5ef2aSThomas Huth         val = abs1((int32_t)env->regs[reg]);
1951*fcf5ef2aSThomas Huth     }
1952*fcf5ef2aSThomas Huth 
1953*fcf5ef2aSThomas Huth     if (ctrl & 1) {
1954*fcf5ef2aSThomas Huth         if (val > 8) {
1955*fcf5ef2aSThomas Huth             return 8;
1956*fcf5ef2aSThomas Huth         }
1957*fcf5ef2aSThomas Huth     } else {
1958*fcf5ef2aSThomas Huth         if (val > 16) {
1959*fcf5ef2aSThomas Huth             return 16;
1960*fcf5ef2aSThomas Huth         }
1961*fcf5ef2aSThomas Huth     }
1962*fcf5ef2aSThomas Huth     return val;
1963*fcf5ef2aSThomas Huth }
1964*fcf5ef2aSThomas Huth 
1965*fcf5ef2aSThomas Huth static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
1966*fcf5ef2aSThomas Huth {
1967*fcf5ef2aSThomas Huth     int val = 0;
1968*fcf5ef2aSThomas Huth 
1969*fcf5ef2aSThomas Huth     if (ctrl & 1) {
1970*fcf5ef2aSThomas Huth         while (val < 8 && r->W(val)) {
1971*fcf5ef2aSThomas Huth             val++;
1972*fcf5ef2aSThomas Huth         }
1973*fcf5ef2aSThomas Huth     } else {
1974*fcf5ef2aSThomas Huth         while (val < 16 && r->B(val)) {
1975*fcf5ef2aSThomas Huth             val++;
1976*fcf5ef2aSThomas Huth         }
1977*fcf5ef2aSThomas Huth     }
1978*fcf5ef2aSThomas Huth 
1979*fcf5ef2aSThomas Huth     return val;
1980*fcf5ef2aSThomas Huth }
1981*fcf5ef2aSThomas Huth 
1982*fcf5ef2aSThomas Huth static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
1983*fcf5ef2aSThomas Huth {
1984*fcf5ef2aSThomas Huth     switch ((ctrl >> 0) & 3) {
1985*fcf5ef2aSThomas Huth     case 0:
1986*fcf5ef2aSThomas Huth         return r->B(i);
1987*fcf5ef2aSThomas Huth     case 1:
1988*fcf5ef2aSThomas Huth         return r->W(i);
1989*fcf5ef2aSThomas Huth     case 2:
1990*fcf5ef2aSThomas Huth         return (int8_t)r->B(i);
1991*fcf5ef2aSThomas Huth     case 3:
1992*fcf5ef2aSThomas Huth     default:
1993*fcf5ef2aSThomas Huth         return (int16_t)r->W(i);
1994*fcf5ef2aSThomas Huth     }
1995*fcf5ef2aSThomas Huth }
1996*fcf5ef2aSThomas Huth 
1997*fcf5ef2aSThomas Huth static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
1998*fcf5ef2aSThomas Huth                                  int8_t ctrl, int valids, int validd)
1999*fcf5ef2aSThomas Huth {
2000*fcf5ef2aSThomas Huth     unsigned int res = 0;
2001*fcf5ef2aSThomas Huth     int v;
2002*fcf5ef2aSThomas Huth     int j, i;
2003*fcf5ef2aSThomas Huth     int upper = (ctrl & 1) ? 7 : 15;
2004*fcf5ef2aSThomas Huth 
2005*fcf5ef2aSThomas Huth     valids--;
2006*fcf5ef2aSThomas Huth     validd--;
2007*fcf5ef2aSThomas Huth 
2008*fcf5ef2aSThomas Huth     CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
2009*fcf5ef2aSThomas Huth 
2010*fcf5ef2aSThomas Huth     switch ((ctrl >> 2) & 3) {
2011*fcf5ef2aSThomas Huth     case 0:
2012*fcf5ef2aSThomas Huth         for (j = valids; j >= 0; j--) {
2013*fcf5ef2aSThomas Huth             res <<= 1;
2014*fcf5ef2aSThomas Huth             v = pcmp_val(s, ctrl, j);
2015*fcf5ef2aSThomas Huth             for (i = validd; i >= 0; i--) {
2016*fcf5ef2aSThomas Huth                 res |= (v == pcmp_val(d, ctrl, i));
2017*fcf5ef2aSThomas Huth             }
2018*fcf5ef2aSThomas Huth         }
2019*fcf5ef2aSThomas Huth         break;
2020*fcf5ef2aSThomas Huth     case 1:
2021*fcf5ef2aSThomas Huth         for (j = valids; j >= 0; j--) {
2022*fcf5ef2aSThomas Huth             res <<= 1;
2023*fcf5ef2aSThomas Huth             v = pcmp_val(s, ctrl, j);
2024*fcf5ef2aSThomas Huth             for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
2025*fcf5ef2aSThomas Huth                 res |= (pcmp_val(d, ctrl, i - 0) >= v &&
2026*fcf5ef2aSThomas Huth                         pcmp_val(d, ctrl, i - 1) <= v);
2027*fcf5ef2aSThomas Huth             }
2028*fcf5ef2aSThomas Huth         }
2029*fcf5ef2aSThomas Huth         break;
2030*fcf5ef2aSThomas Huth     case 2:
2031*fcf5ef2aSThomas Huth         res = (1 << (upper - MAX(valids, validd))) - 1;
2032*fcf5ef2aSThomas Huth         res <<= MAX(valids, validd) - MIN(valids, validd);
2033*fcf5ef2aSThomas Huth         for (i = MIN(valids, validd); i >= 0; i--) {
2034*fcf5ef2aSThomas Huth             res <<= 1;
2035*fcf5ef2aSThomas Huth             v = pcmp_val(s, ctrl, i);
2036*fcf5ef2aSThomas Huth             res |= (v == pcmp_val(d, ctrl, i));
2037*fcf5ef2aSThomas Huth         }
2038*fcf5ef2aSThomas Huth         break;
2039*fcf5ef2aSThomas Huth     case 3:
2040*fcf5ef2aSThomas Huth         for (j = valids; j >= 0; j--) {
2041*fcf5ef2aSThomas Huth             res <<= 1;
2042*fcf5ef2aSThomas Huth             v = 1;
2043*fcf5ef2aSThomas Huth             for (i = MIN(valids - j, validd); i >= 0; i--) {
2044*fcf5ef2aSThomas Huth                 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
2045*fcf5ef2aSThomas Huth             }
2046*fcf5ef2aSThomas Huth             res |= v;
2047*fcf5ef2aSThomas Huth         }
2048*fcf5ef2aSThomas Huth         break;
2049*fcf5ef2aSThomas Huth     }
2050*fcf5ef2aSThomas Huth 
2051*fcf5ef2aSThomas Huth     switch ((ctrl >> 4) & 3) {
2052*fcf5ef2aSThomas Huth     case 1:
2053*fcf5ef2aSThomas Huth         res ^= (2 << upper) - 1;
2054*fcf5ef2aSThomas Huth         break;
2055*fcf5ef2aSThomas Huth     case 3:
2056*fcf5ef2aSThomas Huth         res ^= (1 << (valids + 1)) - 1;
2057*fcf5ef2aSThomas Huth         break;
2058*fcf5ef2aSThomas Huth     }
2059*fcf5ef2aSThomas Huth 
2060*fcf5ef2aSThomas Huth     if (res) {
2061*fcf5ef2aSThomas Huth         CC_SRC |= CC_C;
2062*fcf5ef2aSThomas Huth     }
2063*fcf5ef2aSThomas Huth     if (res & 1) {
2064*fcf5ef2aSThomas Huth         CC_SRC |= CC_O;
2065*fcf5ef2aSThomas Huth     }
2066*fcf5ef2aSThomas Huth 
2067*fcf5ef2aSThomas Huth     return res;
2068*fcf5ef2aSThomas Huth }
2069*fcf5ef2aSThomas Huth 
2070*fcf5ef2aSThomas Huth void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2071*fcf5ef2aSThomas Huth                                     uint32_t ctrl)
2072*fcf5ef2aSThomas Huth {
2073*fcf5ef2aSThomas Huth     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2074*fcf5ef2aSThomas Huth                                  pcmp_elen(env, R_EDX, ctrl),
2075*fcf5ef2aSThomas Huth                                  pcmp_elen(env, R_EAX, ctrl));
2076*fcf5ef2aSThomas Huth 
2077*fcf5ef2aSThomas Huth     if (res) {
2078*fcf5ef2aSThomas Huth         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2079*fcf5ef2aSThomas Huth     } else {
2080*fcf5ef2aSThomas Huth         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2081*fcf5ef2aSThomas Huth     }
2082*fcf5ef2aSThomas Huth }
2083*fcf5ef2aSThomas Huth 
2084*fcf5ef2aSThomas Huth void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2085*fcf5ef2aSThomas Huth                                     uint32_t ctrl)
2086*fcf5ef2aSThomas Huth {
2087*fcf5ef2aSThomas Huth     int i;
2088*fcf5ef2aSThomas Huth     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2089*fcf5ef2aSThomas Huth                                  pcmp_elen(env, R_EDX, ctrl),
2090*fcf5ef2aSThomas Huth                                  pcmp_elen(env, R_EAX, ctrl));
2091*fcf5ef2aSThomas Huth 
2092*fcf5ef2aSThomas Huth     if ((ctrl >> 6) & 1) {
2093*fcf5ef2aSThomas Huth         if (ctrl & 1) {
2094*fcf5ef2aSThomas Huth             for (i = 0; i < 8; i++, res >>= 1) {
2095*fcf5ef2aSThomas Huth                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2096*fcf5ef2aSThomas Huth             }
2097*fcf5ef2aSThomas Huth         } else {
2098*fcf5ef2aSThomas Huth             for (i = 0; i < 16; i++, res >>= 1) {
2099*fcf5ef2aSThomas Huth                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2100*fcf5ef2aSThomas Huth             }
2101*fcf5ef2aSThomas Huth         }
2102*fcf5ef2aSThomas Huth     } else {
2103*fcf5ef2aSThomas Huth         env->xmm_regs[0].Q(1) = 0;
2104*fcf5ef2aSThomas Huth         env->xmm_regs[0].Q(0) = res;
2105*fcf5ef2aSThomas Huth     }
2106*fcf5ef2aSThomas Huth }
2107*fcf5ef2aSThomas Huth 
2108*fcf5ef2aSThomas Huth void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2109*fcf5ef2aSThomas Huth                                     uint32_t ctrl)
2110*fcf5ef2aSThomas Huth {
2111*fcf5ef2aSThomas Huth     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2112*fcf5ef2aSThomas Huth                                  pcmp_ilen(s, ctrl),
2113*fcf5ef2aSThomas Huth                                  pcmp_ilen(d, ctrl));
2114*fcf5ef2aSThomas Huth 
2115*fcf5ef2aSThomas Huth     if (res) {
2116*fcf5ef2aSThomas Huth         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2117*fcf5ef2aSThomas Huth     } else {
2118*fcf5ef2aSThomas Huth         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2119*fcf5ef2aSThomas Huth     }
2120*fcf5ef2aSThomas Huth }
2121*fcf5ef2aSThomas Huth 
2122*fcf5ef2aSThomas Huth void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2123*fcf5ef2aSThomas Huth                                     uint32_t ctrl)
2124*fcf5ef2aSThomas Huth {
2125*fcf5ef2aSThomas Huth     int i;
2126*fcf5ef2aSThomas Huth     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2127*fcf5ef2aSThomas Huth                                  pcmp_ilen(s, ctrl),
2128*fcf5ef2aSThomas Huth                                  pcmp_ilen(d, ctrl));
2129*fcf5ef2aSThomas Huth 
2130*fcf5ef2aSThomas Huth     if ((ctrl >> 6) & 1) {
2131*fcf5ef2aSThomas Huth         if (ctrl & 1) {
2132*fcf5ef2aSThomas Huth             for (i = 0; i < 8; i++, res >>= 1) {
2133*fcf5ef2aSThomas Huth                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2134*fcf5ef2aSThomas Huth             }
2135*fcf5ef2aSThomas Huth         } else {
2136*fcf5ef2aSThomas Huth             for (i = 0; i < 16; i++, res >>= 1) {
2137*fcf5ef2aSThomas Huth                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2138*fcf5ef2aSThomas Huth             }
2139*fcf5ef2aSThomas Huth         }
2140*fcf5ef2aSThomas Huth     } else {
2141*fcf5ef2aSThomas Huth         env->xmm_regs[0].Q(1) = 0;
2142*fcf5ef2aSThomas Huth         env->xmm_regs[0].Q(0) = res;
2143*fcf5ef2aSThomas Huth     }
2144*fcf5ef2aSThomas Huth }
2145*fcf5ef2aSThomas Huth 
2146*fcf5ef2aSThomas Huth #define CRCPOLY        0x1edc6f41
2147*fcf5ef2aSThomas Huth #define CRCPOLY_BITREV 0x82f63b78
2148*fcf5ef2aSThomas Huth target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2149*fcf5ef2aSThomas Huth {
2150*fcf5ef2aSThomas Huth     target_ulong crc = (msg & ((target_ulong) -1 >>
2151*fcf5ef2aSThomas Huth                                (TARGET_LONG_BITS - len))) ^ crc1;
2152*fcf5ef2aSThomas Huth 
2153*fcf5ef2aSThomas Huth     while (len--) {
2154*fcf5ef2aSThomas Huth         crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
2155*fcf5ef2aSThomas Huth     }
2156*fcf5ef2aSThomas Huth 
2157*fcf5ef2aSThomas Huth     return crc;
2158*fcf5ef2aSThomas Huth }
2159*fcf5ef2aSThomas Huth 
2160*fcf5ef2aSThomas Huth #define POPMASK(i)     ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
2161*fcf5ef2aSThomas Huth #define POPCOUNT(n, i) ((n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i)))
2162*fcf5ef2aSThomas Huth target_ulong helper_popcnt(CPUX86State *env, target_ulong n, uint32_t type)
2163*fcf5ef2aSThomas Huth {
2164*fcf5ef2aSThomas Huth     CC_SRC = n ? 0 : CC_Z;
2165*fcf5ef2aSThomas Huth 
2166*fcf5ef2aSThomas Huth     n = POPCOUNT(n, 0);
2167*fcf5ef2aSThomas Huth     n = POPCOUNT(n, 1);
2168*fcf5ef2aSThomas Huth     n = POPCOUNT(n, 2);
2169*fcf5ef2aSThomas Huth     n = POPCOUNT(n, 3);
2170*fcf5ef2aSThomas Huth     if (type == 1) {
2171*fcf5ef2aSThomas Huth         return n & 0xff;
2172*fcf5ef2aSThomas Huth     }
2173*fcf5ef2aSThomas Huth 
2174*fcf5ef2aSThomas Huth     n = POPCOUNT(n, 4);
2175*fcf5ef2aSThomas Huth #ifndef TARGET_X86_64
2176*fcf5ef2aSThomas Huth     return n;
2177*fcf5ef2aSThomas Huth #else
2178*fcf5ef2aSThomas Huth     if (type == 2) {
2179*fcf5ef2aSThomas Huth         return n & 0xff;
2180*fcf5ef2aSThomas Huth     }
2181*fcf5ef2aSThomas Huth 
2182*fcf5ef2aSThomas Huth     return POPCOUNT(n, 5);
2183*fcf5ef2aSThomas Huth #endif
2184*fcf5ef2aSThomas Huth }
2185*fcf5ef2aSThomas Huth 
2186*fcf5ef2aSThomas Huth void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2187*fcf5ef2aSThomas Huth                                     uint32_t ctrl)
2188*fcf5ef2aSThomas Huth {
2189*fcf5ef2aSThomas Huth     uint64_t ah, al, b, resh, resl;
2190*fcf5ef2aSThomas Huth 
2191*fcf5ef2aSThomas Huth     ah = 0;
2192*fcf5ef2aSThomas Huth     al = d->Q((ctrl & 1) != 0);
2193*fcf5ef2aSThomas Huth     b = s->Q((ctrl & 16) != 0);
2194*fcf5ef2aSThomas Huth     resh = resl = 0;
2195*fcf5ef2aSThomas Huth 
2196*fcf5ef2aSThomas Huth     while (b) {
2197*fcf5ef2aSThomas Huth         if (b & 1) {
2198*fcf5ef2aSThomas Huth             resl ^= al;
2199*fcf5ef2aSThomas Huth             resh ^= ah;
2200*fcf5ef2aSThomas Huth         }
2201*fcf5ef2aSThomas Huth         ah = (ah << 1) | (al >> 63);
2202*fcf5ef2aSThomas Huth         al <<= 1;
2203*fcf5ef2aSThomas Huth         b >>= 1;
2204*fcf5ef2aSThomas Huth     }
2205*fcf5ef2aSThomas Huth 
2206*fcf5ef2aSThomas Huth     d->Q(0) = resl;
2207*fcf5ef2aSThomas Huth     d->Q(1) = resh;
2208*fcf5ef2aSThomas Huth }
2209*fcf5ef2aSThomas Huth 
2210*fcf5ef2aSThomas Huth void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2211*fcf5ef2aSThomas Huth {
2212*fcf5ef2aSThomas Huth     int i;
2213*fcf5ef2aSThomas Huth     Reg st = *d;
2214*fcf5ef2aSThomas Huth     Reg rk = *s;
2215*fcf5ef2aSThomas Huth 
2216*fcf5ef2aSThomas Huth     for (i = 0 ; i < 4 ; i++) {
2217*fcf5ef2aSThomas Huth         d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^
2218*fcf5ef2aSThomas Huth                                     AES_Td1[st.B(AES_ishifts[4*i+1])] ^
2219*fcf5ef2aSThomas Huth                                     AES_Td2[st.B(AES_ishifts[4*i+2])] ^
2220*fcf5ef2aSThomas Huth                                     AES_Td3[st.B(AES_ishifts[4*i+3])]);
2221*fcf5ef2aSThomas Huth     }
2222*fcf5ef2aSThomas Huth }
2223*fcf5ef2aSThomas Huth 
2224*fcf5ef2aSThomas Huth void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2225*fcf5ef2aSThomas Huth {
2226*fcf5ef2aSThomas Huth     int i;
2227*fcf5ef2aSThomas Huth     Reg st = *d;
2228*fcf5ef2aSThomas Huth     Reg rk = *s;
2229*fcf5ef2aSThomas Huth 
2230*fcf5ef2aSThomas Huth     for (i = 0; i < 16; i++) {
2231*fcf5ef2aSThomas Huth         d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]);
2232*fcf5ef2aSThomas Huth     }
2233*fcf5ef2aSThomas Huth }
2234*fcf5ef2aSThomas Huth 
2235*fcf5ef2aSThomas Huth void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2236*fcf5ef2aSThomas Huth {
2237*fcf5ef2aSThomas Huth     int i;
2238*fcf5ef2aSThomas Huth     Reg st = *d;
2239*fcf5ef2aSThomas Huth     Reg rk = *s;
2240*fcf5ef2aSThomas Huth 
2241*fcf5ef2aSThomas Huth     for (i = 0 ; i < 4 ; i++) {
2242*fcf5ef2aSThomas Huth         d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^
2243*fcf5ef2aSThomas Huth                                     AES_Te1[st.B(AES_shifts[4*i+1])] ^
2244*fcf5ef2aSThomas Huth                                     AES_Te2[st.B(AES_shifts[4*i+2])] ^
2245*fcf5ef2aSThomas Huth                                     AES_Te3[st.B(AES_shifts[4*i+3])]);
2246*fcf5ef2aSThomas Huth     }
2247*fcf5ef2aSThomas Huth }
2248*fcf5ef2aSThomas Huth 
2249*fcf5ef2aSThomas Huth void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2250*fcf5ef2aSThomas Huth {
2251*fcf5ef2aSThomas Huth     int i;
2252*fcf5ef2aSThomas Huth     Reg st = *d;
2253*fcf5ef2aSThomas Huth     Reg rk = *s;
2254*fcf5ef2aSThomas Huth 
2255*fcf5ef2aSThomas Huth     for (i = 0; i < 16; i++) {
2256*fcf5ef2aSThomas Huth         d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]);
2257*fcf5ef2aSThomas Huth     }
2258*fcf5ef2aSThomas Huth 
2259*fcf5ef2aSThomas Huth }
2260*fcf5ef2aSThomas Huth 
2261*fcf5ef2aSThomas Huth void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2262*fcf5ef2aSThomas Huth {
2263*fcf5ef2aSThomas Huth     int i;
2264*fcf5ef2aSThomas Huth     Reg tmp = *s;
2265*fcf5ef2aSThomas Huth 
2266*fcf5ef2aSThomas Huth     for (i = 0 ; i < 4 ; i++) {
2267*fcf5ef2aSThomas Huth         d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^
2268*fcf5ef2aSThomas Huth                           AES_imc[tmp.B(4*i+1)][1] ^
2269*fcf5ef2aSThomas Huth                           AES_imc[tmp.B(4*i+2)][2] ^
2270*fcf5ef2aSThomas Huth                           AES_imc[tmp.B(4*i+3)][3]);
2271*fcf5ef2aSThomas Huth     }
2272*fcf5ef2aSThomas Huth }
2273*fcf5ef2aSThomas Huth 
2274*fcf5ef2aSThomas Huth void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2275*fcf5ef2aSThomas Huth                                           uint32_t ctrl)
2276*fcf5ef2aSThomas Huth {
2277*fcf5ef2aSThomas Huth     int i;
2278*fcf5ef2aSThomas Huth     Reg tmp = *s;
2279*fcf5ef2aSThomas Huth 
2280*fcf5ef2aSThomas Huth     for (i = 0 ; i < 4 ; i++) {
2281*fcf5ef2aSThomas Huth         d->B(i) = AES_sbox[tmp.B(i + 4)];
2282*fcf5ef2aSThomas Huth         d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
2283*fcf5ef2aSThomas Huth     }
2284*fcf5ef2aSThomas Huth     d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
2285*fcf5ef2aSThomas Huth     d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
2286*fcf5ef2aSThomas Huth }
2287*fcf5ef2aSThomas Huth #endif
2288*fcf5ef2aSThomas Huth 
2289*fcf5ef2aSThomas Huth #undef SHIFT
2290*fcf5ef2aSThomas Huth #undef XMM_ONLY
2291*fcf5ef2aSThomas Huth #undef Reg
2292*fcf5ef2aSThomas Huth #undef B
2293*fcf5ef2aSThomas Huth #undef W
2294*fcf5ef2aSThomas Huth #undef L
2295*fcf5ef2aSThomas Huth #undef Q
2296*fcf5ef2aSThomas Huth #undef SUFFIX
2297