xref: /openbmc/qemu/target/i386/ops_sse.h (revision f723f626)
1 /*
2  *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3  *
4  *  Copyright (c) 2005 Fabrice Bellard
5  *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "crypto/aes.h"
22 
/*
 * Width selection for this template.  SHIFT is not defined in this file and
 * is presumably supplied by whoever includes it.  SHIFT == 0 maps Reg and
 * the B/W/L/Q element accessors onto the MMX_* variants and names helpers
 * with the _mmx suffix; any other value maps them onto the ZMM_* variants,
 * using _xmm for SHIFT == 1 and _ymm otherwise.  XMM_ONLY(...) expands to
 * nothing for SHIFT == 0 and to its arguments otherwise.
 */
#if SHIFT == 0
#define Reg MMXReg
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) MMX_Q(n)
#define SUFFIX _mmx
#else
#define Reg ZMMReg
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
#if SHIFT == 1
#define SUFFIX _xmm
#else
#define SUFFIX _ymm
#endif
#endif

/* LANE_WIDTH is 8 for SHIFT == 0 and 16 otherwise; PACK_WIDTH is half of it. */
#define LANE_WIDTH (SHIFT ? 16 : 8)
#define PACK_WIDTH (LANE_WIDTH / 2)
47 
#if SHIFT == 0
/*
 * Element shift primitives, defined only on the SHIFT == 0 pass.
 *   FPSRL(x, c)  - right shift of x by c, in x's own (promoted) type
 *   FPSRAW(x, c) - right shift of x reinterpreted as int16_t
 *   FPSRAL(x, c) - right shift of x reinterpreted as int32_t
 *   FPSLL(x, c)  - left shift of x by c
 * The count is taken from the macro argument rather than from whatever
 * variable named "shift" happens to be in scope at the expansion site.
 */
#define FPSRL(x, c) ((x) >> (c))
#define FPSRAW(x, c) ((int16_t)(x) >> (c))
#define FPSRAL(x, c) ((int32_t)(x) >> (c))
#define FPSLL(x, c) ((x) << (c))
#endif
54 
55 void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
56 {
57     int shift;
58     if (c->Q(0) > 15) {
59         for (int i = 0; i < 1 << SHIFT; i++) {
60             d->Q(i) = 0;
61         }
62     } else {
63         shift = c->B(0);
64         for (int i = 0; i < 4 << SHIFT; i++) {
65             d->W(i) = FPSRL(s->W(i), shift);
66         }
67     }
68 }
69 
70 void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
71 {
72     int shift;
73     if (c->Q(0) > 15) {
74         for (int i = 0; i < 1 << SHIFT; i++) {
75             d->Q(i) = 0;
76         }
77     } else {
78         shift = c->B(0);
79         for (int i = 0; i < 4 << SHIFT; i++) {
80             d->W(i) = FPSLL(s->W(i), shift);
81         }
82     }
83 }
84 
85 void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
86 {
87     int shift;
88     if (c->Q(0) > 15) {
89         shift = 15;
90     } else {
91         shift = c->B(0);
92     }
93     for (int i = 0; i < 4 << SHIFT; i++) {
94         d->W(i) = FPSRAW(s->W(i), shift);
95     }
96 }
97 
98 void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
99 {
100     int shift;
101     if (c->Q(0) > 31) {
102         for (int i = 0; i < 1 << SHIFT; i++) {
103             d->Q(i) = 0;
104         }
105     } else {
106         shift = c->B(0);
107         for (int i = 0; i < 2 << SHIFT; i++) {
108             d->L(i) = FPSRL(s->L(i), shift);
109         }
110     }
111 }
112 
113 void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
114 {
115     int shift;
116     if (c->Q(0) > 31) {
117         for (int i = 0; i < 1 << SHIFT; i++) {
118             d->Q(i) = 0;
119         }
120     } else {
121         shift = c->B(0);
122         for (int i = 0; i < 2 << SHIFT; i++) {
123             d->L(i) = FPSLL(s->L(i), shift);
124         }
125     }
126 }
127 
128 void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
129 {
130     int shift;
131     if (c->Q(0) > 31) {
132         shift = 31;
133     } else {
134         shift = c->B(0);
135     }
136     for (int i = 0; i < 2 << SHIFT; i++) {
137         d->L(i) = FPSRAL(s->L(i), shift);
138     }
139 }
140 
141 void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
142 {
143     int shift;
144     if (c->Q(0) > 63) {
145         for (int i = 0; i < 1 << SHIFT; i++) {
146             d->Q(i) = 0;
147         }
148     } else {
149         shift = c->B(0);
150         for (int i = 0; i < 1 << SHIFT; i++) {
151             d->Q(i) = FPSRL(s->Q(i), shift);
152         }
153     }
154 }
155 
156 void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
157 {
158     int shift;
159     if (c->Q(0) > 63) {
160         for (int i = 0; i < 1 << SHIFT; i++) {
161             d->Q(i) = 0;
162         }
163     } else {
164         shift = c->B(0);
165         for (int i = 0; i < 1 << SHIFT; i++) {
166             d->Q(i) = FPSLL(s->Q(i), shift);
167         }
168     }
169 }
170 
171 #if SHIFT >= 1
172 void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
173 {
174     int shift, i, j;
175 
176     shift = c->L(0);
177     if (shift > 16) {
178         shift = 16;
179     }
180     for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
181         for (i = 0; i < 16 - shift; i++) {
182             d->B(j + i) = s->B(j + i + shift);
183         }
184         for (i = 16 - shift; i < 16; i++) {
185             d->B(j + i) = 0;
186         }
187     }
188 }
189 
190 void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
191 {
192     int shift, i, j;
193 
194     shift = c->L(0);
195     if (shift > 16) {
196         shift = 16;
197     }
198     for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
199         for (i = 15; i >= shift; i--) {
200             d->B(j + i) = s->B(j + i - shift);
201         }
202         for (i = 0; i < shift; i++) {
203             d->B(j + i) = 0;
204         }
205     }
206 }
207 #endif
208 
/*
 * SSE_HELPER_1(name, elem, num, F): define name##SUFFIX(env, d, s), which
 * applies the unary operation F to each of the first num "elem" elements of
 * s and stores the results in the matching elements of d.
 */
#define SSE_HELPER_1(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        const int count = num;                                  \
        int idx = 0;                                            \
        while (idx < count) {                                   \
            d->elem(idx) = F(s->elem(idx));                     \
            idx++;                                              \
        }                                                       \
    }
217 
/*
 * SSE_HELPER_2(name, elem, num, F): define name##SUFFIX(env, d, v, s), which
 * applies the binary operation F to each of the first num "elem" element
 * pairs of v and s and stores the results in the matching elements of d.
 */
#define SSE_HELPER_2(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)   \
    {                                                           \
        const int count = num;                                  \
        int idx = 0;                                            \
        while (idx < count) {                                   \
            d->elem(idx) = F(v->elem(idx), s->elem(idx));       \
            idx++;                                              \
        }                                                       \
    }
226 
/*
 * Element-width shorthands for SSE_HELPER_2: each fixes the accessor and the
 * element count (8, 4, 2 or 1 elements, scaled by 1 << SHIFT).
 */
#define SSE_HELPER_B(name, F)                                   \
    SSE_HELPER_2(name, B, 8 << SHIFT, F)

#define SSE_HELPER_W(name, F)                                   \
    SSE_HELPER_2(name, W, 4 << SHIFT, F)

#define SSE_HELPER_L(name, F)                                   \
    SSE_HELPER_2(name, L, 2 << SHIFT, F)

#define SSE_HELPER_Q(name, F)                                   \
    SSE_HELPER_2(name, Q, 1 << SHIFT, F)
238 
239 #if SHIFT == 0
/* Clamp x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    const int lo = 0;
    const int hi = 255;

    return x < lo ? lo : (x > hi ? hi : x);
}
250 
/* Clamp x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    const int lo = 0;
    const int hi = 65535;

    return x < lo ? lo : (x > hi ? hi : x);
}
261 
/* Clamp x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    const int lo = -128;
    const int hi = 127;

    return x < lo ? lo : (x > hi ? hi : x);
}
272 
/* Clamp x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    const int lo = -32768;
    const int hi = 32767;

    return x < lo ? lo : (x > hi ? hi : x);
}
283 
/*
 * Per-element arithmetic primitives handed to the SSE_HELPER_* generators.
 * The *U* forms saturate through satub/satuw, the *S* forms reinterpret
 * their operands as signed before saturating through satsb/satsw.
 */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
/* Whole expansion is parenthesised so the ?: cannot bind to its context. */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
/*
 * Unsigned high half: multiply as uint32_t, because two 16-bit operands
 * promoted to int can overflow int (0xFFFF * 0xFFFF).
 */
#define FMULHUW(a, b) ((uint32_t)(a) * (uint32_t)(b) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)

/* Average rounded up: (a + b + 1) / 2. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
305 #endif
306 
/* Per-word high-half multiplies: FMULHUW and FMULHW applied to each word. */
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)
309 
310 #if SHIFT == 0
311 void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
312 {
313     d->W(0) = FMULHRW(d->W(0), s->W(0));
314     d->W(1) = FMULHRW(d->W(1), s->W(1));
315     d->W(2) = FMULHRW(d->W(2), s->W(2));
316     d->W(3) = FMULHRW(d->W(3), s->W(3));
317 }
318 #endif
319 
/* Averages via FAVG, per byte and per word. */
SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)
322 
323 void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
324 {
325     int i;
326 
327     for (i = 0; i < (1 << SHIFT); i++) {
328         d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2);
329     }
330 }
331 
332 void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
333 {
334     int i;
335 
336     for (i = 0; i < (2 << SHIFT); i++) {
337         d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
338             (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
339     }
340 }
341 
342 #if SHIFT == 0
/* Absolute value of a. */
static inline int abs1(int a)
{
    return (a < 0) ? -a : a;
}
351 #endif
352 void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
353 {
354     int i;
355 
356     for (i = 0; i < (1 << SHIFT); i++) {
357         unsigned int val = 0;
358         val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0));
359         val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1));
360         val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2));
361         val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3));
362         val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4));
363         val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5));
364         val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6));
365         val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7));
366         d->Q(i) = val;
367     }
368 }
369 
370 #if SHIFT < 2
371 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
372                                   target_ulong a0)
373 {
374     int i;
375 
376     for (i = 0; i < (8 << SHIFT); i++) {
377         if (s->B(i) & 0x80) {
378             cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
379         }
380     }
381 }
382 #endif
383 
/*
 * SHUFFLE4(F, a, b, offset): select four F-sized elements using the 2-bit
 * fields of "order" (bits 1:0 and 3:2 index into a, bits 5:4 and 7:6 index
 * into b, all relative to offset) and store them in d at offset..offset + 3.
 * Expects "order", "d" and the temporaries r0..r3 to exist in the caller's
 * scope.  All four reads happen before any write, so d may be the same
 * register as a or b.
 */
#define SHUFFLE4(F, a, b, offset) do {      \
    r0 = a->F((order & 3) + offset);        \
    r1 = a->F(((order >> 2) & 3) + offset); \
    r2 = b->F(((order >> 4) & 3) + offset); \
    r3 = b->F(((order >> 6) & 3) + offset); \
    d->F(offset) = r0;                      \
    d->F(offset + 1) = r1;                  \
    d->F(offset + 2) = r2;                  \
    d->F(offset + 3) = r3;                  \
    } while (0)
394 
395 #if SHIFT == 0
396 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
397 {
398     uint16_t r0, r1, r2, r3;
399 
400     SHUFFLE4(W, s, s, 0);
401 }
402 #else
403 void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
404 {
405     uint32_t r0, r1, r2, r3;
406     int i;
407 
408     for (i = 0; i < 2 << SHIFT; i += 4) {
409         SHUFFLE4(L, v, s, i);
410     }
411 }
412 
413 void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
414 {
415     uint64_t r0, r1;
416     int i;
417 
418     for (i = 0; i < 1 << SHIFT; i += 2) {
419         r0 = v->Q(((order & 1) & 1) + i);
420         r1 = s->Q(((order >> 1) & 1) + i);
421         d->Q(i) = r0;
422         d->Q(i + 1) = r1;
423         order >>= 2;
424     }
425 }
426 
427 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
428 {
429     uint32_t r0, r1, r2, r3;
430     int i;
431 
432     for (i = 0; i < 2 << SHIFT; i += 4) {
433         SHUFFLE4(L, s, s, i);
434     }
435 }
436 
437 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
438 {
439     uint16_t r0, r1, r2, r3;
440     int i, j;
441 
442     for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
443         SHUFFLE4(W, s, s, i);
444         d->Q(j) = s->Q(j);
445     }
446 }
447 
448 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
449 {
450     uint16_t r0, r1, r2, r3;
451     int i, j;
452 
453     for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
454         d->Q(j) = s->Q(j);
455         SHUFFLE4(W, s, s, i);
456     }
457 }
458 #endif
459 
460 #if SHIFT >= 1
461 /* FPU ops */
462 /* XXX: not accurate */
463 
/*
 * SSE_HELPER_P(name, F): define the packed helpers helper_<name>ps and
 * helper_<name>pd (with SUFFIX).  Each applies F(size, a, b) element by
 * element to v and s, on 2 << SHIFT single or 1 << SHIFT double elements.
 */
#define SSE_HELPER_P(name, F)                                           \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *v, Reg *s)                                     \
    {                                                                   \
        for (int lane = 0; lane < (2 << SHIFT); lane++) {               \
            d->ZMM_S(lane) = F(32, v->ZMM_S(lane), s->ZMM_S(lane));     \
        }                                                               \
    }                                                                   \
                                                                        \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *v, Reg *s)                                     \
    {                                                                   \
        for (int lane = 0; lane < (1 << SHIFT); lane++) {               \
            d->ZMM_D(lane) = F(64, v->ZMM_D(lane), s->ZMM_D(lane));     \
        }                                                               \
    }
482 
#if SHIFT == 1

/*
 * SSE_HELPER_S(name, F): on the SHIFT == 1 pass, emit the packed helpers via
 * SSE_HELPER_P plus the scalar helper_<name>ss and helper_<name>sd, which
 * compute element 0 with F and copy the remaining elements from v.
 */
#define SSE_HELPER_S(name, F)                                           \
    SSE_HELPER_P(name, F)                                               \
                                                                        \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    {                                                                   \
        d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0));                  \
        for (int lane = 1; lane < (2 << SHIFT); lane++) {               \
            d->ZMM_L(lane) = v->ZMM_L(lane);                            \
        }                                                               \
    }                                                                   \
                                                                        \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    {                                                                   \
        d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0));                  \
        for (int lane = 1; lane < (1 << SHIFT); lane++) {               \
            d->ZMM_Q(lane) = v->ZMM_Q(lane);                            \
        }                                                               \
    }

#else

/* On the other passes only the packed helpers are emitted. */
#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)

#endif
511 
/*
 * Size-generic wrappers around the float32_* and float64_* routines.  "size"
 * is pasted into the function name, and every call passes &env->sse_status,
 * so these can only be expanded where a variable named env is in scope.
 */
#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)

/* Note that the choice of comparison op here is important to get the
 * special cases right: for min and max Intel specifies that (-0,0),
 * (NaN, anything) and (anything, NaN) return the second argument.
 * Both macros yield (b) whenever the less-than test is false.
 */
#define FPU_MIN(size, a, b)                                     \
    (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
#define FPU_MAX(size, a, b)                                     \
    (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))

/* Instantiate the add/sub/mul/div/min/max helper families. */
SSE_HELPER_S(add, FPU_ADD)
SSE_HELPER_S(sub, FPU_SUB)
SSE_HELPER_S(mul, FPU_MUL)
SSE_HELPER_S(div, FPU_DIV)
SSE_HELPER_S(min, FPU_MIN)
SSE_HELPER_S(max, FPU_MAX)
532 
533 void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
534 {
535     int i;
536     for (i = 0; i < 2 << SHIFT; i++) {
537         d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status);
538     }
539 }
540 
541 void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
542 {
543     int i;
544     for (i = 0; i < 1 << SHIFT; i++) {
545         d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status);
546     }
547 }
548 
549 #if SHIFT == 1
550 void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
551 {
552     int i;
553     d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status);
554     for (i = 1; i < 2 << SHIFT; i++) {
555         d->ZMM_L(i) = v->ZMM_L(i);
556     }
557 }
558 
559 void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
560 {
561     int i;
562     d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status);
563     for (i = 1; i < 1 << SHIFT; i++) {
564         d->ZMM_Q(i) = v->ZMM_Q(i);
565     }
566 }
567 #endif
568 
569 /* float to float conversions */
570 void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
571 {
572     int i;
573     for (i = 1 << SHIFT; --i >= 0; ) {
574         d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status);
575     }
576 }
577 
578 void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
579 {
580     int i;
581     for (i = 0; i < 1 << SHIFT; i++) {
582          d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status);
583     }
584     for (i >>= 1; i < 1 << SHIFT; i++) {
585          d->Q(i) = 0;
586     }
587 }
588 
589 #if SHIFT == 1
590 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
591 {
592     int i;
593     d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
594     for (i = 1; i < 1 << SHIFT; i++) {
595         d->ZMM_Q(i) = v->ZMM_Q(i);
596     }
597 }
598 
599 void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
600 {
601     int i;
602     d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
603     for (i = 1; i < 2 << SHIFT; i++) {
604         d->ZMM_L(i) = v->ZMM_L(i);
605     }
606 }
607 #endif
608 
609 /* integer to float */
610 void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
611 {
612     int i;
613     for (i = 0; i < 2 << SHIFT; i++) {
614         d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status);
615     }
616 }
617 
618 void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
619 {
620     int i;
621     for (i = 1 << SHIFT; --i >= 0; ) {
622         int32_t l = s->ZMM_L(i);
623         d->ZMM_D(i) = int32_to_float64(l, &env->sse_status);
624     }
625 }
626 
627 #if SHIFT == 1
628 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
629 {
630     d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
631     d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
632 }
633 
634 void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
635 {
636     d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
637     d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
638 }
639 
640 void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
641 {
642     d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
643 }
644 
645 void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
646 {
647     d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
648 }
649 
650 #ifdef TARGET_X86_64
651 void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
652 {
653     d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
654 }
655 
656 void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
657 {
658     d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
659 }
660 #endif
661 
662 #endif
663 
664 /* float to integer */
665 
666 #if SHIFT == 1
667 /*
668  * x86 mandates that we return the indefinite integer value for the result
669  * of any float-to-integer conversion that raises the 'invalid' exception.
670  * Wrap the softfloat functions to get this behaviour.
671  */
672 #define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)              \
673     static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)        \
674     {                                                                   \
675         int oldflags, newflags;                                         \
676         RETTYPE r;                                                      \
677                                                                         \
678         oldflags = get_float_exception_flags(s);                        \
679         set_float_exception_flags(0, s);                                \
680         r = FN(a, s);                                                   \
681         newflags = get_float_exception_flags(s);                        \
682         if (newflags & float_flag_invalid) {                            \
683             r = INDEFVALUE;                                             \
684         }                                                               \
685         set_float_exception_flags(newflags | oldflags, s);              \
686         return r;                                                       \
687     }
688 
689 WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
690 WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
691 WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
692 WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
693 WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
694 WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
695 WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
696 WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
697 #endif
698 
699 void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
700 {
701     int i;
702     for (i = 0; i < 2 << SHIFT; i++) {
703         d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status);
704     }
705 }
706 
707 void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
708 {
709     int i;
710     for (i = 0; i < 1 << SHIFT; i++) {
711         d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status);
712     }
713     for (i >>= 1; i < 1 << SHIFT; i++) {
714          d->Q(i) = 0;
715     }
716 }
717 
718 #if SHIFT == 1
719 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
720 {
721     d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
722     d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
723 }
724 
725 void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
726 {
727     d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
728     d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
729 }
730 
731 int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
732 {
733     return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
734 }
735 
736 int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
737 {
738     return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
739 }
740 
741 #ifdef TARGET_X86_64
742 int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
743 {
744     return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
745 }
746 
747 int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
748 {
749     return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
750 }
751 #endif
752 #endif
753 
754 /* float to integer truncated */
755 void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
756 {
757     int i;
758     for (i = 0; i < 2 << SHIFT; i++) {
759         d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i),
760                                                          &env->sse_status);
761     }
762 }
763 
764 void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
765 {
766     int i;
767     for (i = 0; i < 1 << SHIFT; i++) {
768         d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i),
769                                                          &env->sse_status);
770     }
771     for (i >>= 1; i < 1 << SHIFT; i++) {
772          d->Q(i) = 0;
773     }
774 }
775 
776 #if SHIFT == 1
777 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
778 {
779     d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
780     d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
781 }
782 
783 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
784 {
785     d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
786     d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
787 }
788 
789 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
790 {
791     return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
792 }
793 
794 int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
795 {
796     return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
797 }
798 
799 #ifdef TARGET_X86_64
800 int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
801 {
802     return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
803 }
804 
805 int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
806 {
807     return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
808 }
809 #endif
810 #endif
811 
812 void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
813 {
814     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
815     int i;
816     for (i = 0; i < 2 << SHIFT; i++) {
817         d->ZMM_S(i) = float32_div(float32_one,
818                                   float32_sqrt(s->ZMM_S(i), &env->sse_status),
819                                   &env->sse_status);
820     }
821     set_float_exception_flags(old_flags, &env->sse_status);
822 }
823 
824 #if SHIFT == 1
825 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
826 {
827     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
828     int i;
829     d->ZMM_S(0) = float32_div(float32_one,
830                               float32_sqrt(s->ZMM_S(0), &env->sse_status),
831                               &env->sse_status);
832     set_float_exception_flags(old_flags, &env->sse_status);
833     for (i = 1; i < 2 << SHIFT; i++) {
834         d->ZMM_L(i) = v->ZMM_L(i);
835     }
836 }
837 #endif
838 
839 void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
840 {
841     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
842     int i;
843     for (i = 0; i < 2 << SHIFT; i++) {
844         d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status);
845     }
846     set_float_exception_flags(old_flags, &env->sse_status);
847 }
848 
849 #if SHIFT == 1
850 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
851 {
852     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
853     int i;
854     d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
855     for (i = 1; i < 2 << SHIFT; i++) {
856         d->ZMM_L(i) = v->ZMM_L(i);
857     }
858     set_float_exception_flags(old_flags, &env->sse_status);
859 }
860 #endif
861 
862 #if SHIFT == 1
/*
 * Extract a bit field from src: shift right by "shift" and keep the low
 * "len" bits.  A len of 0 keeps all 64 bits of the shifted value.
 */
static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
{
    uint64_t field = src >> shift;

    if (len != 0) {
        field &= (1ULL << len) - 1;
    }
    return field;
}
874 
875 void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
876 {
877     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1) & 63, s->ZMM_B(0) & 63);
878 }
879 
880 void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
881 {
882     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
883 }
884 
/*
 * Insert the low "len" bits of src into dest at bit position "shift",
 * leaving the other bits of dest unchanged.  A len of 0 selects a 64-bit
 * wide field.
 */
static inline uint64_t helper_insertq(uint64_t dest, uint64_t src, int shift, int len)
{
    const uint64_t mask = (len == 0) ? ~0ULL : (1ULL << len) - 1;
    const uint64_t kept = dest & ~(mask << shift);
    const uint64_t placed = (src & mask) << shift;

    return kept | placed;
}
896 
897 void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
898 {
899     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), s->ZMM_B(9) & 63, s->ZMM_B(8) & 63);
900 }
901 
902 void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int length)
903 {
904     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), index, length);
905 }
906 #endif
907 
/*
 * SSE_HELPER_HPS(name, F): define helper_<name> (with SUFFIX), a horizontal
 * single-precision operation.  Within each group of LANE_WIDTH / 4 elements,
 * the first two results are F applied to adjacent element pairs of v and the
 * next two are F applied to adjacent element pairs of s.  The index i is
 * deliberately carried over from the first inner loop into the second.
 * Results are built in the temporary r[] and copied to d only at the end, so
 * d is not written while v and s are still being read.
 */
#define SSE_HELPER_HPS(name, F)  \
void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{                                                                 \
    float32 r[2 << SHIFT];                                        \
    int i, j, k;                                                  \
    for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) {            \
        for (i = j = 0; j < 4; i++, j += 2) {                     \
            r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
        }                                                         \
        for (j = 0; j < 4; i++, j += 2) {                         \
            r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
        }                                                         \
    }                                                             \
    for (i = 0; i < 2 << SHIFT; i++) {                            \
        d->ZMM_S(i) = r[i];                                       \
    }                                                             \
}

/* Horizontal add and subtract on single-precision elements. */
SSE_HELPER_HPS(haddps, float32_add)
SSE_HELPER_HPS(hsubps, float32_sub)
928 
/*
 * SSE_HELPER_HPD(name, F): define helper_<name> (with SUFFIX), a horizontal
 * double-precision operation.  Within each group of LANE_WIDTH / 8 elements,
 * the first result is F applied to the element pair of v and the second is
 * F applied to the element pair of s.  The index i is deliberately carried
 * over from the first inner loop into the second.  Results are built in the
 * temporary r[] and copied to d only at the end, so d is not written while
 * v and s are still being read.
 */
#define SSE_HELPER_HPD(name, F)  \
void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{                                                                 \
    float64 r[1 << SHIFT];                                        \
    int i, j, k;                                                  \
    for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) {            \
        for (i = j = 0; j < 2; i++, j += 2) {                     \
            r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
        }                                                         \
        for (j = 0; j < 2; i++, j += 2) {                         \
            r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
        }                                                         \
    }                                                             \
    for (i = 0; i < 1 << SHIFT; i++) {                            \
        d->ZMM_D(i) = r[i];                                       \
    }                                                             \
}

/* Horizontal add and subtract on double-precision elements. */
SSE_HELPER_HPD(haddpd, float64_add)
SSE_HELPER_HPD(hsubpd, float64_sub)
949 
950 void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
951 {
952     int i;
953     for (i = 0; i < 2 << SHIFT; i += 2) {
954         d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
955         d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
956     }
957 }
958 
959 void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
960 {
961     int i;
962     for (i = 0; i < 1 << SHIFT; i += 2) {
963         d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
964         d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status);
965     }
966 }
967 
/*
 * Generate the packed compare helpers helper_<name>ps and helper_<name>pd.
 * F(size, a, b) performs the comparison and C() turns its result into a
 * truth value; each destination element becomes all ones when C holds and
 * zero otherwise.
 */
#define SSE_HELPER_CMP_P(name, F, C)                                    \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
                                             Reg *d, Reg *v, Reg *s)    \
    {                                                                   \
        for (int n = 0; n < 2 << SHIFT; n++) {                          \
            if (C(F(32, v->ZMM_S(n), s->ZMM_S(n)))) {                   \
                d->ZMM_L(n) = -1;                                       \
            } else {                                                    \
                d->ZMM_L(n) = 0;                                        \
            }                                                           \
        }                                                               \
    }                                                                   \
                                                                        \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
                                             Reg *d, Reg *v, Reg *s)    \
    {                                                                   \
        for (int n = 0; n < 1 << SHIFT; n++) {                          \
            if (C(F(64, v->ZMM_D(n), s->ZMM_D(n)))) {                   \
                d->ZMM_Q(n) = -1;                                       \
            } else {                                                    \
                d->ZMM_Q(n) = 0;                                        \
            }                                                           \
        }                                                               \
    }
986 
#if SHIFT == 1
/*
 * For the SHIFT == 1 instantiation SSE_HELPER_CMP emits the packed helpers
 * (via SSE_HELPER_CMP_P) plus the scalar helper_<name>ss / helper_<name>sd
 * variants.  The scalar forms compare element 0 only and copy every other
 * element of the destination from v.
 */
#define SSE_HELPER_CMP(name, F, C)                                          \
    SSE_HELPER_CMP_P(name, F, C)                                            \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
    {                                                                       \
        int i;                                                              \
        d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0;          \
        for (i = 1; i < 2 << SHIFT; i++) {                                  \
            d->ZMM_L(i) = v->ZMM_L(i);                                      \
        }                                                                   \
    }                                                                       \
                                                                            \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
    {                                                                       \
        int i;                                                              \
        d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0;          \
        for (i = 1; i < 1 << SHIFT; i++) {                                  \
            d->ZMM_Q(i) = v->ZMM_Q(i);                                      \
        }                                                                   \
    }

/*
 * Predicates over a FloatRelation.  The two-way tests are functions so the
 * relation is evaluated exactly once; the single-comparison tests are
 * macros with their argument fully parenthesized.
 */
static inline bool FPU_EQU(FloatRelation x)
{
    return (x == float_relation_equal || x == float_relation_unordered);
}
static inline bool FPU_GE(FloatRelation x)
{
    return (x == float_relation_equal || x == float_relation_greater);
}
#define FPU_EQ(x) ((x) == float_relation_equal)
#define FPU_LT(x) ((x) == float_relation_less)
#define FPU_LE(x) ((x) <= float_relation_equal)
#define FPU_GT(x) ((x) == float_relation_greater)
#define FPU_UNORD(x) ((x) == float_relation_unordered)
/*
 * Always false, but the argument must still be evaluated in case it is a
 * signaling NaN: the comparison on the left of && runs before the constant.
 */
#define FPU_FALSE(x) ((x) == float_relation_equal && 0)

/* Quiet comparison: expands to float<size>_compare_quiet(). */
#define FPU_CMPQ(size, a, b) \
    float ## size ## _compare_quiet(a, b, &env->sse_status)
/* Non-quiet comparison: expands to float<size>_compare(). */
#define FPU_CMPS(size, a, b) \
    float ## size ## _compare(a, b, &env->sse_status)

#else
/* Other SHIFT values only need the packed helpers. */
#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C)
#endif
1032 
/*
 * Instantiate the comparison helpers.  Each line binds a helper name to a
 * softfloat comparison (FPU_CMPQ expands to float*_compare_quiet, FPU_CMPS
 * to float*_compare) and to the predicate applied to the resulting
 * FloatRelation; a leading '!' negates the predicate.
 *
 * First two groups: the base set of sixteen predicates.
 */
1033 SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
1034 SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
1035 SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE)
1036 SSE_HELPER_CMP(cmpunord, FPU_CMPQ,  FPU_UNORD)
1037 SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ)
1038 SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
1039 SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
1040 SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)
1041 
1042 SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU)
1043 SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE)
1044 SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT)
1045 SSE_HELPER_CMP(cmpfalse, FPU_CMPQ,  FPU_FALSE)
1046 SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU)
1047 SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE)
1048 SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT)
1049 SSE_HELPER_CMP(cmptrue, FPU_CMPQ,  !FPU_FALSE)
1050 
/*
 * Last two groups: the same predicates in the same order, each paired with
 * the opposite comparison flavour (FPU_CMPQ <-> FPU_CMPS) from its
 * counterpart above.
 */
1051 SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ)
1052 SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT)
1053 SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE)
1054 SSE_HELPER_CMP(cmpunords, FPU_CMPS,  FPU_UNORD)
1055 SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ)
1056 SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT)
1057 SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE)
1058 SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD)
1059 
1060 SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU)
1061 SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE)
1062 SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT)
1063 SSE_HELPER_CMP(cmpfalses, FPU_CMPS,  FPU_FALSE)
1064 SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU)
1065 SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE)
1066 SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT)
1067 SSE_HELPER_CMP(cmptrues, FPU_CMPS,  !FPU_FALSE)
1068 
/* The generator is redefined on every inclusion of this header. */
1069 #undef SSE_HELPER_CMP
1070 
1071 #if SHIFT == 1
1072 static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
1073 
1074 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
1075 {
1076     FloatRelation ret;
1077     float32 s0, s1;
1078 
1079     s0 = d->ZMM_S(0);
1080     s1 = s->ZMM_S(0);
1081     ret = float32_compare_quiet(s0, s1, &env->sse_status);
1082     CC_SRC = comis_eflags[ret + 1];
1083 }
1084 
1085 void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
1086 {
1087     FloatRelation ret;
1088     float32 s0, s1;
1089 
1090     s0 = d->ZMM_S(0);
1091     s1 = s->ZMM_S(0);
1092     ret = float32_compare(s0, s1, &env->sse_status);
1093     CC_SRC = comis_eflags[ret + 1];
1094 }
1095 
1096 void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
1097 {
1098     FloatRelation ret;
1099     float64 d0, d1;
1100 
1101     d0 = d->ZMM_D(0);
1102     d1 = s->ZMM_D(0);
1103     ret = float64_compare_quiet(d0, d1, &env->sse_status);
1104     CC_SRC = comis_eflags[ret + 1];
1105 }
1106 
1107 void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
1108 {
1109     FloatRelation ret;
1110     float64 d0, d1;
1111 
1112     d0 = d->ZMM_D(0);
1113     d1 = s->ZMM_D(0);
1114     ret = float64_compare(d0, d1, &env->sse_status);
1115     CC_SRC = comis_eflags[ret + 1];
1116 }
1117 #endif
1118 
1119 uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s)
1120 {
1121     uint32_t mask;
1122     int i;
1123 
1124     mask = 0;
1125     for (i = 0; i < 2 << SHIFT; i++) {
1126         mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i);
1127     }
1128     return mask;
1129 }
1130 
1131 uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
1132 {
1133     uint32_t mask;
1134     int i;
1135 
1136     mask = 0;
1137     for (i = 0; i < 1 << SHIFT; i++) {
1138         mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i);
1139     }
1140     return mask;
1141 }
1142 
1143 #endif
1144 
1145 #define PACK_HELPER_B(name, F) \
1146 void glue(helper_pack ## name, SUFFIX)(CPUX86State *env,      \
1147         Reg *d, Reg *v, Reg *s)                               \
1148 {                                                             \
1149     uint8_t r[PACK_WIDTH * 2];                                \
1150     int j, k;                                                 \
1151     for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) {            \
1152         for (k = 0; k < PACK_WIDTH; k++) {                    \
1153             r[k] = F((int16_t)v->W(j + k));                   \
1154         }                                                     \
1155         for (k = 0; k < PACK_WIDTH; k++) {                    \
1156             r[PACK_WIDTH + k] = F((int16_t)s->W(j + k));      \
1157         }                                                     \
1158         for (k = 0; k < PACK_WIDTH * 2; k++) {                \
1159             d->B(2 * j + k) = r[k];                           \
1160         }                                                     \
1161     }                                                         \
1162 }
1163 
1164 PACK_HELPER_B(sswb, satsb)
1165 PACK_HELPER_B(uswb, satub)
1166 
1167 void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1168 {
1169     uint16_t r[PACK_WIDTH];
1170     int j, k;
1171 
1172     for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
1173         for (k = 0; k < PACK_WIDTH / 2; k++) {
1174             r[k] = satsw(v->L(j + k));
1175         }
1176         for (k = 0; k < PACK_WIDTH / 2; k++) {
1177             r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
1178         }
1179         for (k = 0; k < PACK_WIDTH; k++) {
1180             d->W(2 * j + k) = r[k];
1181         }
1182     }
1183 }
1184 
1185 #define UNPCK_OP(base_name, base)                                       \
1186                                                                         \
1187     void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
1188                                                 Reg *d, Reg *v, Reg *s) \
1189     {                                                                   \
1190         uint8_t r[PACK_WIDTH * 2];                                      \
1191         int j, i;                                                       \
1192                                                                         \
1193         for (j = 0; j < 8 << SHIFT; ) {                                 \
1194             int k = j + base * PACK_WIDTH;                              \
1195             for (i = 0; i < PACK_WIDTH; i++) {                          \
1196                 r[2 * i] = v->B(k + i);                                 \
1197                 r[2 * i + 1] = s->B(k + i);                             \
1198             }                                                           \
1199             for (i = 0; i < PACK_WIDTH * 2; i++, j++) {                 \
1200                 d->B(j) = r[i];                                         \
1201             }                                                           \
1202         }                                                               \
1203     }                                                                   \
1204                                                                         \
1205     void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
1206                                                 Reg *d, Reg *v, Reg *s) \
1207     {                                                                   \
1208         uint16_t r[PACK_WIDTH];                                         \
1209         int j, i;                                                       \
1210                                                                         \
1211         for (j = 0; j < 4 << SHIFT; ) {                                 \
1212             int k = j + base * PACK_WIDTH / 2;                          \
1213             for (i = 0; i < PACK_WIDTH / 2; i++) {                      \
1214                 r[2 * i] = v->W(k + i);                                 \
1215                 r[2 * i + 1] = s->W(k + i);                             \
1216             }                                                           \
1217             for (i = 0; i < PACK_WIDTH; i++, j++) {                     \
1218                 d->W(j) = r[i];                                         \
1219             }                                                           \
1220         }                                                               \
1221     }                                                                   \
1222                                                                         \
1223     void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
1224                                                 Reg *d, Reg *v, Reg *s) \
1225     {                                                                   \
1226         uint32_t r[PACK_WIDTH / 2];                                     \
1227         int j, i;                                                       \
1228                                                                         \
1229         for (j = 0; j < 2 << SHIFT; ) {                                 \
1230             int k = j + base * PACK_WIDTH / 4;                          \
1231             for (i = 0; i < PACK_WIDTH / 4; i++) {                      \
1232                 r[2 * i] = v->L(k + i);                                 \
1233                 r[2 * i + 1] = s->L(k + i);                             \
1234             }                                                           \
1235             for (i = 0; i < PACK_WIDTH / 2; i++, j++) {                 \
1236                 d->L(j) = r[i];                                         \
1237             }                                                           \
1238         }                                                               \
1239     }                                                                   \
1240                                                                         \
1241     XMM_ONLY(                                                           \
1242              void glue(helper_punpck ## base_name ## qdq, SUFFIX)(      \
1243                         CPUX86State *env, Reg *d, Reg *v, Reg *s)       \
1244              {                                                          \
1245                  uint64_t r[2];                                         \
1246                  int i;                                                 \
1247                                                                         \
1248                  for (i = 0; i < 1 << SHIFT; i += 2) {                  \
1249                      r[0] = v->Q(base + i);                             \
1250                      r[1] = s->Q(base + i);                             \
1251                      d->Q(i) = r[0];                                    \
1252                      d->Q(i + 1) = r[1];                                \
1253                  }                                                      \
1254              }                                                          \
1255                                                                         )
1256 
1257 UNPCK_OP(l, 0)
1258 UNPCK_OP(h, 1)
1259 
1260 #undef PACK_WIDTH
1261 #undef PACK_HELPER_B
1262 #undef UNPCK_OP
1263 
1264 
1265 /* 3DNow! float ops */
1266 #if SHIFT == 0
1267 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
1268 {
1269     d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1270     d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1271 }
1272 
1273 void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
1274 {
1275     d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1276     d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1277 }
1278 
1279 void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
1280 {
1281     d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1282     d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1283 }
1284 
1285 void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
1286 {
1287     d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
1288                                                        &env->mmx_status));
1289     d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
1290                                                        &env->mmx_status));
1291 }
1292 
1293 void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1294 {
1295     float32 r;
1296 
1297     r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1298     d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1299     d->MMX_S(0) = r;
1300 }
1301 
1302 void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
1303 {
1304     d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1305     d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1306 }
1307 
1308 void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
1309 {
1310     d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
1311                                    &env->mmx_status) ? -1 : 0;
1312     d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
1313                                    &env->mmx_status) ? -1 : 0;
1314 }
1315 
1316 void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
1317 {
1318     d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
1319                              &env->mmx_status) ? -1 : 0;
1320     d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
1321                              &env->mmx_status) ? -1 : 0;
1322 }
1323 
1324 void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
1325 {
1326     d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
1327                              &env->mmx_status) ? -1 : 0;
1328     d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
1329                              &env->mmx_status) ? -1 : 0;
1330 }
1331 
1332 void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
1333 {
1334     if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
1335         d->MMX_S(0) = s->MMX_S(0);
1336     }
1337     if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
1338         d->MMX_S(1) = s->MMX_S(1);
1339     }
1340 }
1341 
1342 void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
1343 {
1344     if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
1345         d->MMX_S(0) = s->MMX_S(0);
1346     }
1347     if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
1348         d->MMX_S(1) = s->MMX_S(1);
1349     }
1350 }
1351 
1352 void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
1353 {
1354     d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1355     d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1356 }
1357 
1358 void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1359 {
1360     float32 r;
1361 
1362     r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1363     d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1364     d->MMX_S(0) = r;
1365 }
1366 
1367 void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1368 {
1369     float32 r;
1370 
1371     r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1372     d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1373     d->MMX_S(0) = r;
1374 }
1375 
1376 void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
1377 {
1378     d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
1379     d->MMX_S(1) = d->MMX_S(0);
1380 }
1381 
1382 void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
1383 {
1384     d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
1385     d->MMX_S(1) = float32_div(float32_one,
1386                               float32_sqrt(d->MMX_S(1), &env->mmx_status),
1387                               &env->mmx_status);
1388     d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
1389     d->MMX_L(0) = d->MMX_L(1);
1390 }
1391 
1392 void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
1393 {
1394     d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1395     d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1396 }
1397 
1398 void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
1399 {
1400     d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1401     d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1402 }
1403 
1404 void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
1405 {
1406     uint32_t r;
1407 
1408     r = s->MMX_L(0);
1409     d->MMX_L(0) = s->MMX_L(1);
1410     d->MMX_L(1) = r;
1411 }
1412 #endif
1413 
1414 /* SSSE3 op helpers */
1415 void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1416 {
1417     int i;
1418 #if SHIFT == 0
1419     uint8_t r[8];
1420 
1421     for (i = 0; i < 8; i++) {
1422         r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
1423     }
1424     for (i = 0; i < 8; i++) {
1425         d->B(i) = r[i];
1426     }
1427 #else
1428     uint8_t r[8 << SHIFT];
1429 
1430     for (i = 0; i < 8 << SHIFT; i++) {
1431         int j = i & ~0xf;
1432         r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
1433     }
1434     for (i = 0; i < 8 << SHIFT; i++) {
1435         d->B(i) = r[i];
1436     }
1437 #endif
1438 }
1439 
1440 #define SSE_HELPER_HW(name, F)  \
1441 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
1442 {                                                          \
1443     uint16_t r[4 << SHIFT];                                \
1444     int i, j, k;                                           \
1445     for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) {     \
1446         for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1447             r[i + k] = F(v->W(j + k), v->W(j + k + 1));    \
1448         }                                                  \
1449         for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) {     \
1450             r[i + k] = F(s->W(j + k), s->W(j + k + 1));    \
1451         }                                                  \
1452     }                                                      \
1453     for (i = 0; i < 4 << SHIFT; i++) {                     \
1454         d->W(i) = r[i];                                    \
1455     }                                                      \
1456 }
1457 
1458 #define SSE_HELPER_HL(name, F)  \
1459 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
1460 {                                                          \
1461     uint32_t r[2 << SHIFT];                                \
1462     int i, j, k;                                           \
1463     for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) {     \
1464         for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1465             r[i + k] = F(v->L(j + k), v->L(j + k + 1));    \
1466         }                                                  \
1467         for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) {     \
1468             r[i + k] = F(s->L(j + k), s->L(j + k + 1));    \
1469         }                                                  \
1470     }                                                      \
1471     for (i = 0; i < 2 << SHIFT; i++) {                     \
1472         d->L(i) = r[i];                                    \
1473     }                                                      \
1474 }
1475 
1476 SSE_HELPER_HW(phaddw, FADD)
1477 SSE_HELPER_HW(phsubw, FSUB)
1478 SSE_HELPER_HW(phaddsw, FADDSW)
1479 SSE_HELPER_HW(phsubsw, FSUBSW)
1480 SSE_HELPER_HL(phaddd, FADD)
1481 SSE_HELPER_HL(phsubd, FSUB)
1482 
1483 #undef SSE_HELPER_HW
1484 #undef SSE_HELPER_HL
1485 
1486 void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1487 {
1488     int i;
1489     for (i = 0; i < 4 << SHIFT; i++) {
1490         d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
1491                         (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
1492     }
1493 }
1494 
/*
 * Signed 16x16 multiply, rounded (by adding 0x4000) and shifted right by 15.
 * Arguments are parenthesized so the casts apply to the whole argument
 * expression rather than to its first operand.
 */
#define FMULHRSW(d, s) (((int16_t)(d) * (int16_t)(s) + 0x4000) >> 15)
1496 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1497 
/*
 * Sign helpers: yield d when s is positive, 0 when s is zero, and the
 * negation of d when s has its sign bit set (s is compared as an unsigned
 * element against the signed maximum).  Every argument is parenthesized so
 * compound expressions are negated and tested as a whole.
 */
#define FSIGNB(d, s) ((s) <= INT8_MAX  ? ((s) ? (d) : 0) : -(int8_t)(d))
#define FSIGNW(d, s) ((s) <= INT16_MAX ? ((s) ? (d) : 0) : -(int16_t)(d))
#define FSIGNL(d, s) ((s) <= INT32_MAX ? ((s) ? (d) : 0) : -(int32_t)(d))
1501 SSE_HELPER_B(helper_psignb, FSIGNB)
1502 SSE_HELPER_W(helper_psignw, FSIGNW)
1503 SSE_HELPER_L(helper_psignd, FSIGNL)
1504 
1505 void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1506                                   uint32_t imm)
1507 {
1508     int i;
1509 
1510     /* XXX could be checked during translation */
1511     if (imm >= (SHIFT ? 32 : 16)) {
1512         for (i = 0; i < (1 << SHIFT); i++) {
1513             d->Q(i) = 0;
1514         }
1515     } else {
1516         int shift = imm * 8;
1517 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
1518 #if SHIFT == 0
1519         d->Q(0) = SHR(s->Q(0), shift - 0) |
1520             SHR(v->Q(0), shift -  64);
1521 #else
1522         for (i = 0; i < (1 << SHIFT); i += 2) {
1523             uint64_t r0, r1;
1524 
1525             r0 = SHR(s->Q(i), shift - 0) |
1526                  SHR(s->Q(i + 1), shift -  64) |
1527                  SHR(v->Q(i), shift - 128) |
1528                  SHR(v->Q(i + 1), shift - 192);
1529             r1 = SHR(s->Q(i), shift + 64) |
1530                  SHR(s->Q(i + 1), shift -   0) |
1531                  SHR(v->Q(i), shift -  64) |
1532                  SHR(v->Q(i + 1), shift - 128);
1533             d->Q(i) = r0;
1534             d->Q(i + 1) = r1;
1535         }
1536 #endif
1537 #undef SHR
1538     }
1539 }
1540 
1541 #if SHIFT >= 1
1542 
/*
 * Generate a three-source element-wise helper: for each of the num
 * elements, d = F(v, s, m) using the elem accessor on all four registers.
 */
#define SSE_HELPER_V(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            Reg *m)                                     \
    {                                                                   \
        for (int idx = 0; idx < (num); idx++) {                         \
            d->elem(idx) = F(v->elem(idx), s->elem(idx), m->elem(idx)); \
        }                                                               \
    }
1552 
/*
 * Generate an immediate-controlled element-wise helper: element idx is
 * computed as F(v, s, bit), where bit is bit (idx mod 8) of imm.
 */
#define SSE_HELPER_I(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            uint32_t imm)                               \
    {                                                                   \
        for (int idx = 0; idx < (num); idx++) {                         \
            d->elem(idx) = F(v->elem(idx), s->elem(idx),                \
                             (imm >> (idx & 7)) & 1);                   \
        }                                                               \
    }
1563 
1564 /* SSE4.1 op helpers */
/*
 * Variable blend selectors: yield s when the top bit of the mask element m
 * is set, otherwise v.  Arguments are parenthesized so a compound mask
 * expression is tested as a whole, and the sign-bit constants are written
 * as unsigned literals.
 */
#define FBLENDVB(v, s, m) (((m) & 0x80) ? (s) : (v))
#define FBLENDVPS(v, s, m) (((m) & 0x80000000u) ? (s) : (v))
#define FBLENDVPD(v, s, m) (((m) & 0x8000000000000000ULL) ? (s) : (v))
1568 SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB)
1569 SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS)
1570 SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD)
1571 
1572 void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1573 {
1574     uint64_t zf = 0, cf = 0;
1575     int i;
1576 
1577     for (i = 0; i < 1 << SHIFT; i++) {
1578         zf |= (s->Q(i) &  d->Q(i));
1579         cf |= (s->Q(i) & ~d->Q(i));
1580     }
1581     CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1582 }
1583 
/* Source-element selectors for the duplicating moves. */
#define FMOVSLDUP(i) s->L((i) & ~1)
#define FMOVSHDUP(i) s->L((i) | 1)
#define FMOVDLDUP(i) s->Q((i) & ~1)

/*
 * Generate a one-source helper that fills each of the num elements of d
 * with F(index).  Elements are written from the highest index down to 0.
 * NOTE(review): the descending order looks deliberate, so that widening in
 * place (d == s) does not overwrite source elements still to be read --
 * confirm before changing it.
 */
#define SSE_HELPER_F(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int idx = (num);                                        \
        while (idx-- > 0) {                                     \
            d->elem(idx) = F(idx);                              \
        }                                                       \
    }

#if SHIFT > 0
/* Sign-extending moves. */
SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L)
/* Zero-extending moves. */
SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
/* Duplicating moves. */
SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP)
SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP)
SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP)
#endif
1614 
1615 void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1616 {
1617     int i;
1618 
1619     for (i = 0; i < 1 << SHIFT; i++) {
1620         d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i);
1621     }
1622 }
1623 
1624 void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1625 {
1626     uint16_t r[8];
1627     int i, j, k;
1628 
1629     for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
1630         r[0] = satuw(v->L(j));
1631         r[1] = satuw(v->L(j + 1));
1632         r[2] = satuw(v->L(j + 2));
1633         r[3] = satuw(v->L(j + 3));
1634         r[4] = satuw(s->L(j));
1635         r[5] = satuw(s->L(j + 1));
1636         r[6] = satuw(s->L(j + 2));
1637         r[7] = satuw(s->L(j + 3));
1638         for (k = 0; k < 8; k++) {
1639             d->W(i + k) = r[k];
1640         }
1641     }
1642 }
1643 
1644 #if SHIFT == 1
1645 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1646 {
1647     int idx = 0;
1648 
1649     if (s->W(1) < s->W(idx)) {
1650         idx = 1;
1651     }
1652     if (s->W(2) < s->W(idx)) {
1653         idx = 2;
1654     }
1655     if (s->W(3) < s->W(idx)) {
1656         idx = 3;
1657     }
1658     if (s->W(4) < s->W(idx)) {
1659         idx = 4;
1660     }
1661     if (s->W(5) < s->W(idx)) {
1662         idx = 5;
1663     }
1664     if (s->W(6) < s->W(idx)) {
1665         idx = 6;
1666     }
1667     if (s->W(7) < s->W(idx)) {
1668         idx = 7;
1669     }
1670 
1671     d->W(0) = s->W(idx);
1672     d->W(1) = idx;
1673     d->L(1) = 0;
1674     d->Q(1) = 0;
1675 }
1676 #endif
1677 
1678 void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1679                                   uint32_t mode)
1680 {
1681     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1682     signed char prev_rounding_mode;
1683     int i;
1684 
1685     prev_rounding_mode = env->sse_status.float_rounding_mode;
1686     if (!(mode & (1 << 2))) {
1687         switch (mode & 3) {
1688         case 0:
1689             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1690             break;
1691         case 1:
1692             set_float_rounding_mode(float_round_down, &env->sse_status);
1693             break;
1694         case 2:
1695             set_float_rounding_mode(float_round_up, &env->sse_status);
1696             break;
1697         case 3:
1698             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1699             break;
1700         }
1701     }
1702 
1703     for (i = 0; i < 2 << SHIFT; i++) {
1704         d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status);
1705     }
1706 
1707     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1708         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1709                                   ~float_flag_inexact,
1710                                   &env->sse_status);
1711     }
1712     env->sse_status.float_rounding_mode = prev_rounding_mode;
1713 }
1714 
1715 void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1716                                   uint32_t mode)
1717 {
1718     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1719     signed char prev_rounding_mode;
1720     int i;
1721 
1722     prev_rounding_mode = env->sse_status.float_rounding_mode;
1723     if (!(mode & (1 << 2))) {
1724         switch (mode & 3) {
1725         case 0:
1726             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1727             break;
1728         case 1:
1729             set_float_rounding_mode(float_round_down, &env->sse_status);
1730             break;
1731         case 2:
1732             set_float_rounding_mode(float_round_up, &env->sse_status);
1733             break;
1734         case 3:
1735             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1736             break;
1737         }
1738     }
1739 
1740     for (i = 0; i < 1 << SHIFT; i++) {
1741         d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status);
1742     }
1743 
1744     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1745         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1746                                   ~float_flag_inexact,
1747                                   &env->sse_status);
1748     }
1749     env->sse_status.float_rounding_mode = prev_rounding_mode;
1750 }
1751 
1752 #if SHIFT == 1
1753 void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1754                                   uint32_t mode)
1755 {
1756     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1757     signed char prev_rounding_mode;
1758     int i;
1759 
1760     prev_rounding_mode = env->sse_status.float_rounding_mode;
1761     if (!(mode & (1 << 2))) {
1762         switch (mode & 3) {
1763         case 0:
1764             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1765             break;
1766         case 1:
1767             set_float_rounding_mode(float_round_down, &env->sse_status);
1768             break;
1769         case 2:
1770             set_float_rounding_mode(float_round_up, &env->sse_status);
1771             break;
1772         case 3:
1773             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1774             break;
1775         }
1776     }
1777 
1778     d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
1779     for (i = 1; i < 2 << SHIFT; i++) {
1780         d->ZMM_L(i) = v->ZMM_L(i);
1781     }
1782 
1783     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1784         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1785                                   ~float_flag_inexact,
1786                                   &env->sse_status);
1787     }
1788     env->sse_status.float_rounding_mode = prev_rounding_mode;
1789 }
1790 
1791 void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1792                                   uint32_t mode)
1793 {
1794     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1795     signed char prev_rounding_mode;
1796     int i;
1797 
1798     prev_rounding_mode = env->sse_status.float_rounding_mode;
1799     if (!(mode & (1 << 2))) {
1800         switch (mode & 3) {
1801         case 0:
1802             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1803             break;
1804         case 1:
1805             set_float_rounding_mode(float_round_down, &env->sse_status);
1806             break;
1807         case 2:
1808             set_float_rounding_mode(float_round_up, &env->sse_status);
1809             break;
1810         case 3:
1811             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1812             break;
1813         }
1814     }
1815 
1816     d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
1817     for (i = 1; i < 1 << SHIFT; i++) {
1818         d->ZMM_Q(i) = v->ZMM_Q(i);
1819     }
1820 
1821     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1822         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1823                                   ~float_flag_inexact,
1824                                   &env->sse_status);
1825     }
1826     env->sse_status.float_rounding_mode = prev_rounding_mode;
1827 }
1828 #endif
1829 
/*
 * Blend selector: yields s when the mask value m is non-zero, v otherwise.
 * Every argument is parenthesized so that compound expressions passed as
 * v, s or m keep their meaning after expansion.
 */
#define FBLENDP(v, s, m) ((m) ? (s) : (v))
1831 SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP)
1832 SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP)
1833 SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP)
1834 
1835 void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1836                                uint32_t mask)
1837 {
1838     float32 prod1, prod2, temp2, temp3, temp4;
1839     int i;
1840 
1841     for (i = 0; i < 2 << SHIFT; i += 4) {
1842         /*
1843          * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
1844          * to correctly round the intermediate results
1845          */
1846         if (mask & (1 << 4)) {
1847             prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1848         } else {
1849             prod1 = float32_zero;
1850         }
1851         if (mask & (1 << 5)) {
1852             prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1853         } else {
1854             prod2 = float32_zero;
1855         }
1856         temp2 = float32_add(prod1, prod2, &env->sse_status);
1857         if (mask & (1 << 6)) {
1858             prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status);
1859         } else {
1860             prod1 = float32_zero;
1861         }
1862         if (mask & (1 << 7)) {
1863             prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status);
1864         } else {
1865             prod2 = float32_zero;
1866         }
1867         temp3 = float32_add(prod1, prod2, &env->sse_status);
1868         temp4 = float32_add(temp2, temp3, &env->sse_status);
1869 
1870         d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero;
1871         d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero;
1872         d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero;
1873         d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero;
1874     }
1875 }
1876 
1877 #if SHIFT == 1
1878 /* Oddly, there is no ymm version of dppd */
1879 void glue(helper_dppd, SUFFIX)(CPUX86State *env,
1880                                Reg *d, Reg *v, Reg *s, uint32_t mask)
1881 {
1882     float64 prod1, prod2, temp2;
1883 
1884     if (mask & (1 << 4)) {
1885         prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
1886     } else {
1887         prod1 = float64_zero;
1888     }
1889     if (mask & (1 << 5)) {
1890         prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
1891     } else {
1892         prod2 = float64_zero;
1893     }
1894     temp2 = float64_add(prod1, prod2, &env->sse_status);
1895     d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero;
1896     d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero;
1897 }
1898 #endif
1899 
1900 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1901                                   uint32_t offset)
1902 {
1903     int i, j;
1904     uint16_t r[8];
1905 
1906     for (j = 0; j < 4 << SHIFT; ) {
1907         int s0 = (j * 2) + ((offset & 3) << 2);
1908         int d0 = (j * 2) + ((offset & 4) << 0);
1909         for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
1910             r[i] = 0;
1911             r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
1912             r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
1913             r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
1914             r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
1915         }
1916         for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
1917             d->W(j) = r[i];
1918         }
1919         offset >>= 3;
1920     }
1921 }
1922 
1923 /* SSE4.2 op helpers */
1924 #if SHIFT == 1
1925 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
1926 {
1927     target_long val, limit;
1928 
1929     /* Presence of REX.W is indicated by a bit higher than 7 set */
1930     if (ctrl >> 8) {
1931         val = (target_long)env->regs[reg];
1932     } else {
1933         val = (int32_t)env->regs[reg];
1934     }
1935     if (ctrl & 1) {
1936         limit = 8;
1937     } else {
1938         limit = 16;
1939     }
1940     if ((val > limit) || (val < -limit)) {
1941         return limit;
1942     }
1943     return abs1(val);
1944 }
1945 
1946 static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
1947 {
1948     int val = 0;
1949 
1950     if (ctrl & 1) {
1951         while (val < 8 && r->W(val)) {
1952             val++;
1953         }
1954     } else {
1955         while (val < 16 && r->B(val)) {
1956             val++;
1957         }
1958     }
1959 
1960     return val;
1961 }
1962 
1963 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
1964 {
1965     switch ((ctrl >> 0) & 3) {
1966     case 0:
1967         return r->B(i);
1968     case 1:
1969         return r->W(i);
1970     case 2:
1971         return (int8_t)r->B(i);
1972     case 3:
1973     default:
1974         return (int16_t)r->W(i);
1975     }
1976 }
1977 
1978 static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
1979                                  uint8_t ctrl, int valids, int validd)
1980 {
1981     unsigned int res = 0;
1982     int v;
1983     int j, i;
1984     int upper = (ctrl & 1) ? 7 : 15;
1985 
1986     valids--;
1987     validd--;
1988 
1989     CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
1990 
1991     switch ((ctrl >> 2) & 3) {
1992     case 0:
1993         for (j = valids; j >= 0; j--) {
1994             res <<= 1;
1995             v = pcmp_val(s, ctrl, j);
1996             for (i = validd; i >= 0; i--) {
1997                 res |= (v == pcmp_val(d, ctrl, i));
1998             }
1999         }
2000         break;
2001     case 1:
2002         for (j = valids; j >= 0; j--) {
2003             res <<= 1;
2004             v = pcmp_val(s, ctrl, j);
2005             for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
2006                 res |= (pcmp_val(d, ctrl, i - 0) >= v &&
2007                         pcmp_val(d, ctrl, i - 1) <= v);
2008             }
2009         }
2010         break;
2011     case 2:
2012         res = (1 << (upper - MAX(valids, validd))) - 1;
2013         res <<= MAX(valids, validd) - MIN(valids, validd);
2014         for (i = MIN(valids, validd); i >= 0; i--) {
2015             res <<= 1;
2016             v = pcmp_val(s, ctrl, i);
2017             res |= (v == pcmp_val(d, ctrl, i));
2018         }
2019         break;
2020     case 3:
2021         if (validd == -1) {
2022             res = (2 << upper) - 1;
2023             break;
2024         }
2025         for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
2026             res <<= 1;
2027             v = 1;
2028             for (i = MIN(valids - j, validd); i >= 0; i--) {
2029                 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
2030             }
2031             res |= v;
2032         }
2033         break;
2034     }
2035 
2036     switch ((ctrl >> 4) & 3) {
2037     case 1:
2038         res ^= (2 << upper) - 1;
2039         break;
2040     case 3:
2041         res ^= (1 << (valids + 1)) - 1;
2042         break;
2043     }
2044 
2045     if (res) {
2046         CC_SRC |= CC_C;
2047     }
2048     if (res & 1) {
2049         CC_SRC |= CC_O;
2050     }
2051 
2052     return res;
2053 }
2054 
2055 void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2056                                     uint32_t ctrl)
2057 {
2058     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2059                                  pcmp_elen(env, R_EDX, ctrl),
2060                                  pcmp_elen(env, R_EAX, ctrl));
2061 
2062     if (res) {
2063         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2064     } else {
2065         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2066     }
2067 }
2068 
2069 void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2070                                     uint32_t ctrl)
2071 {
2072     int i;
2073     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2074                                  pcmp_elen(env, R_EDX, ctrl),
2075                                  pcmp_elen(env, R_EAX, ctrl));
2076 
2077     if ((ctrl >> 6) & 1) {
2078         if (ctrl & 1) {
2079             for (i = 0; i < 8; i++, res >>= 1) {
2080                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2081             }
2082         } else {
2083             for (i = 0; i < 16; i++, res >>= 1) {
2084                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2085             }
2086         }
2087     } else {
2088         env->xmm_regs[0].Q(1) = 0;
2089         env->xmm_regs[0].Q(0) = res;
2090     }
2091 }
2092 
2093 void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2094                                     uint32_t ctrl)
2095 {
2096     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2097                                  pcmp_ilen(s, ctrl),
2098                                  pcmp_ilen(d, ctrl));
2099 
2100     if (res) {
2101         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2102     } else {
2103         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2104     }
2105 }
2106 
2107 void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2108                                     uint32_t ctrl)
2109 {
2110     int i;
2111     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2112                                  pcmp_ilen(s, ctrl),
2113                                  pcmp_ilen(d, ctrl));
2114 
2115     if ((ctrl >> 6) & 1) {
2116         if (ctrl & 1) {
2117             for (i = 0; i < 8; i++, res >>= 1) {
2118                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2119             }
2120         } else {
2121             for (i = 0; i < 16; i++, res >>= 1) {
2122                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2123             }
2124         }
2125     } else {
2126         env->xmm_regs[0].Q(1) = 0;
2127         env->xmm_regs[0].Q(0) = res;
2128     }
2129 }
2130 
2131 #define CRCPOLY        0x1edc6f41
2132 #define CRCPOLY_BITREV 0x82f63b78
2133 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2134 {
2135     target_ulong crc = (msg & ((target_ulong) -1 >>
2136                                (TARGET_LONG_BITS - len))) ^ crc1;
2137 
2138     while (len--) {
2139         crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
2140     }
2141 
2142     return crc;
2143 }
2144 
2145 #endif
2146 
2147 #if SHIFT == 1
/*
 * Carry-less (polynomial over GF(2)) multiplication of two 64-bit values.
 * The 128-bit product is returned as low half *dest_l and high half
 * *dest_h.  The outputs are written only after the product is complete.
 */
static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
                          uint64_t a, uint64_t b)
{
    uint64_t prod_lo = 0, prod_hi = 0;
    /* 128-bit copy of a, shifted left by one for each bit of b consumed. */
    uint64_t shifted_lo = a, shifted_hi = 0;

    for (; b != 0; b >>= 1) {
        if (b & 1) {
            prod_lo ^= shifted_lo;
            prod_hi ^= shifted_hi;
        }
        shifted_hi = (shifted_hi << 1) | (shifted_lo >> 63);
        shifted_lo <<= 1;
    }

    *dest_l = prod_lo;
    *dest_h = prod_hi;
}
2170 #endif
2171 
2172 void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
2173                                     uint32_t ctrl)
2174 {
2175     uint64_t a, b;
2176     int i;
2177 
2178     for (i = 0; i < 1 << SHIFT; i += 2) {
2179         a = v->Q(((ctrl & 1) != 0) + i);
2180         b = s->Q(((ctrl & 16) != 0) + i);
2181         clmulq(&d->Q(i), &d->Q(i + 1), a, b);
2182     }
2183 }
2184 
2185 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2186 {
2187     int i;
2188     Reg st = *v;
2189     Reg rk = *s;
2190 
2191     for (i = 0 ; i < 2 << SHIFT ; i++) {
2192         int j = i & 3;
2193         d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
2194                                     AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^
2195                                     AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^
2196                                     AES_Td3[st.B(AES_ishifts[4 * j + 3])]);
2197     }
2198 }
2199 
2200 void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2201 {
2202     int i;
2203     Reg st = *v;
2204     Reg rk = *s;
2205 
2206     for (i = 0; i < 8 << SHIFT; i++) {
2207         d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
2208     }
2209 }
2210 
2211 void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2212 {
2213     int i;
2214     Reg st = *v;
2215     Reg rk = *s;
2216 
2217     for (i = 0 ; i < 2 << SHIFT ; i++) {
2218         int j = i & 3;
2219         d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
2220                                     AES_Te1[st.B(AES_shifts[4 * j + 1])] ^
2221                                     AES_Te2[st.B(AES_shifts[4 * j + 2])] ^
2222                                     AES_Te3[st.B(AES_shifts[4 * j + 3])]);
2223     }
2224 }
2225 
2226 void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2227 {
2228     int i;
2229     Reg st = *v;
2230     Reg rk = *s;
2231 
2232     for (i = 0; i < 8 << SHIFT; i++) {
2233         d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
2234     }
2235 }
2236 
2237 #if SHIFT == 1
2238 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2239 {
2240     int i;
2241     Reg tmp = *s;
2242 
2243     for (i = 0 ; i < 4 ; i++) {
2244         d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
2245                           AES_imc[tmp.B(4 * i + 1)][1] ^
2246                           AES_imc[tmp.B(4 * i + 2)][2] ^
2247                           AES_imc[tmp.B(4 * i + 3)][3]);
2248     }
2249 }
2250 
2251 void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2252                                           uint32_t ctrl)
2253 {
2254     int i;
2255     Reg tmp = *s;
2256 
2257     for (i = 0 ; i < 4 ; i++) {
2258         d->B(i) = AES_sbox[tmp.B(i + 4)];
2259         d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
2260     }
2261     d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
2262     d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
2263 }
2264 #endif
2265 #endif
2266 
2267 #if SHIFT >= 1
2268 void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2269 {
2270     uint64_t r0, r1;
2271     int i;
2272 
2273     for (i = 0; i < 1 << SHIFT; i += 2) {
2274         r0 = v->Q(i + ((s->Q(i) >> 1) & 1));
2275         r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1));
2276         d->Q(i) = r0;
2277         d->Q(i+1) = r1;
2278     }
2279 }
2280 
2281 void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2282 {
2283     uint32_t r0, r1, r2, r3;
2284     int i;
2285 
2286     for (i = 0; i < 2 << SHIFT; i += 4) {
2287         r0 = v->L(i + (s->L(i) & 3));
2288         r1 = v->L(i + (s->L(i+1) & 3));
2289         r2 = v->L(i + (s->L(i+2) & 3));
2290         r3 = v->L(i + (s->L(i+3) & 3));
2291         d->L(i) = r0;
2292         d->L(i+1) = r1;
2293         d->L(i+2) = r2;
2294         d->L(i+3) = r3;
2295     }
2296 }
2297 
2298 void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
2299 {
2300     uint64_t r0, r1;
2301     int i;
2302 
2303     for (i = 0; i < 1 << SHIFT; i += 2) {
2304         r0 = s->Q(i + ((order >> 0) & 1));
2305         r1 = s->Q(i + ((order >> 1) & 1));
2306         d->Q(i) = r0;
2307         d->Q(i+1) = r1;
2308 
2309         order >>= 2;
2310     }
2311 }
2312 
2313 void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
2314 {
2315     uint32_t r0, r1, r2, r3;
2316     int i;
2317 
2318     for (i = 0; i < 2 << SHIFT; i += 4) {
2319         r0 = s->L(i + ((order >> 0) & 3));
2320         r1 = s->L(i + ((order >> 2) & 3));
2321         r2 = s->L(i + ((order >> 4) & 3));
2322         r3 = s->L(i + ((order >> 6) & 3));
2323         d->L(i) = r0;
2324         d->L(i+1) = r1;
2325         d->L(i+2) = r2;
2326         d->L(i+3) = r3;
2327     }
2328 }
2329 
2330 #if SHIFT == 1
/*
 * Per-element variable shifts: x is the value, c the shift count.
 * Logical shifts (FPSRLV*, FPSLLV*) yield 0 when the count is not smaller
 * than the element width; arithmetic right shifts (FPSRAV*) clamp the
 * count to width - 1.
 * Both arguments are parenthesized at every use so that compound
 * expressions passed as x or c keep their meaning after expansion.
 */
#define FPSRLVD(x, c) ((c) < 32 ? ((x) >> (c)) : 0)
#define FPSRLVQ(x, c) ((c) < 64 ? ((x) >> (c)) : 0)
#define FPSRAVD(x, c) ((int32_t)(x) >> ((c) < 32 ? (c) : 31))
#define FPSRAVQ(x, c) ((int64_t)(x) >> ((c) < 64 ? (c) : 63))
#define FPSLLVD(x, c) ((c) < 32 ? ((x) << (c)) : 0)
#define FPSLLVQ(x, c) ((c) < 64 ? ((x) << (c)) : 0)
2337 #endif
2338 
2339 SSE_HELPER_L(helper_vpsrlvd, FPSRLVD)
2340 SSE_HELPER_L(helper_vpsravd, FPSRAVD)
2341 SSE_HELPER_L(helper_vpsllvd, FPSLLVD)
2342 
2343 SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
2344 SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
2345 SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)
2346 
2347 void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2348 {
2349     uint32_t zf = 0, cf = 0;
2350     int i;
2351 
2352     for (i = 0; i < 2 << SHIFT; i++) {
2353         zf |= (s->L(i) &  d->L(i));
2354         cf |= (s->L(i) & ~d->L(i));
2355     }
2356     CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C);
2357 }
2358 
2359 void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2360 {
2361     uint64_t zf = 0, cf = 0;
2362     int i;
2363 
2364     for (i = 0; i < 1 << SHIFT; i++) {
2365         zf |= (s->Q(i) &  d->Q(i));
2366         cf |= (s->Q(i) & ~d->Q(i));
2367     }
2368     CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
2369 }
2370 
2371 void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
2372                                         Reg *v, Reg *s, target_ulong a0)
2373 {
2374     int i;
2375 
2376     for (i = 0; i < (2 << SHIFT); i++) {
2377         if (v->L(i) >> 31) {
2378             cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
2379         }
2380     }
2381 }
2382 
2383 void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
2384                                         Reg *v, Reg *s, target_ulong a0)
2385 {
2386     int i;
2387 
2388     for (i = 0; i < (1 << SHIFT); i++) {
2389         if (v->Q(i) >> 63) {
2390             cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
2391         }
2392     }
2393 }
2394 
2395 void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2396 {
2397     int i;
2398 
2399     for (i = 0; i < (2 << SHIFT); i++) {
2400         d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
2401     }
2402 }
2403 
2404 void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2405 {
2406     int i;
2407 
2408     for (i = 0; i < (1 << SHIFT); i++) {
2409         d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0;
2410     }
2411 }
2412 
2413 void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
2414         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2415 {
2416     int i;
2417     for (i = 0; i < (2 << SHIFT); i++) {
2418         if (v->L(i) >> 31) {
2419             target_ulong addr = a0
2420                 + ((target_ulong)(int32_t)s->L(i) << scale);
2421             d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
2422         }
2423         v->L(i) = 0;
2424     }
2425 }
2426 
2427 void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env,
2428         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2429 {
2430     int i;
2431     for (i = 0; i < (1 << SHIFT); i++) {
2432         if (v->Q(i) >> 63) {
2433             target_ulong addr = a0
2434                 + ((target_ulong)(int32_t)s->L(i) << scale);
2435             d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
2436         }
2437         v->Q(i) = 0;
2438     }
2439 }
2440 
2441 void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env,
2442         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2443 {
2444     int i;
2445     for (i = 0; i < (1 << SHIFT); i++) {
2446         if (v->L(i) >> 31) {
2447             target_ulong addr = a0
2448                 + ((target_ulong)(int64_t)s->Q(i) << scale);
2449             d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
2450         }
2451         v->L(i) = 0;
2452     }
2453     for (i /= 2; i < 1 << SHIFT; i++) {
2454         d->Q(i) = 0;
2455         v->Q(i) = 0;
2456     }
2457 }
2458 
2459 void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env,
2460         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2461 {
2462     int i;
2463     for (i = 0; i < (1 << SHIFT); i++) {
2464         if (v->Q(i) >> 63) {
2465             target_ulong addr = a0
2466                 + ((target_ulong)(int64_t)s->Q(i) << scale);
2467             d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
2468         }
2469         v->Q(i) = 0;
2470     }
2471 }
2472 #endif
2473 
2474 #if SHIFT >= 2
2475 void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order)
2476 {
2477     uint64_t r0, r1, r2, r3;
2478 
2479     switch (order & 3) {
2480     case 0:
2481         r0 = v->Q(0);
2482         r1 = v->Q(1);
2483         break;
2484     case 1:
2485         r0 = v->Q(2);
2486         r1 = v->Q(3);
2487         break;
2488     case 2:
2489         r0 = s->Q(0);
2490         r1 = s->Q(1);
2491         break;
2492     case 3:
2493         r0 = s->Q(2);
2494         r1 = s->Q(3);
2495         break;
2496     }
2497     switch ((order >> 4) & 3) {
2498     case 0:
2499         r2 = v->Q(0);
2500         r3 = v->Q(1);
2501         break;
2502     case 1:
2503         r2 = v->Q(2);
2504         r3 = v->Q(3);
2505         break;
2506     case 2:
2507         r2 = s->Q(0);
2508         r3 = s->Q(1);
2509         break;
2510     case 3:
2511         r2 = s->Q(2);
2512         r3 = s->Q(3);
2513         break;
2514     }
2515     d->Q(0) = r0;
2516     d->Q(1) = r1;
2517     d->Q(2) = r2;
2518     d->Q(3) = r3;
2519 }
2520 
2521 void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order)
2522 {
2523     uint64_t r0, r1, r2, r3;
2524     r0 = s->Q(order & 3);
2525     r1 = s->Q((order >> 2) & 3);
2526     r2 = s->Q((order >> 4) & 3);
2527     r3 = s->Q((order >> 6) & 3);
2528     d->Q(0) = r0;
2529     d->Q(1) = r1;
2530     d->Q(2) = r2;
2531     d->Q(3) = r3;
2532 }
2533 
2534 void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
2535 {
2536     uint32_t r[8];
2537     int i;
2538 
2539     for (i = 0; i < 8; i++) {
2540         r[i] = s->L(v->L(i) & 7);
2541     }
2542     for (i = 0; i < 8; i++) {
2543         d->L(i) = r[i];
2544     }
2545 }
2546 #endif
2547 
2548 #undef SSE_HELPER_S
2549 
2550 #undef LANE_WIDTH
2551 #undef SHIFT
2552 #undef XMM_ONLY
2553 #undef Reg
2554 #undef B
2555 #undef W
2556 #undef L
2557 #undef Q
2558 #undef SUFFIX
2559