xref: /openbmc/qemu/target/mips/tcg/lmmi_helper.c (revision a2b0a27d33e9b1079698cee04ff029a0555b5ea5)
1*a2b0a27dSPhilippe Mathieu-Daudé /*
2*a2b0a27dSPhilippe Mathieu-Daudé  *  Loongson Multimedia Instruction emulation helpers for QEMU.
3*a2b0a27dSPhilippe Mathieu-Daudé  *
4*a2b0a27dSPhilippe Mathieu-Daudé  *  Copyright (c) 2011  Richard Henderson <rth@twiddle.net>
5*a2b0a27dSPhilippe Mathieu-Daudé  *
6*a2b0a27dSPhilippe Mathieu-Daudé  * This library is free software; you can redistribute it and/or
7*a2b0a27dSPhilippe Mathieu-Daudé  * modify it under the terms of the GNU Lesser General Public
8*a2b0a27dSPhilippe Mathieu-Daudé  * License as published by the Free Software Foundation; either
9*a2b0a27dSPhilippe Mathieu-Daudé  * version 2.1 of the License, or (at your option) any later version.
10*a2b0a27dSPhilippe Mathieu-Daudé  *
11*a2b0a27dSPhilippe Mathieu-Daudé  * This library is distributed in the hope that it will be useful,
12*a2b0a27dSPhilippe Mathieu-Daudé  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13*a2b0a27dSPhilippe Mathieu-Daudé  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14*a2b0a27dSPhilippe Mathieu-Daudé  * Lesser General Public License for more details.
15*a2b0a27dSPhilippe Mathieu-Daudé  *
16*a2b0a27dSPhilippe Mathieu-Daudé  * You should have received a copy of the GNU Lesser General Public
17*a2b0a27dSPhilippe Mathieu-Daudé  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18*a2b0a27dSPhilippe Mathieu-Daudé  */
19*a2b0a27dSPhilippe Mathieu-Daudé 
20*a2b0a27dSPhilippe Mathieu-Daudé #include "qemu/osdep.h"
21*a2b0a27dSPhilippe Mathieu-Daudé #include "cpu.h"
22*a2b0a27dSPhilippe Mathieu-Daudé #include "exec/helper-proto.h"
23*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * A 64-bit multimedia register value viewed as byte, halfword, word or
 * doubleword lanes.
 *
 * If the byte ordering doesn't matter, i.e. all columns are treated
 * identically, then this union can be used directly.  If byte ordering
 * does matter, helpers generally avoid going through memory (this union)
 * and operate on the 64-bit value itself.
 */
typedef union {
    uint8_t  ub[8];     /* eight unsigned byte lanes */
    int8_t   sb[8];     /* eight signed byte lanes */
    uint16_t uh[4];     /* four unsigned halfword lanes */
    int16_t  sh[4];     /* four signed halfword lanes */
    uint32_t uw[2];     /* two unsigned word lanes */
    int32_t  sw[2];     /* two signed word lanes */
    uint64_t d;         /* the whole doubleword */
} LMIValue;
38*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Some byte ordering issues can be mitigated by XORing in the following.
 * BYTE_ORDER_XOR(N) yields N on big-endian hosts and 0 on little-endian
 * hosts; XORing a lane index with it makes index 0 refer to the least
 * significant lane of an LMIValue array member on either kind of host
 * (N is the highest lane index, e.g. 3 for halfwords, 7 for bytes).
 */
#ifdef HOST_WORDS_BIGENDIAN
# define BYTE_ORDER_XOR(N) (N)
#else
# define BYTE_ORDER_XOR(N) 0
#endif
45*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Saturation helpers.  The signed variants (SATSB/SATSH/SATSW) clamp x to
 * the signed 8/16/32-bit range; the unsigned variants (SATUB/SATUH/SATUW)
 * clamp only the upper bound to 0xff/0xffff/0xffffffff and pass smaller
 * values (including negative ones) through unchanged.
 *
 * The argument is evaluated more than once, so it must be free of side
 * effects.  It is parenthesized so that any expression may be passed.
 */
#define SATSB(x)  ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
#define SATUB(x)  ((x) > 0xff ? 0xff : (x))

#define SATSH(x)  ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
#define SATUH(x)  ((x) > 0xffff ? 0xffff : (x))

#define SATSW(x) \
    ((x) < -0x80000000ll ? -0x80000000ll : (x) > 0x7fffffff ? 0x7fffffff : (x))
#define SATUW(x)  ((x) > 0xffffffffull ? 0xffffffffull : (x))
55*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Add the eight signed byte lanes of fs and ft, saturating every sum to
 * the range [-0x80, 0x7f].  Lane k occupies bits [8k, 8k + 7].
 */
uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        int sum = (int8_t)(fs >> pos) + (int8_t)(ft >> pos);

        if (sum < -0x80) {
            sum = -0x80;
        } else if (sum > 0x7f) {
            sum = 0x7f;
        }
        fd |= (uint64_t)(uint8_t)sum << pos;
    }
    return fd;
}
69*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Add the eight unsigned byte lanes of fs and ft; sums above 0xff are
 * replaced by 0xff.
 */
uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        unsigned int sum = (uint8_t)(fs >> pos) + (uint8_t)(ft >> pos);

        if (sum > 0xff) {
            sum = 0xff;
        }
        fd |= (uint64_t)sum << pos;
    }
    return fd;
}
83*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Add the four signed halfword lanes of fs and ft, saturating every sum
 * to the range [-0x8000, 0x7fff].
 */
uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        int sum = (int16_t)(fs >> pos) + (int16_t)(ft >> pos);

        if (sum < -0x8000) {
            sum = -0x8000;
        } else if (sum > 0x7fff) {
            sum = 0x7fff;
        }
        fd |= (uint64_t)(uint16_t)sum << pos;
    }
    return fd;
}
97*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Add the four unsigned halfword lanes of fs and ft; sums above 0xffff
 * are replaced by 0xffff.
 */
uint64_t helper_paddush(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        unsigned int sum = (uint16_t)(fs >> pos) + (uint16_t)(ft >> pos);

        if (sum > 0xffff) {
            sum = 0xffff;
        }
        fd |= (uint64_t)sum << pos;
    }
    return fd;
}
111*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Add the eight byte lanes of fs and ft with wrap-around (modulo 0x100);
 * no carry propagates between lanes.
 */
uint64_t helper_paddb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        /* Only the low byte of the sum is kept, which drops any carry. */
        uint8_t sum = (uint8_t)((fs >> pos) + (ft >> pos));

        fd |= (uint64_t)sum << pos;
    }
    return fd;
}
124*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Add the four halfword lanes of fs and ft with wrap-around (modulo
 * 0x10000); no carry propagates between lanes.
 */
uint64_t helper_paddh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        /* Only the low halfword of the sum is kept. */
        uint16_t sum = (uint16_t)((fs >> pos) + (ft >> pos));

        fd |= (uint64_t)sum << pos;
    }
    return fd;
}
137*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Add the two word lanes of fs and ft with wrap-around (modulo 2^32);
 * the low-word carry does not reach the high word.
 */
uint64_t helper_paddw(uint64_t fs, uint64_t ft)
{
    uint32_t lo = (uint32_t)fs + (uint32_t)ft;
    uint32_t hi = (uint32_t)(fs >> 32) + (uint32_t)(ft >> 32);

    return ((uint64_t)hi << 32) | lo;
}
150*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Subtract the signed byte lanes of ft from those of fs, saturating every
 * difference to the range [-0x80, 0x7f].
 */
uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        int diff = (int8_t)(fs >> pos) - (int8_t)(ft >> pos);

        if (diff < -0x80) {
            diff = -0x80;
        } else if (diff > 0x7f) {
            diff = 0x7f;
        }
        fd |= (uint64_t)(uint8_t)diff << pos;
    }
    return fd;
}
164*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Subtract the unsigned byte lanes of ft from those of fs with unsigned
 * saturation: a lane whose difference would be negative yields 0.
 *
 * The difference of two bytes can never exceed 0xff, so the lower bound
 * is the only one that needs clamping.
 */
uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        int diff = (uint8_t)(fs >> pos) - (uint8_t)(ft >> pos);

        if (diff < 0) {
            diff = 0;
        }
        fd |= (uint64_t)diff << pos;
    }
    return fd;
}
178*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Subtract the signed halfword lanes of ft from those of fs, saturating
 * every difference to the range [-0x8000, 0x7fff].
 */
uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        int diff = (int16_t)(fs >> pos) - (int16_t)(ft >> pos);

        if (diff < -0x8000) {
            diff = -0x8000;
        } else if (diff > 0x7fff) {
            diff = 0x7fff;
        }
        fd |= (uint64_t)(uint16_t)diff << pos;
    }
    return fd;
}
192*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Subtract the unsigned halfword lanes of ft from those of fs with
 * unsigned saturation: a lane whose difference would be negative yields 0.
 *
 * The difference of two halfwords can never exceed 0xffff, so the lower
 * bound is the only one that needs clamping.
 */
uint64_t helper_psubush(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        int diff = (uint16_t)(fs >> pos) - (uint16_t)(ft >> pos);

        if (diff < 0) {
            diff = 0;
        }
        fd |= (uint64_t)diff << pos;
    }
    return fd;
}
206*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Subtract the byte lanes of ft from those of fs with wrap-around (modulo
 * 0x100); no borrow propagates between lanes.
 */
uint64_t helper_psubb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        /* Only the low byte of the difference is kept. */
        uint8_t diff = (uint8_t)((fs >> pos) - (ft >> pos));

        fd |= (uint64_t)diff << pos;
    }
    return fd;
}
219*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Subtract the halfword lanes of ft from those of fs with wrap-around
 * (modulo 0x10000); no borrow propagates between lanes.
 */
uint64_t helper_psubh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        /* Only the low halfword of the difference is kept. */
        uint16_t diff = (uint16_t)((fs >> pos) - (ft >> pos));

        fd |= (uint64_t)diff << pos;
    }
    return fd;
}
232*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Subtract the two word lanes of ft from those of fs with wrap-around
 * (modulo 2^32); the low-word borrow does not reach the high word.
 */
uint64_t helper_psubw(uint64_t fs, uint64_t ft)
{
    uint32_t lo = (uint32_t)fs - (uint32_t)ft;
    uint32_t hi = (uint32_t)(fs >> 32) - (uint32_t)(ft >> 32);

    return ((uint64_t)hi << 32) | lo;
}
245*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Shuffle the halfword lanes of fs.  Bits [2k + 1, 2k] of ft pick which
 * halfword of fs is copied into result lane k (k = 0..3, lane 0 being the
 * least significant halfword).  Bits of ft above bit 7 are ignored.
 */
uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int lane;

    for (lane = 0; lane < 4; lane++) {
        unsigned int src = (ft >> (2 * lane)) & 3;
        uint64_t half = (fs >> (16 * src)) & 0xffff;

        fd |= half << (16 * lane);
    }
    return fd;
}
259*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Pack four signed words into four signed halfwords with signed
 * saturation to [-0x8000, 0x7fff].
 *
 * Result halfword lanes, least significant first:
 *   lane 0 = low word of fs,  lane 1 = high word of fs,
 *   lane 2 = low word of ft,  lane 3 = high word of ft.
 *
 * Each packed value is widened as an unsigned quantity before being
 * shifted into place, so no signed left shift reaches the sign bit.
 */
uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int i;

    for (i = 0; i < 4; ++i) {
        uint64_t src = i < 2 ? fs : ft;
        int32_t word = (int32_t)(uint32_t)(src >> (32 * (i & 1)));

        if (word < -0x8000) {
            word = -0x8000;
        } else if (word > 0x7fff) {
            word = 0x7fff;
        }
        fd |= (uint64_t)(uint16_t)word << (16 * i);
    }
    return fd;
}
283*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Pack eight signed halfwords into eight signed bytes with signed
 * saturation to [-0x80, 0x7f].  The four halfwords of fs fill result
 * bytes 0..3 and the four halfwords of ft fill result bytes 4..7.
 */
uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int lane;

    for (lane = 0; lane < 8; ++lane) {
        uint64_t src = lane < 4 ? fs : ft;
        int half = (int16_t)(src >> (16 * (lane & 3)));

        if (half < -0x80) {
            half = -0x80;
        } else if (half > 0x7f) {
            half = 0x7f;
        }
        fd |= (uint64_t)(half & 0xff) << (8 * lane);
    }
    return fd;
}
302*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Pack eight signed halfwords into eight unsigned bytes with unsigned
 * saturation: values below 0 become 0 and values above 0xff become 0xff.
 * The four halfwords of fs fill result bytes 0..3 and the four halfwords
 * of ft fill result bytes 4..7.
 */
uint64_t helper_packushb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int lane;

    for (lane = 0; lane < 8; ++lane) {
        uint64_t src = lane < 4 ? fs : ft;
        int half = (int16_t)(src >> (16 * (lane & 3)));

        /* Clamp both ends; a negative input must not leak its low byte. */
        if (half < 0) {
            half = 0;
        } else if (half > 0xff) {
            half = 0xff;
        }
        fd |= (uint64_t)half << (8 * lane);
    }
    return fd;
}
321*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Interleave the low words: the result's low word is the low word of fs
 * and its high word is the low word of ft.
 */
uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
{
    uint32_t lo = (uint32_t)fs;
    uint32_t hi = (uint32_t)ft;

    return ((uint64_t)hi << 32) | lo;
}
326*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Interleave the high words: the result's low word is the high word of fs
 * and its high word is the high word of ft.
 */
uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
{
    uint64_t lo = fs >> 32;
    uint64_t hi = ft >> 32;

    return (hi << 32) | lo;
}
331*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Interleave the two low halfwords of fs and ft.  Result lanes, least
 * significant first: fs[0], ft[0], fs[1], ft[1].
 */
uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
{
    uint64_t s0 = fs & 0xffff;
    uint64_t s1 = (fs >> 16) & 0xffff;
    uint64_t t0 = ft & 0xffff;
    uint64_t t1 = (ft >> 16) & 0xffff;

    return s0 | (t0 << 16) | (s1 << 32) | (t1 << 48);
}
346*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Interleave the two high halfwords of fs and ft.  Result lanes, least
 * significant first: fs[2], ft[2], fs[3], ft[3].
 */
uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
{
    uint64_t s2 = (fs >> 32) & 0xffff;
    uint64_t s3 = (fs >> 48) & 0xffff;
    uint64_t t2 = (ft >> 32) & 0xffff;
    uint64_t t3 = (ft >> 48) & 0xffff;

    return s2 | (t2 << 16) | (s3 << 32) | (t3 << 48);
}
361*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Interleave the four low bytes of fs and ft.  Result byte 2k is byte k
 * of fs and result byte 2k + 1 is byte k of ft, for k = 0..3 (byte 0
 * being the least significant).
 */
uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int k;

    for (k = 0; k < 4; ++k) {
        uint64_t s = (fs >> (8 * k)) & 0xff;
        uint64_t t = (ft >> (8 * k)) & 0xff;

        fd |= (s | (t << 8)) << (16 * k);
    }
    return fd;
}
380*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Interleave the four high bytes of fs and ft.  Result byte 2k is byte
 * 4 + k of fs and result byte 2k + 1 is byte 4 + k of ft, for k = 0..3
 * (byte 0 being the least significant).
 */
uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int k;

    for (k = 0; k < 4; ++k) {
        uint64_t s = (fs >> (8 * (k + 4))) & 0xff;
        uint64_t t = (ft >> (8 * (k + 4))) & 0xff;

        fd |= (s | (t << 8)) << (16 * k);
    }
    return fd;
}
399*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Rounded average of the four unsigned halfword lanes of fs and ft:
 * each lane becomes (a + b + 1) >> 1, computed without overflow.
 */
uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        unsigned int a = (fs >> pos) & 0xffff;
        unsigned int b = (ft >> pos) & 0xffff;

        fd |= (uint64_t)((a + b + 1) >> 1) << pos;
    }
    return fd;
}
412*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Rounded average of the eight unsigned byte lanes of fs and ft:
 * each lane becomes (a + b + 1) >> 1, computed without overflow.
 */
uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        unsigned int a = (fs >> pos) & 0xff;
        unsigned int b = (ft >> pos) & 0xff;

        fd |= (uint64_t)((a + b + 1) >> 1) << pos;
    }
    return fd;
}
425*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Lane-wise maximum of the four signed halfword lanes of fs and ft.
 */
uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        int16_t a = (int16_t)(fs >> pos);
        int16_t b = (int16_t)(ft >> pos);
        int16_t pick = b;

        if (a >= b) {
            pick = a;
        }
        fd |= (uint64_t)(uint16_t)pick << pos;
    }
    return fd;
}
438*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Lane-wise minimum of the four signed halfword lanes of fs and ft.
 */
uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        int16_t a = (int16_t)(fs >> pos);
        int16_t b = (int16_t)(ft >> pos);
        int16_t pick = b;

        if (a <= b) {
            pick = a;
        }
        fd |= (uint64_t)(uint16_t)pick << pos;
    }
    return fd;
}
451*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Lane-wise maximum of the eight unsigned byte lanes of fs and ft.
 * All eight bytes of the 64-bit value are processed.
 */
uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        uint8_t a = (uint8_t)(fs >> pos);
        uint8_t b = (uint8_t)(ft >> pos);

        fd |= (uint64_t)(a >= b ? a : b) << pos;
    }
    return fd;
}
464*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Lane-wise minimum of the eight unsigned byte lanes of fs and ft.
 * All eight bytes of the 64-bit value are processed.
 */
uint64_t helper_pminub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        uint8_t a = (uint8_t)(fs >> pos);
        uint8_t b = (uint8_t)(ft >> pos);

        fd |= (uint64_t)(a <= b ? a : b) << pos;
    }
    return fd;
}
477*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Compare the two word lanes of fs and ft for equality.  A lane of the
 * result is all ones where the inputs match and zero where they differ.
 */
uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;

    if ((uint32_t)fs == (uint32_t)ft) {
        fd |= 0xffffffffull;
    }
    if ((fs >> 32) == (ft >> 32)) {
        fd |= 0xffffffffull << 32;
    }
    return fd;
}
490*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Compare the two word lanes of fs and ft for "greater than".  Lanes are
 * compared as signed 32-bit integers; a lane of the result is all ones
 * where fs > ft and zero otherwise.
 */
uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 32) {
        int32_t a = (int32_t)(uint32_t)(fs >> pos);
        int32_t b = (int32_t)(uint32_t)(ft >> pos);

        if (a > b) {
            fd |= 0xffffffffull << pos;
        }
    }
    return fd;
}
503*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Compare the four halfword lanes of fs and ft for equality.  A lane of
 * the result is 0xffff where the inputs match and zero where they differ.
 */
uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
{
    uint64_t delta = fs ^ ft;   /* a lane is zero iff the inputs match */
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        if (((delta >> pos) & 0xffff) == 0) {
            fd |= 0xffffull << pos;
        }
    }
    return fd;
}
516*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Compare the four halfword lanes of fs and ft for "greater than".  Lanes
 * are compared as signed 16-bit integers; a lane of the result is 0xffff
 * where fs > ft and zero otherwise.
 */
uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 16) {
        int16_t a = (int16_t)(uint16_t)(fs >> pos);
        int16_t b = (int16_t)(uint16_t)(ft >> pos);

        if (a > b) {
            fd |= 0xffffull << pos;
        }
    }
    return fd;
}
529*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Compare the eight byte lanes of fs and ft for equality.  A lane of the
 * result is 0xff where the inputs match and zero where they differ.
 */
uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
{
    uint64_t delta = fs ^ ft;   /* a lane is zero iff the inputs match */
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        if (((delta >> pos) & 0xff) == 0) {
            fd |= 0xffull << pos;
        }
    }
    return fd;
}
542*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Compare the eight byte lanes of fs and ft for "greater than".  Lanes
 * are compared as signed 8-bit integers; a lane of the result is 0xff
 * where fs > ft and zero otherwise.
 */
uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int pos;

    for (pos = 0; pos < 64; pos += 8) {
        int8_t a = (int8_t)(uint8_t)(fs >> pos);
        int8_t b = (int8_t)(uint8_t)(ft >> pos);

        if (a > b) {
            fd |= 0xffull << pos;
        }
    }
    return fd;
}
555*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Shift both word lanes of fs left by the count in the low seven bits of
 * ft.  Counts above 31 clear the whole result.
 */
uint64_t helper_psllw(uint64_t fs, uint64_t ft)
{
    unsigned int count = ft & 0x7f;
    uint32_t lo, hi;

    if (count > 31) {
        return 0;
    }
    lo = (uint32_t)fs << count;
    hi = (uint32_t)(fs >> 32) << count;
    return ((uint64_t)hi << 32) | lo;
}
571*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Logically shift both word lanes of fs right by the count in the low
 * seven bits of ft.  Counts above 31 clear the whole result.
 */
uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
{
    unsigned int count = ft & 0x7f;
    uint32_t lo, hi;

    if (count > 31) {
        return 0;
    }
    lo = (uint32_t)fs >> count;
    hi = (uint32_t)(fs >> 32) >> count;
    return ((uint64_t)hi << 32) | lo;
}
587*a2b0a27dSPhilippe Mathieu-Daudé 
588*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psraw(uint64_t fs, uint64_t ft)
589*a2b0a27dSPhilippe Mathieu-Daudé {
590*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs;
591*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
592*a2b0a27dSPhilippe Mathieu-Daudé 
593*a2b0a27dSPhilippe Mathieu-Daudé     ft &= 0x7f;
594*a2b0a27dSPhilippe Mathieu-Daudé     if (ft > 31) {
595*a2b0a27dSPhilippe Mathieu-Daudé         ft = 31;
596*a2b0a27dSPhilippe Mathieu-Daudé     }
597*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
598*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 2; ++i) {
599*a2b0a27dSPhilippe Mathieu-Daudé         vs.sw[i] >>= ft;
600*a2b0a27dSPhilippe Mathieu-Daudé     }
601*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
602*a2b0a27dSPhilippe Mathieu-Daudé }
603*a2b0a27dSPhilippe Mathieu-Daudé 
604*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psllh(uint64_t fs, uint64_t ft)
605*a2b0a27dSPhilippe Mathieu-Daudé {
606*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs;
607*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
608*a2b0a27dSPhilippe Mathieu-Daudé 
609*a2b0a27dSPhilippe Mathieu-Daudé     ft &= 0x7f;
610*a2b0a27dSPhilippe Mathieu-Daudé     if (ft > 15) {
611*a2b0a27dSPhilippe Mathieu-Daudé         return 0;
612*a2b0a27dSPhilippe Mathieu-Daudé     }
613*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
614*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 4; ++i) {
615*a2b0a27dSPhilippe Mathieu-Daudé         vs.uh[i] <<= ft;
616*a2b0a27dSPhilippe Mathieu-Daudé     }
617*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
618*a2b0a27dSPhilippe Mathieu-Daudé }
619*a2b0a27dSPhilippe Mathieu-Daudé 
620*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
621*a2b0a27dSPhilippe Mathieu-Daudé {
622*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs;
623*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
624*a2b0a27dSPhilippe Mathieu-Daudé 
625*a2b0a27dSPhilippe Mathieu-Daudé     ft &= 0x7f;
626*a2b0a27dSPhilippe Mathieu-Daudé     if (ft > 15) {
627*a2b0a27dSPhilippe Mathieu-Daudé         return 0;
628*a2b0a27dSPhilippe Mathieu-Daudé     }
629*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
630*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 4; ++i) {
631*a2b0a27dSPhilippe Mathieu-Daudé         vs.uh[i] >>= ft;
632*a2b0a27dSPhilippe Mathieu-Daudé     }
633*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
634*a2b0a27dSPhilippe Mathieu-Daudé }
635*a2b0a27dSPhilippe Mathieu-Daudé 
636*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psrah(uint64_t fs, uint64_t ft)
637*a2b0a27dSPhilippe Mathieu-Daudé {
638*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs;
639*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
640*a2b0a27dSPhilippe Mathieu-Daudé 
641*a2b0a27dSPhilippe Mathieu-Daudé     ft &= 0x7f;
642*a2b0a27dSPhilippe Mathieu-Daudé     if (ft > 15) {
643*a2b0a27dSPhilippe Mathieu-Daudé         ft = 15;
644*a2b0a27dSPhilippe Mathieu-Daudé     }
645*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
646*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 4; ++i) {
647*a2b0a27dSPhilippe Mathieu-Daudé         vs.sh[i] >>= ft;
648*a2b0a27dSPhilippe Mathieu-Daudé     }
649*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
650*a2b0a27dSPhilippe Mathieu-Daudé }
651*a2b0a27dSPhilippe Mathieu-Daudé 
652*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
653*a2b0a27dSPhilippe Mathieu-Daudé {
654*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs, vt;
655*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
656*a2b0a27dSPhilippe Mathieu-Daudé 
657*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
658*a2b0a27dSPhilippe Mathieu-Daudé     vt.d = ft;
659*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 4; ++i) {
660*a2b0a27dSPhilippe Mathieu-Daudé         vs.sh[i] *= vt.sh[i];
661*a2b0a27dSPhilippe Mathieu-Daudé     }
662*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
663*a2b0a27dSPhilippe Mathieu-Daudé }
664*a2b0a27dSPhilippe Mathieu-Daudé 
665*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
666*a2b0a27dSPhilippe Mathieu-Daudé {
667*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs, vt;
668*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
669*a2b0a27dSPhilippe Mathieu-Daudé 
670*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
671*a2b0a27dSPhilippe Mathieu-Daudé     vt.d = ft;
672*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 4; ++i) {
673*a2b0a27dSPhilippe Mathieu-Daudé         int32_t r = vs.sh[i] * vt.sh[i];
674*a2b0a27dSPhilippe Mathieu-Daudé         vs.sh[i] = r >> 16;
675*a2b0a27dSPhilippe Mathieu-Daudé     }
676*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
677*a2b0a27dSPhilippe Mathieu-Daudé }
678*a2b0a27dSPhilippe Mathieu-Daudé 
679*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
680*a2b0a27dSPhilippe Mathieu-Daudé {
681*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs, vt;
682*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
683*a2b0a27dSPhilippe Mathieu-Daudé 
684*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
685*a2b0a27dSPhilippe Mathieu-Daudé     vt.d = ft;
686*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 4; ++i) {
687*a2b0a27dSPhilippe Mathieu-Daudé         uint32_t r = vs.uh[i] * vt.uh[i];
688*a2b0a27dSPhilippe Mathieu-Daudé         vs.uh[i] = r >> 16;
689*a2b0a27dSPhilippe Mathieu-Daudé     }
690*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
691*a2b0a27dSPhilippe Mathieu-Daudé }
692*a2b0a27dSPhilippe Mathieu-Daudé 
693*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
694*a2b0a27dSPhilippe Mathieu-Daudé {
695*a2b0a27dSPhilippe Mathieu-Daudé     unsigned host = BYTE_ORDER_XOR(3);
696*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs, vt;
697*a2b0a27dSPhilippe Mathieu-Daudé     uint32_t p0, p1;
698*a2b0a27dSPhilippe Mathieu-Daudé 
699*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
700*a2b0a27dSPhilippe Mathieu-Daudé     vt.d = ft;
701*a2b0a27dSPhilippe Mathieu-Daudé     p0  = vs.sh[0 ^ host] * vt.sh[0 ^ host];
702*a2b0a27dSPhilippe Mathieu-Daudé     p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];
703*a2b0a27dSPhilippe Mathieu-Daudé     p1  = vs.sh[2 ^ host] * vt.sh[2 ^ host];
704*a2b0a27dSPhilippe Mathieu-Daudé     p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
705*a2b0a27dSPhilippe Mathieu-Daudé 
706*a2b0a27dSPhilippe Mathieu-Daudé     return ((uint64_t)p1 << 32) | p0;
707*a2b0a27dSPhilippe Mathieu-Daudé }
708*a2b0a27dSPhilippe Mathieu-Daudé 
709*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
710*a2b0a27dSPhilippe Mathieu-Daudé {
711*a2b0a27dSPhilippe Mathieu-Daudé     LMIValue vs, vt;
712*a2b0a27dSPhilippe Mathieu-Daudé     unsigned i;
713*a2b0a27dSPhilippe Mathieu-Daudé 
714*a2b0a27dSPhilippe Mathieu-Daudé     vs.d = fs;
715*a2b0a27dSPhilippe Mathieu-Daudé     vt.d = ft;
716*a2b0a27dSPhilippe Mathieu-Daudé     for (i = 0; i < 8; ++i) {
717*a2b0a27dSPhilippe Mathieu-Daudé         int r = vs.ub[i] - vt.ub[i];
718*a2b0a27dSPhilippe Mathieu-Daudé         vs.ub[i] = (r < 0 ? -r : r);
719*a2b0a27dSPhilippe Mathieu-Daudé     }
720*a2b0a27dSPhilippe Mathieu-Daudé     return vs.d;
721*a2b0a27dSPhilippe Mathieu-Daudé }
722*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Sum the eight bytes of @fs.
 *
 * Returns the total masked to its low 16 bits.
 */
uint64_t helper_biadd(uint64_t fs)
{
    uint64_t rest = fs;
    unsigned sum = 0;

    /* Peel off the lowest byte until no set bits remain. */
    while (rest != 0) {
        sum += rest & 0xff;
        rest >>= 8;
    }
    return sum & 0xffff;
}
732*a2b0a27dSPhilippe Mathieu-Daudé 
/*
 * Gather the top bit of each of the eight bytes of @fs.
 *
 * Bit k of the result is bit 7 of byte k of @fs (byte 0 being the least
 * significant).  Returns the resulting 8-bit mask.
 */
uint64_t helper_pmovmskb(uint64_t fs)
{
    unsigned mask = 0;
    unsigned k;

    for (k = 0; k < 8; k++) {
        /* Bit 7 of byte k sits at bit position 8 * k + 7. */
        mask |= ((fs >> (8 * k + 7)) & 1) << k;
    }
    return mask & 0xff;
}
748