xref: /openbmc/qemu/target/loongarch/tcg/vec_helper.c (revision 5c23704e)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * QEMU LoongArch vector helper functions.
 *
 * Copyright (c) 2022-2023 Loongson Technology Corporation Limited
 */
7*5c23704eSSong Gao 
8*5c23704eSSong Gao #include "qemu/osdep.h"
9*5c23704eSSong Gao #include "cpu.h"
10*5c23704eSSong Gao #include "exec/exec-all.h"
11*5c23704eSSong Gao #include "exec/helper-proto.h"
12*5c23704eSSong Gao #include "fpu/softfloat.h"
13*5c23704eSSong Gao #include "internals.h"
14*5c23704eSSong Gao #include "tcg/tcg.h"
15*5c23704eSSong Gao #include "vec.h"
16*5c23704eSSong Gao #include "tcg/tcg-gvec-desc.h"
17*5c23704eSSong Gao 
/*
 * DO_ODD_EVEN - define a widening helper pairing odd and even elements.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor for destination elements; its element type is
 *        also the type both operands are cast to before DO_OP.
 * E2:    VReg accessor for source elements.
 * DO_OP: two-operand macro applied to the cast operands.
 *
 * Destination element n is DO_OP(vj element 2 * n + 1, vk element 2 * n).
 * The element count is the operation size from desc divided by BIT / 8.
 */
#define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP)                          \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) Wide;                                 \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        Wide odd = (Wide)src_j->E2(2 * n + 1);                         \
        Wide even = (Wide)src_k->E2(2 * n);                            \
                                                                       \
        dst->E1(n) = DO_OP(odd, even);                                 \
    }                                                                  \
}
32*5c23704eSSong Gao 
33*5c23704eSSong Gao DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD)
34*5c23704eSSong Gao DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD)
35*5c23704eSSong Gao DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD)
36*5c23704eSSong Gao 
HELPER(vhaddw_q_d)37*5c23704eSSong Gao void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
38*5c23704eSSong Gao {
39*5c23704eSSong Gao     int i;
40*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
41*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
42*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
43*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
44*5c23704eSSong Gao 
45*5c23704eSSong Gao     for (i = 0; i < oprsz / 16 ; i++) {
46*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
47*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i)));
48*5c23704eSSong Gao     }
49*5c23704eSSong Gao }
50*5c23704eSSong Gao 
51*5c23704eSSong Gao DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB)
52*5c23704eSSong Gao DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB)
53*5c23704eSSong Gao DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB)
54*5c23704eSSong Gao 
HELPER(vhsubw_q_d)55*5c23704eSSong Gao void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
56*5c23704eSSong Gao {
57*5c23704eSSong Gao     int i;
58*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
59*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
60*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
61*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
62*5c23704eSSong Gao 
63*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
64*5c23704eSSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
65*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i)));
66*5c23704eSSong Gao     }
67*5c23704eSSong Gao }
68*5c23704eSSong Gao 
69*5c23704eSSong Gao DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD)
70*5c23704eSSong Gao DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD)
71*5c23704eSSong Gao DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD)
72*5c23704eSSong Gao 
HELPER(vhaddw_qu_du)73*5c23704eSSong Gao void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
74*5c23704eSSong Gao {
75*5c23704eSSong Gao     int i;
76*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
77*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
78*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
79*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
80*5c23704eSSong Gao 
81*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i ++) {
82*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
83*5c23704eSSong Gao                               int128_make64(Vk->UD(2 * i)));
84*5c23704eSSong Gao     }
85*5c23704eSSong Gao }
86*5c23704eSSong Gao 
87*5c23704eSSong Gao DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB)
88*5c23704eSSong Gao DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB)
89*5c23704eSSong Gao DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB)
90*5c23704eSSong Gao 
HELPER(vhsubw_qu_du)91*5c23704eSSong Gao void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
92*5c23704eSSong Gao {
93*5c23704eSSong Gao     int i;
94*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
95*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
96*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
97*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
98*5c23704eSSong Gao 
99*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
100*5c23704eSSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
101*5c23704eSSong Gao                               int128_make64(Vk->UD(2 * i)));
102*5c23704eSSong Gao     }
103*5c23704eSSong Gao }
104*5c23704eSSong Gao 
/*
 * DO_EVEN - define a widening helper operating on even source elements.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor for destination elements; its element type is
 *        the type both operands are cast to before DO_OP.
 * E2:    VReg accessor for source elements.
 * DO_OP: two-operand macro applied to the cast operands.
 *
 * Destination element n is DO_OP(vj element 2 * n, vk element 2 * n).
 */
#define DO_EVEN(NAME, BIT, E1, E2, DO_OP)                              \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) Wide;                                 \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        Wide lhs = (Wide)src_j->E2(2 * n);                             \
        Wide rhs = (Wide)src_k->E2(2 * n);                             \
                                                                       \
        dst->E1(n) = DO_OP(lhs, rhs);                                  \
    }                                                                  \
}
119*5c23704eSSong Gao 
/*
 * DO_ODD - define a widening helper operating on odd source elements.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor for destination elements; its element type is
 *        the type both operands are cast to before DO_OP.
 * E2:    VReg accessor for source elements.
 * DO_OP: two-operand macro applied to the cast operands.
 *
 * Destination element n is DO_OP(vj element 2 * n + 1, vk element 2 * n + 1).
 */
#define DO_ODD(NAME, BIT, E1, E2, DO_OP)                               \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) Wide;                                 \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        Wide lhs = (Wide)src_j->E2(2 * n + 1);                         \
        Wide rhs = (Wide)src_k->E2(2 * n + 1);                         \
                                                                       \
        dst->E1(n) = DO_OP(lhs, rhs);                                  \
    }                                                                  \
}
134*5c23704eSSong Gao 
HELPER(vaddwev_q_d)135*5c23704eSSong Gao void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
136*5c23704eSSong Gao {
137*5c23704eSSong Gao     int i;
138*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
139*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
140*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
141*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
142*5c23704eSSong Gao 
143*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
144*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)),
145*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i)));
146*5c23704eSSong Gao     }
147*5c23704eSSong Gao }
148*5c23704eSSong Gao 
149*5c23704eSSong Gao DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD)
150*5c23704eSSong Gao DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD)
151*5c23704eSSong Gao DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD)
152*5c23704eSSong Gao 
HELPER(vaddwod_q_d)153*5c23704eSSong Gao void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
154*5c23704eSSong Gao {
155*5c23704eSSong Gao     int i;
156*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
157*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
158*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
159*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
160*5c23704eSSong Gao 
161*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
162*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)),
163*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i +1)));
164*5c23704eSSong Gao     }
165*5c23704eSSong Gao }
166*5c23704eSSong Gao 
167*5c23704eSSong Gao DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD)
168*5c23704eSSong Gao DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD)
169*5c23704eSSong Gao DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD)
170*5c23704eSSong Gao 
HELPER(vsubwev_q_d)171*5c23704eSSong Gao void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
172*5c23704eSSong Gao {
173*5c23704eSSong Gao     int i;
174*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
175*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
176*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
177*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
178*5c23704eSSong Gao 
179*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
180*5c23704eSSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)),
181*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i)));
182*5c23704eSSong Gao     }
183*5c23704eSSong Gao }
184*5c23704eSSong Gao 
185*5c23704eSSong Gao DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB)
186*5c23704eSSong Gao DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB)
187*5c23704eSSong Gao DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB)
188*5c23704eSSong Gao 
HELPER(vsubwod_q_d)189*5c23704eSSong Gao void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
190*5c23704eSSong Gao {
191*5c23704eSSong Gao     int i;
192*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
193*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
194*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
195*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
196*5c23704eSSong Gao 
197*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
198*5c23704eSSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
199*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i + 1)));
200*5c23704eSSong Gao     }
201*5c23704eSSong Gao }
202*5c23704eSSong Gao 
203*5c23704eSSong Gao DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB)
204*5c23704eSSong Gao DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB)
205*5c23704eSSong Gao DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB)
206*5c23704eSSong Gao 
HELPER(vaddwev_q_du)207*5c23704eSSong Gao void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
208*5c23704eSSong Gao {
209*5c23704eSSong Gao     int i;
210*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
211*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
212*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
213*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
214*5c23704eSSong Gao 
215*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
216*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
217*5c23704eSSong Gao                               int128_make64(Vk->UD(2 * i)));
218*5c23704eSSong Gao     }
219*5c23704eSSong Gao }
220*5c23704eSSong Gao 
221*5c23704eSSong Gao DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD)
222*5c23704eSSong Gao DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD)
223*5c23704eSSong Gao DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD)
224*5c23704eSSong Gao 
HELPER(vaddwod_q_du)225*5c23704eSSong Gao void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
226*5c23704eSSong Gao {
227*5c23704eSSong Gao     int i;
228*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
229*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
230*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
231*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
232*5c23704eSSong Gao 
233*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
234*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
235*5c23704eSSong Gao                               int128_make64(Vk->UD(2 * i + 1)));
236*5c23704eSSong Gao     }
237*5c23704eSSong Gao }
238*5c23704eSSong Gao 
239*5c23704eSSong Gao DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD)
240*5c23704eSSong Gao DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD)
241*5c23704eSSong Gao DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD)
242*5c23704eSSong Gao 
HELPER(vsubwev_q_du)243*5c23704eSSong Gao void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
244*5c23704eSSong Gao {
245*5c23704eSSong Gao     int i;
246*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
247*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
248*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
249*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
250*5c23704eSSong Gao 
251*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
252*5c23704eSSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)),
253*5c23704eSSong Gao                               int128_make64(Vk->UD(2 * i)));
254*5c23704eSSong Gao     }
255*5c23704eSSong Gao }
256*5c23704eSSong Gao 
257*5c23704eSSong Gao DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB)
258*5c23704eSSong Gao DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB)
259*5c23704eSSong Gao DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB)
260*5c23704eSSong Gao 
HELPER(vsubwod_q_du)261*5c23704eSSong Gao void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
262*5c23704eSSong Gao {
263*5c23704eSSong Gao     int i;
264*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
265*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
266*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
267*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
268*5c23704eSSong Gao 
269*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
270*5c23704eSSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
271*5c23704eSSong Gao                               int128_make64(Vk->UD(2 * i + 1)));
272*5c23704eSSong Gao     }
273*5c23704eSSong Gao }
274*5c23704eSSong Gao 
275*5c23704eSSong Gao DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB)
276*5c23704eSSong Gao DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB)
277*5c23704eSSong Gao DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB)
278*5c23704eSSong Gao 
/*
 * DO_EVEN_U_S - define a widening helper on even elements where the vj
 * operand is read through the unsigned accessor and the vk operand
 * through the signed one.
 *
 * NAME:     helper name handed to HELPER().
 * BIT:      width in bits of one destination element.
 * ES1/EU1:  signed/unsigned VReg accessors at destination width; the
 *           result is stored through ES1 and their element types are the
 *           cast targets for the vk and vj operands respectively.
 * ES2/EU2:  signed/unsigned VReg accessors at source width.
 * DO_OP:    two-operand macro applied to the cast operands.
 *
 * Destination element n is DO_OP(vj EU2 element 2 * n, vk ES2 element 2 * n).
 */
#define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)              \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->ES1(0)) WideS;                               \
    typedef __typeof(dst->EU1(0)) WideU;                               \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        WideU lhs = (WideU)src_j->EU2(2 * n);                          \
        WideS rhs = (WideS)src_k->ES2(2 * n);                          \
                                                                       \
        dst->ES1(n) = DO_OP(lhs, rhs);                                 \
    }                                                                  \
}
294*5c23704eSSong Gao 
/*
 * DO_ODD_U_S - define a widening helper on odd elements where the vj
 * operand is read through the unsigned accessor and the vk operand
 * through the signed one.
 *
 * NAME:     helper name handed to HELPER().
 * BIT:      width in bits of one destination element.
 * ES1/EU1:  signed/unsigned VReg accessors at destination width; the
 *           result is stored through ES1 and their element types are the
 *           cast targets for the vk and vj operands respectively.
 * ES2/EU2:  signed/unsigned VReg accessors at source width.
 * DO_OP:    two-operand macro applied to the cast operands.
 *
 * Destination element n is
 * DO_OP(vj EU2 element 2 * n + 1, vk ES2 element 2 * n + 1).
 */
#define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)               \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->ES1(0)) WideS;                               \
    typedef __typeof(dst->EU1(0)) WideU;                               \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        WideU lhs = (WideU)src_j->EU2(2 * n + 1);                      \
        WideS rhs = (WideS)src_k->ES2(2 * n + 1);                      \
                                                                       \
        dst->ES1(n) = DO_OP(lhs, rhs);                                 \
    }                                                                  \
}
310*5c23704eSSong Gao 
HELPER(vaddwev_q_du_d)311*5c23704eSSong Gao void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
312*5c23704eSSong Gao {
313*5c23704eSSong Gao     int i;
314*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
315*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
316*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
317*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
318*5c23704eSSong Gao 
319*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
320*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
321*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i)));
322*5c23704eSSong Gao     }
323*5c23704eSSong Gao }
324*5c23704eSSong Gao 
325*5c23704eSSong Gao DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD)
326*5c23704eSSong Gao DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD)
327*5c23704eSSong Gao DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD)
328*5c23704eSSong Gao 
HELPER(vaddwod_q_du_d)329*5c23704eSSong Gao void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
330*5c23704eSSong Gao {
331*5c23704eSSong Gao     int i;
332*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
333*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
334*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
335*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
336*5c23704eSSong Gao 
337*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
338*5c23704eSSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
339*5c23704eSSong Gao                               int128_makes64(Vk->D(2 * i + 1)));
340*5c23704eSSong Gao     }
341*5c23704eSSong Gao }
342*5c23704eSSong Gao 
343*5c23704eSSong Gao DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD)
344*5c23704eSSong Gao DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD)
345*5c23704eSSong Gao DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD)
346*5c23704eSSong Gao 
/*
 * DO_3OP - define an element-wise helper with two sources.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element.
 * E:     VReg accessor used for the destination and both sources.
 * DO_OP: two-operand macro combining the vj and vk elements.
 *
 * Destination element n is DO_OP(vj element n, vk element n).
 */
#define DO_3OP(NAME, BIT, E, DO_OP)                                    \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        dst->E(n) = DO_OP(src_j->E(n), src_k->E(n));                   \
    }                                                                  \
}
360*5c23704eSSong Gao 
361*5c23704eSSong Gao DO_3OP(vavg_b, 8, B, DO_VAVG)
362*5c23704eSSong Gao DO_3OP(vavg_h, 16, H, DO_VAVG)
363*5c23704eSSong Gao DO_3OP(vavg_w, 32, W, DO_VAVG)
364*5c23704eSSong Gao DO_3OP(vavg_d, 64, D, DO_VAVG)
365*5c23704eSSong Gao DO_3OP(vavgr_b, 8, B, DO_VAVGR)
366*5c23704eSSong Gao DO_3OP(vavgr_h, 16, H, DO_VAVGR)
367*5c23704eSSong Gao DO_3OP(vavgr_w, 32, W, DO_VAVGR)
368*5c23704eSSong Gao DO_3OP(vavgr_d, 64, D, DO_VAVGR)
369*5c23704eSSong Gao DO_3OP(vavg_bu, 8, UB, DO_VAVG)
370*5c23704eSSong Gao DO_3OP(vavg_hu, 16, UH, DO_VAVG)
371*5c23704eSSong Gao DO_3OP(vavg_wu, 32, UW, DO_VAVG)
372*5c23704eSSong Gao DO_3OP(vavg_du, 64, UD, DO_VAVG)
373*5c23704eSSong Gao DO_3OP(vavgr_bu, 8, UB, DO_VAVGR)
374*5c23704eSSong Gao DO_3OP(vavgr_hu, 16, UH, DO_VAVGR)
375*5c23704eSSong Gao DO_3OP(vavgr_wu, 32, UW, DO_VAVGR)
376*5c23704eSSong Gao DO_3OP(vavgr_du, 64, UD, DO_VAVGR)
377*5c23704eSSong Gao 
378*5c23704eSSong Gao DO_3OP(vabsd_b, 8, B, DO_VABSD)
379*5c23704eSSong Gao DO_3OP(vabsd_h, 16, H, DO_VABSD)
380*5c23704eSSong Gao DO_3OP(vabsd_w, 32, W, DO_VABSD)
381*5c23704eSSong Gao DO_3OP(vabsd_d, 64, D, DO_VABSD)
382*5c23704eSSong Gao DO_3OP(vabsd_bu, 8, UB, DO_VABSD)
383*5c23704eSSong Gao DO_3OP(vabsd_hu, 16, UH, DO_VABSD)
384*5c23704eSSong Gao DO_3OP(vabsd_wu, 32, UW, DO_VABSD)
385*5c23704eSSong Gao DO_3OP(vabsd_du, 64, UD, DO_VABSD)
386*5c23704eSSong Gao 
/*
 * DO_VADDA - define a helper that adds DO_VABS() of each operand.
 *
 * NAME: helper name handed to HELPER().
 * BIT:  width in bits of one element.
 * E:    VReg accessor used for the destination and both sources.
 *
 * Destination element n is DO_VABS(vj element n) + DO_VABS(vk element n).
 */
#define DO_VADDA(NAME, BIT, E)                                         \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        dst->E(n) = DO_VABS(src_j->E(n)) + DO_VABS(src_k->E(n));       \
    }                                                                  \
}
400*5c23704eSSong Gao 
401*5c23704eSSong Gao DO_VADDA(vadda_b, 8, B)
402*5c23704eSSong Gao DO_VADDA(vadda_h, 16, H)
403*5c23704eSSong Gao DO_VADDA(vadda_w, 32, W)
404*5c23704eSSong Gao DO_VADDA(vadda_d, 64, D)
405*5c23704eSSong Gao 
/*
 * VMINMAXI - define an element-wise helper comparing against an immediate.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element.
 * E:     VReg accessor used for the destination and the source.
 * DO_OP: two-operand macro (DO_MIN or DO_MAX in this file).
 *
 * The immediate is cast to the element type of E, and destination
 * element n is DO_OP(vj element n, cast immediate).
 */
#define VMINMAXI(NAME, BIT, E, DO_OP)                                  \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)     \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    typedef __typeof(dst->E(0)) Elem;                                  \
    const Elem bound = (Elem)imm;                                      \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        dst->E(n) = DO_OP(src_j->E(n), bound);                         \
    }                                                                  \
}
419*5c23704eSSong Gao 
420*5c23704eSSong Gao VMINMAXI(vmini_b, 8, B, DO_MIN)
421*5c23704eSSong Gao VMINMAXI(vmini_h, 16, H, DO_MIN)
422*5c23704eSSong Gao VMINMAXI(vmini_w, 32, W, DO_MIN)
423*5c23704eSSong Gao VMINMAXI(vmini_d, 64, D, DO_MIN)
424*5c23704eSSong Gao VMINMAXI(vmaxi_b, 8, B, DO_MAX)
425*5c23704eSSong Gao VMINMAXI(vmaxi_h, 16, H, DO_MAX)
426*5c23704eSSong Gao VMINMAXI(vmaxi_w, 32, W, DO_MAX)
427*5c23704eSSong Gao VMINMAXI(vmaxi_d, 64, D, DO_MAX)
428*5c23704eSSong Gao VMINMAXI(vmini_bu, 8, UB, DO_MIN)
429*5c23704eSSong Gao VMINMAXI(vmini_hu, 16, UH, DO_MIN)
430*5c23704eSSong Gao VMINMAXI(vmini_wu, 32, UW, DO_MIN)
431*5c23704eSSong Gao VMINMAXI(vmini_du, 64, UD, DO_MIN)
432*5c23704eSSong Gao VMINMAXI(vmaxi_bu, 8, UB, DO_MAX)
433*5c23704eSSong Gao VMINMAXI(vmaxi_hu, 16, UH, DO_MAX)
434*5c23704eSSong Gao VMINMAXI(vmaxi_wu, 32, UW, DO_MAX)
435*5c23704eSSong Gao VMINMAXI(vmaxi_du, 64, UD, DO_MAX)
436*5c23704eSSong Gao 
/*
 * DO_VMUH - define a helper storing the upper half of an element product.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element (also the right-shift amount).
 * E1:    VReg accessor whose element type is used as the wider type
 *        for the multiplication.
 * E2:    VReg accessor used for the destination and both sources.
 * DO_OP: accepted but not referenced by the body.
 *
 * Destination element n is ((wide)vj[n] * (wide)vk[n]) >> BIT.
 */
#define DO_VMUH(NAME, BIT, E1, E2, DO_OP)                              \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) Wide;                                 \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        Wide lhs = (Wide)src_j->E2(n);                                 \
        Wide rhs = (Wide)src_k->E2(n);                                 \
                                                                       \
        dst->E2(n) = (lhs * rhs) >> BIT;                               \
    }                                                                  \
}
451*5c23704eSSong Gao 
HELPER(vmuh_d)452*5c23704eSSong Gao void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc)
453*5c23704eSSong Gao {
454*5c23704eSSong Gao     int i;
455*5c23704eSSong Gao     uint64_t l, h;
456*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
457*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
458*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
459*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
460*5c23704eSSong Gao 
461*5c23704eSSong Gao     for (i = 0; i < oprsz / 8; i++) {
462*5c23704eSSong Gao         muls64(&l, &h, Vj->D(i), Vk->D(i));
463*5c23704eSSong Gao         Vd->D(i) = h;
464*5c23704eSSong Gao     }
465*5c23704eSSong Gao }
466*5c23704eSSong Gao 
467*5c23704eSSong Gao DO_VMUH(vmuh_b, 8, H, B, DO_MUH)
468*5c23704eSSong Gao DO_VMUH(vmuh_h, 16, W, H, DO_MUH)
469*5c23704eSSong Gao DO_VMUH(vmuh_w, 32, D, W, DO_MUH)
470*5c23704eSSong Gao 
HELPER(vmuh_du)471*5c23704eSSong Gao void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc)
472*5c23704eSSong Gao {
473*5c23704eSSong Gao     int i;
474*5c23704eSSong Gao     uint64_t l, h;
475*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
476*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
477*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
478*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
479*5c23704eSSong Gao 
480*5c23704eSSong Gao     for (i = 0; i < oprsz / 8; i++) {
481*5c23704eSSong Gao         mulu64(&l, &h, Vj->D(i), Vk->D(i));
482*5c23704eSSong Gao         Vd->D(i) = h;
483*5c23704eSSong Gao     }
484*5c23704eSSong Gao }
485*5c23704eSSong Gao 
486*5c23704eSSong Gao DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH)
487*5c23704eSSong Gao DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH)
488*5c23704eSSong Gao DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH)
489*5c23704eSSong Gao 
490*5c23704eSSong Gao DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL)
491*5c23704eSSong Gao DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL)
492*5c23704eSSong Gao DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL)
493*5c23704eSSong Gao 
494*5c23704eSSong Gao DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL)
495*5c23704eSSong Gao DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL)
496*5c23704eSSong Gao DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL)
497*5c23704eSSong Gao 
498*5c23704eSSong Gao DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL)
499*5c23704eSSong Gao DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL)
500*5c23704eSSong Gao DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL)
501*5c23704eSSong Gao 
502*5c23704eSSong Gao DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL)
503*5c23704eSSong Gao DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL)
504*5c23704eSSong Gao DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL)
505*5c23704eSSong Gao 
506*5c23704eSSong Gao DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
507*5c23704eSSong Gao DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
508*5c23704eSSong Gao DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
509*5c23704eSSong Gao 
510*5c23704eSSong Gao DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
511*5c23704eSSong Gao DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
512*5c23704eSSong Gao DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
513*5c23704eSSong Gao 
/*
 * VMADDSUB - define an element-wise helper that also reads the destination.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element.
 * E:     VReg accessor used for the destination and both sources.
 * DO_OP: three-operand macro (DO_MADD or DO_MSUB in this file).
 *
 * Destination element n becomes
 * DO_OP(previous vd element n, vj element n, vk element n).
 */
#define VMADDSUB(NAME, BIT, E, DO_OP)                                  \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        dst->E(n) = DO_OP(dst->E(n), src_j->E(n), src_k->E(n));        \
    }                                                                  \
}
527*5c23704eSSong Gao 
528*5c23704eSSong Gao VMADDSUB(vmadd_b, 8, B, DO_MADD)
529*5c23704eSSong Gao VMADDSUB(vmadd_h, 16, H, DO_MADD)
530*5c23704eSSong Gao VMADDSUB(vmadd_w, 32, W, DO_MADD)
531*5c23704eSSong Gao VMADDSUB(vmadd_d, 64, D, DO_MADD)
532*5c23704eSSong Gao VMADDSUB(vmsub_b, 8, B, DO_MSUB)
533*5c23704eSSong Gao VMADDSUB(vmsub_h, 16, H, DO_MSUB)
534*5c23704eSSong Gao VMADDSUB(vmsub_w, 32, W, DO_MSUB)
535*5c23704eSSong Gao VMADDSUB(vmsub_d, 64, D, DO_MSUB)
536*5c23704eSSong Gao 
/*
 * VMADDWEV - define a widening accumulate helper on even source elements.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor for destination elements; its element type is
 *        the type both operands are cast to before DO_OP.
 * E2:    VReg accessor for source elements.
 * DO_OP: two-operand macro applied to the cast operands.
 *
 * Destination element n is incremented by
 * DO_OP(vj element 2 * n, vk element 2 * n).
 */
#define VMADDWEV(NAME, BIT, E1, E2, DO_OP)                             \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) Wide;                                 \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int n = 0; n < nelem; n++) {                                  \
        Wide lhs = (Wide)src_j->E2(2 * n);                             \
        Wide rhs = (Wide)src_k->E2(2 * n);                             \
                                                                       \
        dst->E1(n) += DO_OP(lhs, rhs);                                 \
    }                                                                  \
}
551*5c23704eSSong Gao 
552*5c23704eSSong Gao VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL)
553*5c23704eSSong Gao VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL)
554*5c23704eSSong Gao VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL)
555*5c23704eSSong Gao VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL)
556*5c23704eSSong Gao VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL)
557*5c23704eSSong Gao VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL)
558*5c23704eSSong Gao 
559*5c23704eSSong Gao #define VMADDWOD(NAME, BIT, E1, E2, DO_OP)                     \
560*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
561*5c23704eSSong Gao {                                                              \
562*5c23704eSSong Gao     int i;                                                     \
563*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                     \
564*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                     \
565*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                     \
566*5c23704eSSong Gao     typedef __typeof(Vd->E1(0)) TD;                            \
567*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                              \
568*5c23704eSSong Gao                                                                \
569*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
570*5c23704eSSong Gao         Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1),              \
571*5c23704eSSong Gao                            (TD)Vk->E2(2 * i + 1));             \
572*5c23704eSSong Gao     }                                                          \
573*5c23704eSSong Gao }
574*5c23704eSSong Gao 
575*5c23704eSSong Gao VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL)
576*5c23704eSSong Gao VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL)
577*5c23704eSSong Gao VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL)
578*5c23704eSSong Gao VMADDWOD(vmaddwod_h_bu, 16,  UH, UB, DO_MUL)
579*5c23704eSSong Gao VMADDWOD(vmaddwod_w_hu, 32,  UW, UH, DO_MUL)
580*5c23704eSSong Gao VMADDWOD(vmaddwod_d_wu, 64,  UD, UW, DO_MUL)
581*5c23704eSSong Gao 
582*5c23704eSSong Gao #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)     \
583*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
584*5c23704eSSong Gao {                                                              \
585*5c23704eSSong Gao     int i;                                                     \
586*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                     \
587*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                     \
588*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                     \
589*5c23704eSSong Gao     typedef __typeof(Vd->ES1(0)) TS1;                          \
590*5c23704eSSong Gao     typedef __typeof(Vd->EU1(0)) TU1;                          \
591*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                              \
592*5c23704eSSong Gao                                                                \
593*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
594*5c23704eSSong Gao         Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i),               \
595*5c23704eSSong Gao                             (TS1)Vk->ES2(2 * i));              \
596*5c23704eSSong Gao     }                                                          \
597*5c23704eSSong Gao }
598*5c23704eSSong Gao 
599*5c23704eSSong Gao VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
600*5c23704eSSong Gao VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
601*5c23704eSSong Gao VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
602*5c23704eSSong Gao 
603*5c23704eSSong Gao #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)     \
604*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
605*5c23704eSSong Gao {                                                              \
606*5c23704eSSong Gao     int i;                                                     \
607*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                     \
608*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                     \
609*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                     \
610*5c23704eSSong Gao     typedef __typeof(Vd->ES1(0)) TS1;                          \
611*5c23704eSSong Gao     typedef __typeof(Vd->EU1(0)) TU1;                          \
612*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                              \
613*5c23704eSSong Gao                                                                \
614*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
615*5c23704eSSong Gao         Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1),           \
616*5c23704eSSong Gao                             (TS1)Vk->ES2(2 * i + 1));          \
617*5c23704eSSong Gao     }                                                          \
618*5c23704eSSong Gao }
619*5c23704eSSong Gao 
620*5c23704eSSong Gao VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
621*5c23704eSSong Gao VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
622*5c23704eSSong Gao VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
623*5c23704eSSong Gao 
624*5c23704eSSong Gao #define VDIV(NAME, BIT, E, DO_OP)                              \
625*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
626*5c23704eSSong Gao {                                                              \
627*5c23704eSSong Gao     int i;                                                     \
628*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                     \
629*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                     \
630*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                     \
631*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                              \
632*5c23704eSSong Gao                                                                \
633*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
634*5c23704eSSong Gao         Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i));                  \
635*5c23704eSSong Gao     }                                                          \
636*5c23704eSSong Gao }
637*5c23704eSSong Gao 
638*5c23704eSSong Gao VDIV(vdiv_b, 8, B, DO_DIV)
639*5c23704eSSong Gao VDIV(vdiv_h, 16, H, DO_DIV)
640*5c23704eSSong Gao VDIV(vdiv_w, 32, W, DO_DIV)
641*5c23704eSSong Gao VDIV(vdiv_d, 64, D, DO_DIV)
642*5c23704eSSong Gao VDIV(vdiv_bu, 8, UB, DO_DIVU)
643*5c23704eSSong Gao VDIV(vdiv_hu, 16, UH, DO_DIVU)
644*5c23704eSSong Gao VDIV(vdiv_wu, 32, UW, DO_DIVU)
645*5c23704eSSong Gao VDIV(vdiv_du, 64, UD, DO_DIVU)
646*5c23704eSSong Gao VDIV(vmod_b, 8, B, DO_REM)
647*5c23704eSSong Gao VDIV(vmod_h, 16, H, DO_REM)
648*5c23704eSSong Gao VDIV(vmod_w, 32, W, DO_REM)
649*5c23704eSSong Gao VDIV(vmod_d, 64, D, DO_REM)
650*5c23704eSSong Gao VDIV(vmod_bu, 8, UB, DO_REMU)
651*5c23704eSSong Gao VDIV(vmod_hu, 16, UH, DO_REMU)
652*5c23704eSSong Gao VDIV(vmod_wu, 32, UW, DO_REMU)
653*5c23704eSSong Gao VDIV(vmod_du, 64, UD, DO_REMU)
654*5c23704eSSong Gao 
655*5c23704eSSong Gao #define VSAT_S(NAME, BIT, E)                                       \
656*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
657*5c23704eSSong Gao {                                                                  \
658*5c23704eSSong Gao     int i;                                                         \
659*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                         \
660*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                         \
661*5c23704eSSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
662*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
663*5c23704eSSong Gao                                                                    \
664*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
665*5c23704eSSong Gao         Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max :                  \
666*5c23704eSSong Gao                    Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i);       \
667*5c23704eSSong Gao     }                                                              \
668*5c23704eSSong Gao }
669*5c23704eSSong Gao 
670*5c23704eSSong Gao VSAT_S(vsat_b, 8, B)
671*5c23704eSSong Gao VSAT_S(vsat_h, 16, H)
672*5c23704eSSong Gao VSAT_S(vsat_w, 32, W)
673*5c23704eSSong Gao VSAT_S(vsat_d, 64, D)
674*5c23704eSSong Gao 
675*5c23704eSSong Gao #define VSAT_U(NAME, BIT, E)                                       \
676*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
677*5c23704eSSong Gao {                                                                  \
678*5c23704eSSong Gao     int i;                                                         \
679*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                         \
680*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                         \
681*5c23704eSSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
682*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
683*5c23704eSSong Gao                                                                    \
684*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
685*5c23704eSSong Gao         Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i);        \
686*5c23704eSSong Gao     }                                                              \
687*5c23704eSSong Gao }
688*5c23704eSSong Gao 
689*5c23704eSSong Gao VSAT_U(vsat_bu, 8, UB)
690*5c23704eSSong Gao VSAT_U(vsat_hu, 16, UH)
691*5c23704eSSong Gao VSAT_U(vsat_wu, 32, UW)
692*5c23704eSSong Gao VSAT_U(vsat_du, 64, UD)
693*5c23704eSSong Gao 
/*
 * VEXTH: copy the upper-half source elements of each 16-byte chunk into
 * the destination, converting from the E2 to the E1 element type.
 *
 * With per_chunk = LSX_LEN / BIT, chunk c produces destination elements
 * c * per_chunk + n from source elements (2 * c + 1) * per_chunk + n.
 * Elements are processed in ascending order, exactly as before, because
 * the destination is written while the source is still being read.
 */
#define VEXTH(NAME, BIT, E1, E2)                                 \
void HELPER(NAME)(void *vd, void *vj, uint32_t desc)             \
{                                                                \
    VReg *Vd = (VReg *)vd;                                       \
    VReg *Vj = (VReg *)vj;                                       \
    int oprsz = simd_oprsz(desc);                                \
    int per_chunk = LSX_LEN / BIT;                               \
    int chunk, n;                                                \
                                                                 \
    for (chunk = 0; chunk < oprsz / 16; chunk++) {               \
        for (n = 0; n < per_chunk; n++) {                        \
            int dst = n + chunk * per_chunk;                     \
            int src = n + per_chunk + per_chunk * 2 * chunk;     \
                                                                 \
            Vd->E1(dst) = Vj->E2(src);                           \
        }                                                        \
    }                                                            \
}
709*5c23704eSSong Gao 
HELPER(vexth_q_d)710*5c23704eSSong Gao void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc)
711*5c23704eSSong Gao {
712*5c23704eSSong Gao     int i;
713*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
714*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
715*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
716*5c23704eSSong Gao 
717*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
718*5c23704eSSong Gao         Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1));
719*5c23704eSSong Gao     }
720*5c23704eSSong Gao }
721*5c23704eSSong Gao 
HELPER(vexth_qu_du)722*5c23704eSSong Gao void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc)
723*5c23704eSSong Gao {
724*5c23704eSSong Gao     int i;
725*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
726*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
727*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
728*5c23704eSSong Gao 
729*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
730*5c23704eSSong Gao         Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1));
731*5c23704eSSong Gao     }
732*5c23704eSSong Gao }
733*5c23704eSSong Gao 
/*
 * VEXTH instantiations.  Arguments are: helper name, BIT (the macro
 * produces LSX_LEN / BIT elements per 16-byte chunk), destination
 * accessor E1 and source accessor E2.
 */
VEXTH(vexth_h_b, 16, H, B)
VEXTH(vexth_w_h, 32, W, H)
VEXTH(vexth_d_w, 64, D, W)
VEXTH(vexth_hu_bu, 16, UH, UB)
VEXTH(vexth_wu_hu, 32, UW, UH)
VEXTH(vexth_du_wu, 64, UD, UW)
740*5c23704eSSong Gao 
741*5c23704eSSong Gao #define VEXT2XV(NAME, BIT, E1, E2)                   \
742*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
743*5c23704eSSong Gao {                                                    \
744*5c23704eSSong Gao     int i;                                           \
745*5c23704eSSong Gao     VReg temp = {};                                  \
746*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                           \
747*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                           \
748*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                    \
749*5c23704eSSong Gao                                                      \
750*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {        \
751*5c23704eSSong Gao         temp.E1(i) = Vj->E2(i);                      \
752*5c23704eSSong Gao     }                                                \
753*5c23704eSSong Gao     *Vd = temp;                                      \
754*5c23704eSSong Gao }
755*5c23704eSSong Gao 
756*5c23704eSSong Gao VEXT2XV(vext2xv_h_b, 16, H, B)
757*5c23704eSSong Gao VEXT2XV(vext2xv_w_b, 32, W, B)
758*5c23704eSSong Gao VEXT2XV(vext2xv_d_b, 64, D, B)
759*5c23704eSSong Gao VEXT2XV(vext2xv_w_h, 32, W, H)
760*5c23704eSSong Gao VEXT2XV(vext2xv_d_h, 64, D, H)
761*5c23704eSSong Gao VEXT2XV(vext2xv_d_w, 64, D, W)
762*5c23704eSSong Gao VEXT2XV(vext2xv_hu_bu, 16, UH, UB)
763*5c23704eSSong Gao VEXT2XV(vext2xv_wu_bu, 32, UW, UB)
764*5c23704eSSong Gao VEXT2XV(vext2xv_du_bu, 64, UD, UB)
765*5c23704eSSong Gao VEXT2XV(vext2xv_wu_hu, 32, UW, UH)
766*5c23704eSSong Gao VEXT2XV(vext2xv_du_hu, 64, UD, UH)
767*5c23704eSSong Gao VEXT2XV(vext2xv_du_wu, 64, UD, UW)
768*5c23704eSSong Gao 
/*
 * vsigncov.{b,h,w,d} helpers.  Both DO_3OP and DO_SIGNCOV are defined
 * outside this section of the file; the arguments follow the same
 * (name, bit width, accessor, operation) order used by the generators
 * above.
 */
DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV)
DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV)
DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV)
DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV)
773*5c23704eSSong Gao 
/*
 * Gather the sign bits of the eight bytes packed in val.
 *
 * Returns an 8-bit mask in the low bits of the result: bit k is set
 * when byte k of val (counting from the least significant byte) has its
 * top bit set.
 */
static uint64_t do_vmskltz_b(int64_t val)
{
    /* Keep only bit 7 of every byte. */
    uint64_t bits = val & 0x8080808080808080ULL;
    int shift;

    /* Shifts of 7, 14 and 28 funnel all eight sign bits into the top byte. */
    for (shift = 7; shift <= 28; shift *= 2) {
        bits |= bits << shift;
    }
    return bits >> 56;
}
783*5c23704eSSong Gao 
HELPER(vmskltz_b)784*5c23704eSSong Gao void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc)
785*5c23704eSSong Gao {
786*5c23704eSSong Gao     int i;
787*5c23704eSSong Gao     uint16_t temp = 0;
788*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
789*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
790*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
791*5c23704eSSong Gao 
792*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
793*5c23704eSSong Gao         temp = 0;
794*5c23704eSSong Gao         temp = do_vmskltz_b(Vj->D(2 * i));
795*5c23704eSSong Gao         temp |= (do_vmskltz_b(Vj->D(2 * i  + 1)) << 8);
796*5c23704eSSong Gao         Vd->D(2 * i) = temp;
797*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;
798*5c23704eSSong Gao     }
799*5c23704eSSong Gao }
800*5c23704eSSong Gao 
/*
 * Gather the sign bits of the four 16-bit fields packed in val.
 *
 * Returns a 4-bit mask: bit k is set when field k (counting from the
 * least significant end) has bit 15 set.
 */
static uint64_t do_vmskltz_h(int64_t val)
{
    /* Keep only bit 15 of every 16-bit field. */
    uint64_t bits = val & 0x8000800080008000ULL;

    /* Two doubling shifts line the four sign bits up in bits 60..63. */
    bits |= bits << 15;
    bits |= bits << 30;

    return bits >> 60;
}
809*5c23704eSSong Gao 
HELPER(vmskltz_h)810*5c23704eSSong Gao void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc)
811*5c23704eSSong Gao {
812*5c23704eSSong Gao     int i;
813*5c23704eSSong Gao     uint16_t temp = 0;
814*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
815*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
816*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
817*5c23704eSSong Gao 
818*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
819*5c23704eSSong Gao         temp = 0;
820*5c23704eSSong Gao         temp = do_vmskltz_h(Vj->D(2 * i));
821*5c23704eSSong Gao         temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4);
822*5c23704eSSong Gao         Vd->D(2 * i) = temp;
823*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;
824*5c23704eSSong Gao     }
825*5c23704eSSong Gao }
826*5c23704eSSong Gao 
/*
 * Gather the sign bits of the two 32-bit fields packed in val.
 *
 * Returns a 2-bit mask: bit 0 reflects bit 31 of val, bit 1 reflects
 * bit 63.
 */
static uint64_t do_vmskltz_w(int64_t val)
{
    /* Keep only bit 31 of each 32-bit field. */
    uint64_t bits = val & 0x8000000080000000ULL;

    /* Move the low field's sign bit next to the high field's. */
    bits |= bits << 31;

    return bits >> 62;
}
834*5c23704eSSong Gao 
HELPER(vmskltz_w)835*5c23704eSSong Gao void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc)
836*5c23704eSSong Gao {
837*5c23704eSSong Gao     int i;
838*5c23704eSSong Gao     uint16_t temp = 0;
839*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
840*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
841*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
842*5c23704eSSong Gao 
843*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
844*5c23704eSSong Gao         temp = 0;
845*5c23704eSSong Gao         temp = do_vmskltz_w(Vj->D(2 * i));
846*5c23704eSSong Gao         temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2);
847*5c23704eSSong Gao         Vd->D(2 * i) = temp;
848*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;
849*5c23704eSSong Gao     }
850*5c23704eSSong Gao }
851*5c23704eSSong Gao 
/* Return 1 when val is negative (bit 63 set), 0 otherwise. */
static uint64_t do_vmskltz_d(int64_t val)
{
    return val < 0 ? 1 : 0;
}
HELPER(vmskltz_d)856*5c23704eSSong Gao void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc)
857*5c23704eSSong Gao {
858*5c23704eSSong Gao     int i;
859*5c23704eSSong Gao     uint16_t temp = 0;
860*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
861*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
862*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
863*5c23704eSSong Gao 
864*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
865*5c23704eSSong Gao         temp = 0;
866*5c23704eSSong Gao         temp = do_vmskltz_d(Vj->D(2 * i));
867*5c23704eSSong Gao         temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1);
868*5c23704eSSong Gao         Vd->D(2 * i) = temp;
869*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;
870*5c23704eSSong Gao     }
871*5c23704eSSong Gao }
872*5c23704eSSong Gao 
HELPER(vmskgez_b)873*5c23704eSSong Gao void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc)
874*5c23704eSSong Gao {
875*5c23704eSSong Gao     int i;
876*5c23704eSSong Gao     uint16_t temp = 0;
877*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
878*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
879*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
880*5c23704eSSong Gao 
881*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
882*5c23704eSSong Gao         temp = 0;
883*5c23704eSSong Gao         temp =  do_vmskltz_b(Vj->D(2 * i));
884*5c23704eSSong Gao         temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8);
885*5c23704eSSong Gao         Vd->D(2 * i) = (uint16_t)(~temp);
886*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;
887*5c23704eSSong Gao     }
888*5c23704eSSong Gao }
889*5c23704eSSong Gao 
/*
 * Build an 8-bit mask of the zero bytes in a: bit k of the result is
 * set when byte k of a (counting from the least significant byte) is 0.
 */
static uint64_t do_vmskez_b(uint64_t a)
{
    const uint64_t low7 = 0x7f7f7f7f7f7f7f7fULL;
    /*
     * Adding 0x7f to the low seven bits of a byte carries into bit 7
     * exactly when those bits are non-zero; OR-ing in the byte itself
     * covers the case where only bit 7 was set.  No carry crosses a
     * byte boundary because each per-byte sum is at most 0xfe.
     */
    uint64_t nonzero = ((a & low7) + low7) | a;
    /* Bit 7 of each byte is now set only for bytes that were zero. */
    uint64_t bits = ~(nonzero | low7);

    bits |= bits << 7;
    bits |= bits << 14;
    bits |= bits << 28;

    return bits >> 56;
}
899*5c23704eSSong Gao 
HELPER(vmsknz_b)900*5c23704eSSong Gao void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc)
901*5c23704eSSong Gao {
902*5c23704eSSong Gao     int i;
903*5c23704eSSong Gao     uint16_t temp = 0;
904*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
905*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
906*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
907*5c23704eSSong Gao 
908*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
909*5c23704eSSong Gao         temp = 0;
910*5c23704eSSong Gao         temp = do_vmskez_b(Vj->D(2 * i));
911*5c23704eSSong Gao         temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8);
912*5c23704eSSong Gao         Vd->D(2 * i) = (uint16_t)(~temp);
913*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;
914*5c23704eSSong Gao     }
915*5c23704eSSong Gao }
916*5c23704eSSong Gao 
HELPER(vnori_b)917*5c23704eSSong Gao void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc)
918*5c23704eSSong Gao {
919*5c23704eSSong Gao     int i;
920*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
921*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
922*5c23704eSSong Gao 
923*5c23704eSSong Gao     for (i = 0; i < simd_oprsz(desc); i++) {
924*5c23704eSSong Gao         Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm);
925*5c23704eSSong Gao     }
926*5c23704eSSong Gao }
927*5c23704eSSong Gao 
/*
 * VSLLWIL: convert the lower-half source elements of each 16-byte chunk
 * to the destination element type and shift them left by imm % BIT.
 *
 * With per_chunk = LSX_LEN / BIT, chunk c produces destination elements
 * c * per_chunk + n from source elements 2 * c * per_chunk + n.  The
 * result is built in a zero-initialised local VReg and copied to *Vd
 * after the loops, so the source is fully read before the destination
 * is written.
 */
#define VSLLWIL(NAME, BIT, E1, E2)                                             \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)             \
{                                                                              \
    VReg *Vd = (VReg *)vd;                                                     \
    VReg *Vj = (VReg *)vj;                                                     \
    VReg result = {};                                                          \
    typedef __typeof(result.E1(0)) TD;                                         \
    int oprsz = simd_oprsz(desc);                                              \
    int per_chunk = LSX_LEN / BIT;                                             \
    int chunk, n;                                                              \
                                                                               \
    for (chunk = 0; chunk < oprsz / 16; chunk++) {                             \
        for (n = 0; n < per_chunk; n++) {                                      \
            int dst = n + per_chunk * chunk;                                   \
            int src = n + per_chunk * 2 * chunk;                               \
                                                                               \
            result.E1(dst) = (TD)Vj->E2(src) << (imm % BIT);                   \
        }                                                                      \
    }                                                                          \
    *Vd = result;                                                              \
}
946*5c23704eSSong Gao 
947*5c23704eSSong Gao 
HELPER(vextl_q_d)948*5c23704eSSong Gao void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc)
949*5c23704eSSong Gao {
950*5c23704eSSong Gao     int i;
951*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
952*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
953*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
954*5c23704eSSong Gao 
955*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
956*5c23704eSSong Gao         Vd->Q(i) = int128_makes64(Vj->D(2 * i));
957*5c23704eSSong Gao     }
958*5c23704eSSong Gao }
959*5c23704eSSong Gao 
HELPER(vextl_qu_du)960*5c23704eSSong Gao void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc)
961*5c23704eSSong Gao {
962*5c23704eSSong Gao     int i;
963*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
964*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
965*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
966*5c23704eSSong Gao 
967*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
968*5c23704eSSong Gao         Vd->Q(i) = int128_make64(Vj->UD(2 * i));
969*5c23704eSSong Gao     }
970*5c23704eSSong Gao }
971*5c23704eSSong Gao 
/*
 * VSLLWIL instantiations.  Arguments are: helper name, BIT (shift
 * amounts are reduced modulo BIT and LSX_LEN / BIT elements are produced
 * per 16-byte chunk), destination accessor E1 and source accessor E2.
 */
VSLLWIL(vsllwil_h_b, 16, H, B)
VSLLWIL(vsllwil_w_h, 32, W, H)
VSLLWIL(vsllwil_d_w, 64, D, W)
VSLLWIL(vsllwil_hu_bu, 16, UH, UB)
VSLLWIL(vsllwil_wu_hu, 32, UW, UH)
VSLLWIL(vsllwil_du_wu, 64, UD, UW)
978*5c23704eSSong Gao 
/*
 * do_vsrlr(E, T): define do_vsrlr_<E>(), a right shift of an unsigned
 * value of type T that rounds by adding back the last bit shifted out.
 * A shift amount of 0 returns the input unchanged.
 */
#define do_vsrlr(E, T)                                  \
static T do_vsrlr_ ##E(T s1, int sh)                    \
{                                                       \
    T round_bit;                                        \
                                                        \
    if (sh == 0) {                                      \
        return s1;                                      \
    }                                                   \
    round_bit = (s1 >> (sh - 1)) & 0x1;                 \
    return (s1 >> sh) + round_bit;                      \
}

do_vsrlr(B, uint8_t)
do_vsrlr(H, uint16_t)
do_vsrlr(W, uint32_t)
do_vsrlr(D, uint64_t)
993*5c23704eSSong Gao 
994*5c23704eSSong Gao #define VSRLR(NAME, BIT, T, E)                                  \
995*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
996*5c23704eSSong Gao {                                                               \
997*5c23704eSSong Gao     int i;                                                      \
998*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                      \
999*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                      \
1000*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                      \
1001*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                               \
1002*5c23704eSSong Gao                                                                 \
1003*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                   \
1004*5c23704eSSong Gao         Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
1005*5c23704eSSong Gao     }                                                           \
1006*5c23704eSSong Gao }
1007*5c23704eSSong Gao 
1008*5c23704eSSong Gao VSRLR(vsrlr_b, 8,  uint8_t, B)
1009*5c23704eSSong Gao VSRLR(vsrlr_h, 16, uint16_t, H)
1010*5c23704eSSong Gao VSRLR(vsrlr_w, 32, uint32_t, W)
1011*5c23704eSSong Gao VSRLR(vsrlr_d, 64, uint64_t, D)
1012*5c23704eSSong Gao 
1013*5c23704eSSong Gao #define VSRLRI(NAME, BIT, E)                                       \
1014*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1015*5c23704eSSong Gao {                                                                  \
1016*5c23704eSSong Gao     int i;                                                         \
1017*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                         \
1018*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                         \
1019*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
1020*5c23704eSSong Gao                                                                    \
1021*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
1022*5c23704eSSong Gao         Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm);                  \
1023*5c23704eSSong Gao     }                                                              \
1024*5c23704eSSong Gao }
1025*5c23704eSSong Gao 
1026*5c23704eSSong Gao VSRLRI(vsrlri_b, 8, B)
1027*5c23704eSSong Gao VSRLRI(vsrlri_h, 16, H)
1028*5c23704eSSong Gao VSRLRI(vsrlri_w, 32, W)
1029*5c23704eSSong Gao VSRLRI(vsrlri_d, 64, D)
1030*5c23704eSSong Gao 
/*
 * do_vsrar(E, T): define do_vsrar_<E>(), a right shift of a signed
 * value of type T that rounds by adding back the last bit shifted out.
 * A shift amount of 0 returns the input unchanged.
 *
 * NOTE: relies on >> of a negative signed value being an arithmetic
 * shift, which the C standard leaves implementation-defined.
 */
#define do_vsrar(E, T)                                  \
static T do_vsrar_ ##E(T s1, int sh)                    \
{                                                       \
    T round_bit;                                        \
                                                        \
    if (sh == 0) {                                      \
        return s1;                                      \
    }                                                   \
    round_bit = (s1 >> (sh - 1)) & 0x1;                 \
    return (s1 >> sh) + round_bit;                      \
}

do_vsrar(B, int8_t)
do_vsrar(H, int16_t)
do_vsrar(W, int32_t)
do_vsrar(D, int64_t)
1045*5c23704eSSong Gao 
1046*5c23704eSSong Gao #define VSRAR(NAME, BIT, T, E)                                  \
1047*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
1048*5c23704eSSong Gao {                                                               \
1049*5c23704eSSong Gao     int i;                                                      \
1050*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                      \
1051*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                      \
1052*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                      \
1053*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                               \
1054*5c23704eSSong Gao                                                                 \
1055*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                   \
1056*5c23704eSSong Gao         Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
1057*5c23704eSSong Gao     }                                                           \
1058*5c23704eSSong Gao }
1059*5c23704eSSong Gao 
1060*5c23704eSSong Gao VSRAR(vsrar_b, 8,  uint8_t, B)
1061*5c23704eSSong Gao VSRAR(vsrar_h, 16, uint16_t, H)
1062*5c23704eSSong Gao VSRAR(vsrar_w, 32, uint32_t, W)
1063*5c23704eSSong Gao VSRAR(vsrar_d, 64, uint64_t, D)
1064*5c23704eSSong Gao 
1065*5c23704eSSong Gao #define VSRARI(NAME, BIT, E)                                       \
1066*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1067*5c23704eSSong Gao {                                                                  \
1068*5c23704eSSong Gao     int i;                                                         \
1069*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                         \
1070*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                         \
1071*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
1072*5c23704eSSong Gao                                                                    \
1073*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
1074*5c23704eSSong Gao         Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm);                  \
1075*5c23704eSSong Gao     }                                                              \
1076*5c23704eSSong Gao }
1077*5c23704eSSong Gao 
1078*5c23704eSSong Gao VSRARI(vsrari_b, 8, B)
1079*5c23704eSSong Gao VSRARI(vsrari_h, 16, H)
1080*5c23704eSSong Gao VSRARI(vsrari_w, 32, W)
1081*5c23704eSSong Gao VSRARI(vsrari_d, 64, D)
1082*5c23704eSSong Gao 
1083*5c23704eSSong Gao #define VSRLN(NAME, BIT, E1, E2)                                          \
1084*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
1085*5c23704eSSong Gao {                                                                         \
1086*5c23704eSSong Gao     int i, j, ofs;                                                        \
1087*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                \
1088*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                \
1089*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                \
1090*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                         \
1091*5c23704eSSong Gao                                                                           \
1092*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                  \
1093*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
1094*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                       \
1095*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i),        \
1096*5c23704eSSong Gao                                               Vk->E2(j + ofs * i) % BIT); \
1097*5c23704eSSong Gao         }                                                                 \
1098*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                             \
1099*5c23704eSSong Gao     }                                                                     \
1100*5c23704eSSong Gao }
1101*5c23704eSSong Gao 
1102*5c23704eSSong Gao VSRLN(vsrln_b_h, 16, B, UH)
1103*5c23704eSSong Gao VSRLN(vsrln_h_w, 32, H, UW)
1104*5c23704eSSong Gao VSRLN(vsrln_w_d, 64, W, UD)
1105*5c23704eSSong Gao 
1106*5c23704eSSong Gao #define VSRAN(NAME, BIT, E1, E2, E3)                                      \
1107*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
1108*5c23704eSSong Gao {                                                                         \
1109*5c23704eSSong Gao     int i, j, ofs;                                                        \
1110*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                \
1111*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                \
1112*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                \
1113*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                         \
1114*5c23704eSSong Gao                                                                           \
1115*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                  \
1116*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
1117*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                       \
1118*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i),        \
1119*5c23704eSSong Gao                                               Vk->E3(j + ofs * i) % BIT); \
1120*5c23704eSSong Gao         }                                                                 \
1121*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                             \
1122*5c23704eSSong Gao     }                                                                     \
1123*5c23704eSSong Gao }
1124*5c23704eSSong Gao 
1125*5c23704eSSong Gao VSRAN(vsran_b_h, 16, B, H, UH)
1126*5c23704eSSong Gao VSRAN(vsran_h_w, 32, H, W, UW)
1127*5c23704eSSong Gao VSRAN(vsran_w_d, 64, W, D, UD)
1128*5c23704eSSong Gao 
/*
 * VSRLNI: right shift by immediate with narrowing, built on R_SHIFT.
 *
 *   NAME - helper name handed to HELPER()
 *   BIT  - source element width in bits
 *   E1   - destination (narrow) element accessor
 *   E2   - source element accessor
 *
 * For each 16-byte chunk of the operation size, LSX_LEN / BIT source
 * elements are read from Vj and from the incoming value of Vd and shifted
 * right by imm.  The Vj results fill the first LSX_LEN / BIT E1 slots of
 * the chunk and the Vd results fill the following LSX_LEN / BIT slots.
 * The output is assembled in a zero-initialised temporary and copied to
 * *Vd at the end because Vd is read as an input as well.
 */
#define VSRLNI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    const int per_lane = LSX_LEN / BIT; \
    const int lanes = simd_oprsz(desc) / 16; \
    int lane, n; \
    \
    for (lane = 0; lane < lanes; lane++) { \
        const int src = per_lane * lane; \
        const int lo = per_lane * 2 * lane; \
        const int hi = per_lane * (2 * lane + 1); \
        \
        for (n = 0; n < per_lane; n++) { \
            result.E1(lo + n) = R_SHIFT(Vj->E2(src + n), imm); \
            result.E1(hi + n) = R_SHIFT(Vd->E2(src + n), imm); \
        } \
    } \
    *Vd = result; \
}
1148*5c23704eSSong Gao 
1149*5c23704eSSong Gao void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1150*5c23704eSSong Gao {
1151*5c23704eSSong Gao     int i;
1152*5c23704eSSong Gao     VReg temp = {};
1153*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1154*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1155*5c23704eSSong Gao 
1156*5c23704eSSong Gao     for (i = 0; i < 2; i++) {
1157*5c23704eSSong Gao         temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128));
1158*5c23704eSSong Gao         temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128));
1159*5c23704eSSong Gao     }
1160*5c23704eSSong Gao     *Vd = temp;
1161*5c23704eSSong Gao }
1162*5c23704eSSong Gao 
1163*5c23704eSSong Gao VSRLNI(vsrlni_b_h, 16, B, UH)
1164*5c23704eSSong Gao VSRLNI(vsrlni_h_w, 32, H, UW)
1165*5c23704eSSong Gao VSRLNI(vsrlni_w_d, 64, W, UD)
1166*5c23704eSSong Gao 
/*
 * VSRANI: right shift by immediate with narrowing, built on R_SHIFT.
 * The instantiations in this file pass signed source accessors.
 *
 *   NAME - helper name handed to HELPER()
 *   BIT  - source element width in bits
 *   E1   - destination (narrow) element accessor
 *   E2   - source element accessor
 *
 * For each 16-byte chunk of the operation size, LSX_LEN / BIT source
 * elements are read from Vj and from the incoming value of Vd and shifted
 * right by imm.  The Vj results fill the first LSX_LEN / BIT E1 slots of
 * the chunk and the Vd results fill the following LSX_LEN / BIT slots.
 * The output is assembled in a zero-initialised temporary and copied to
 * *Vd at the end because Vd is read as an input as well.
 */
#define VSRANI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    const int per_lane = LSX_LEN / BIT; \
    const int lanes = simd_oprsz(desc) / 16; \
    int lane, n; \
    \
    for (lane = 0; lane < lanes; lane++) { \
        const int src = per_lane * lane; \
        const int lo = per_lane * 2 * lane; \
        const int hi = per_lane * (2 * lane + 1); \
        \
        for (n = 0; n < per_lane; n++) { \
            result.E1(lo + n) = R_SHIFT(Vj->E2(src + n), imm); \
            result.E1(hi + n) = R_SHIFT(Vd->E2(src + n), imm); \
        } \
    } \
    *Vd = result; \
}
1186*5c23704eSSong Gao 
HELPER(vsrani_d_q)1187*5c23704eSSong Gao void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1188*5c23704eSSong Gao {
1189*5c23704eSSong Gao     int i;
1190*5c23704eSSong Gao     VReg temp = {};
1191*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1192*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1193*5c23704eSSong Gao 
1194*5c23704eSSong Gao     for (i = 0; i < 2; i++) {
1195*5c23704eSSong Gao         temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128));
1196*5c23704eSSong Gao         temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128));
1197*5c23704eSSong Gao     }
1198*5c23704eSSong Gao     *Vd = temp;
1199*5c23704eSSong Gao }
1200*5c23704eSSong Gao 
1201*5c23704eSSong Gao VSRANI(vsrani_b_h, 16, B, H)
1202*5c23704eSSong Gao VSRANI(vsrani_h_w, 32, H, W)
1203*5c23704eSSong Gao VSRANI(vsrani_w_d, 64, W, D)
1204*5c23704eSSong Gao 
1205*5c23704eSSong Gao #define VSRLRN(NAME, BIT, E1, E2, E3)                                      \
1206*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)             \
1207*5c23704eSSong Gao {                                                                          \
1208*5c23704eSSong Gao     int i, j, ofs;                                                         \
1209*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                 \
1210*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                 \
1211*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                 \
1212*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                          \
1213*5c23704eSSong Gao                                                                            \
1214*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                   \
1215*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                     \
1216*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                        \
1217*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i),  \
1218*5c23704eSSong Gao                                                Vk->E3(j + ofs * i) % BIT); \
1219*5c23704eSSong Gao         }                                                                  \
1220*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                              \
1221*5c23704eSSong Gao     }                                                                      \
1222*5c23704eSSong Gao }
1223*5c23704eSSong Gao 
1224*5c23704eSSong Gao VSRLRN(vsrlrn_b_h, 16, B, H, UH)
1225*5c23704eSSong Gao VSRLRN(vsrlrn_h_w, 32, H, W, UW)
1226*5c23704eSSong Gao VSRLRN(vsrlrn_w_d, 64, W, D, UD)
1227*5c23704eSSong Gao 
1228*5c23704eSSong Gao #define VSRARN(NAME, BIT, E1, E2, E3)                                       \
1229*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
1230*5c23704eSSong Gao {                                                                           \
1231*5c23704eSSong Gao     int i, j, ofs;                                                          \
1232*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                  \
1233*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                  \
1234*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                  \
1235*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                           \
1236*5c23704eSSong Gao                                                                             \
1237*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                    \
1238*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
1239*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                         \
1240*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i),  \
1241*5c23704eSSong Gao                                                 Vk->E3(j + ofs * i) % BIT); \
1242*5c23704eSSong Gao         }                                                                   \
1243*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
1244*5c23704eSSong Gao     }                                                                       \
1245*5c23704eSSong Gao }
1246*5c23704eSSong Gao 
1247*5c23704eSSong Gao VSRARN(vsrarn_b_h, 16, B, H, UH)
1248*5c23704eSSong Gao VSRARN(vsrarn_h_w, 32, H, W, UW)
1249*5c23704eSSong Gao VSRARN(vsrarn_w_d, 64, W, D, UD)
1250*5c23704eSSong Gao 
/*
 * VSRLRNI: narrowing right shift by immediate that delegates each element
 * to do_vsrlr_<E2>.
 *
 *   NAME - helper name handed to HELPER()
 *   BIT  - source element width in bits
 *   E1   - destination (narrow) element accessor
 *   E2   - source element accessor; also selects do_vsrlr_<E2>
 *
 * For each 16-byte chunk of the operation size, LSX_LEN / BIT source
 * elements are read from Vj and from the incoming value of Vd.  The Vj
 * results fill the first LSX_LEN / BIT E1 slots of the chunk and the Vd
 * results fill the following LSX_LEN / BIT slots.  The output is assembled
 * in a zero-initialised temporary and copied to *Vd at the end because Vd
 * is read as an input as well.
 */
#define VSRLRNI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    const int per_lane = LSX_LEN / BIT; \
    const int lanes = simd_oprsz(desc) / 16; \
    int lane, n; \
    \
    for (lane = 0; lane < lanes; lane++) { \
        const int src = per_lane * lane; \
        const int lo = per_lane * 2 * lane; \
        const int hi = per_lane * (2 * lane + 1); \
        \
        for (n = 0; n < per_lane; n++) { \
            result.E1(lo + n) = do_vsrlr_ ## E2(Vj->E2(src + n), imm); \
            result.E1(hi + n) = do_vsrlr_ ## E2(Vd->E2(src + n), imm); \
        } \
    } \
    *Vd = result; \
}
1270*5c23704eSSong Gao 
HELPER(vsrlrni_d_q)1271*5c23704eSSong Gao void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1272*5c23704eSSong Gao {
1273*5c23704eSSong Gao     int i;
1274*5c23704eSSong Gao     VReg temp = {};
1275*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1276*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1277*5c23704eSSong Gao     Int128 r[4];
1278*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
1279*5c23704eSSong Gao 
1280*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1281*5c23704eSSong Gao         if (imm == 0) {
1282*5c23704eSSong Gao             temp.D(2 * i) = int128_getlo(Vj->Q(i));
1283*5c23704eSSong Gao             temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1284*5c23704eSSong Gao         } else {
1285*5c23704eSSong Gao             r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)),
1286*5c23704eSSong Gao                                   int128_one());
1287*5c23704eSSong Gao             r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)),
1288*5c23704eSSong Gao                                       int128_one());
1289*5c23704eSSong Gao             temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i),
1290*5c23704eSSong Gao                                                     imm), r[2 * i]));
1291*5c23704eSSong Gao             temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i),
1292*5c23704eSSong Gao                                                         imm), r[ 2 * i + 1]));
1293*5c23704eSSong Gao         }
1294*5c23704eSSong Gao     }
1295*5c23704eSSong Gao     *Vd = temp;
1296*5c23704eSSong Gao }
1297*5c23704eSSong Gao 
1298*5c23704eSSong Gao VSRLRNI(vsrlrni_b_h, 16, B, H)
1299*5c23704eSSong Gao VSRLRNI(vsrlrni_h_w, 32, H, W)
1300*5c23704eSSong Gao VSRLRNI(vsrlrni_w_d, 64, W, D)
1301*5c23704eSSong Gao 
/*
 * VSRARNI: narrowing right shift by immediate that delegates each element
 * to do_vsrar_<E2>.
 *
 *   NAME - helper name handed to HELPER()
 *   BIT  - source element width in bits
 *   E1   - destination (narrow) element accessor
 *   E2   - source element accessor; also selects do_vsrar_<E2>
 *
 * For each 16-byte chunk of the operation size, LSX_LEN / BIT source
 * elements are read from Vj and from the incoming value of Vd.  The Vj
 * results fill the first LSX_LEN / BIT E1 slots of the chunk and the Vd
 * results fill the following LSX_LEN / BIT slots.  The output is assembled
 * in a zero-initialised temporary and copied to *Vd at the end because Vd
 * is read as an input as well.
 */
#define VSRARNI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    const int per_lane = LSX_LEN / BIT; \
    const int lanes = simd_oprsz(desc) / 16; \
    int lane, n; \
    \
    for (lane = 0; lane < lanes; lane++) { \
        const int src = per_lane * lane; \
        const int lo = per_lane * 2 * lane; \
        const int hi = per_lane * (2 * lane + 1); \
        \
        for (n = 0; n < per_lane; n++) { \
            result.E1(lo + n) = do_vsrar_ ## E2(Vj->E2(src + n), imm); \
            result.E1(hi + n) = do_vsrar_ ## E2(Vd->E2(src + n), imm); \
        } \
    } \
    *Vd = result; \
}
1321*5c23704eSSong Gao 
HELPER(vsrarni_d_q)1322*5c23704eSSong Gao void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1323*5c23704eSSong Gao {
1324*5c23704eSSong Gao     int i;
1325*5c23704eSSong Gao     VReg temp = {};
1326*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1327*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1328*5c23704eSSong Gao     Int128 r[4];
1329*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
1330*5c23704eSSong Gao 
1331*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1332*5c23704eSSong Gao         if (imm == 0) {
1333*5c23704eSSong Gao             temp.D(2 * i) = int128_getlo(Vj->Q(i));
1334*5c23704eSSong Gao             temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1335*5c23704eSSong Gao         } else {
1336*5c23704eSSong Gao             r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)),
1337*5c23704eSSong Gao                                   int128_one());
1338*5c23704eSSong Gao             r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)),
1339*5c23704eSSong Gao                                       int128_one());
1340*5c23704eSSong Gao             temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i),
1341*5c23704eSSong Gao                                                     imm), r[2 * i]));
1342*5c23704eSSong Gao             temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i),
1343*5c23704eSSong Gao                                                         imm), r[2 * i + 1]));
1344*5c23704eSSong Gao         }
1345*5c23704eSSong Gao     }
1346*5c23704eSSong Gao     *Vd = temp;
1347*5c23704eSSong Gao }
1348*5c23704eSSong Gao 
1349*5c23704eSSong Gao VSRARNI(vsrarni_b_h, 16, B, H)
1350*5c23704eSSong Gao VSRARNI(vsrarni_h_w, 32, H, W)
1351*5c23704eSSong Gao VSRARNI(vsrarni_w_d, 64, W, D)
1352*5c23704eSSong Gao 
/*
 * SSRLNS: generate do_ssrlns_<NAME>, a logical right shift followed by
 * saturation to an upper bound.
 *
 *   NAME - suffix of the generated function
 *   T1   - unsigned type the shift is performed in (also the return type)
 *   T2   - signed type of the incoming element
 *   T3   - type holding the saturation bound
 *
 * The element is reinterpreted as T1, shifted right by sa, and clamped to
 * (1 << sh) - 1.  A shift count of 0 simply yields the reinterpreted value.
 */
#define SSRLNS(NAME, T1, T2, T3) \
static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \
{ \
    const T3 limit = (1ull << sh) - 1; \
    const T1 shifted = (T1)e2 >> sa; \
    \
    return shifted > limit ? limit : shifted; \
}

SSRLNS(B, uint16_t, int16_t, uint8_t)
SSRLNS(H, uint32_t, int32_t, uint16_t)
SSRLNS(W, uint64_t, int64_t, uint32_t)
1374*5c23704eSSong Gao 
1375*5c23704eSSong Gao #define VSSRLN(NAME, BIT, E1, E2, E3)                                       \
1376*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
1377*5c23704eSSong Gao {                                                                           \
1378*5c23704eSSong Gao     int i, j, ofs;                                                          \
1379*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                  \
1380*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                  \
1381*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                  \
1382*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                           \
1383*5c23704eSSong Gao                                                                             \
1384*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                    \
1385*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
1386*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                         \
1387*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \
1388*5c23704eSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
1389*5c23704eSSong Gao                                                 BIT / 2 - 1);               \
1390*5c23704eSSong Gao         }                                                                   \
1391*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
1392*5c23704eSSong Gao     }                                                                       \
1393*5c23704eSSong Gao }
1394*5c23704eSSong Gao 
1395*5c23704eSSong Gao VSSRLN(vssrln_b_h, 16, B, H, UH)
1396*5c23704eSSong Gao VSSRLN(vssrln_h_w, 32, H, W, UW)
1397*5c23704eSSong Gao VSSRLN(vssrln_w_d, 64, W, D, UD)
1398*5c23704eSSong Gao 
/*
 * SSRANS: generate do_ssrans_<E>, an arithmetic right shift followed by
 * signed saturation.
 *
 *   E  - suffix of the generated function
 *   T1 - signed type of the incoming element (also the return type)
 *   T2 - signed type holding the saturation bound
 *
 * The element is shifted right by sa and clamped to the range
 * [-(1 << sh), (1 << sh) - 1].  A shift count of 0 yields the element
 * unchanged.
 *
 * The lower bound is written as ~hi_limit rather than -(hi_limit + 1):
 * both denote the same value, but hi_limit + 1 is evaluated in int and
 * overflows when T2 is int32_t and sh is 31, which is undefined behaviour
 * unless the compiler is told to wrap signed arithmetic.
 */
#define SSRANS(E, T1, T2) \
static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \
{ \
    const T2 hi_limit = (1ll << sh) - 1; \
    const T1 shft_res = e2 >> sa; \
    \
    if (shft_res > hi_limit) { \
        return hi_limit; \
    } \
    if (shft_res < ~hi_limit) { \
        return ~hi_limit; \
    } \
    return shft_res; \
}

SSRANS(B, int16_t, int8_t)
SSRANS(H, int32_t, int16_t)
SSRANS(W, int64_t, int32_t)
1422*5c23704eSSong Gao 
1423*5c23704eSSong Gao #define VSSRAN(NAME, BIT, E1, E2, E3)                                       \
1424*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
1425*5c23704eSSong Gao {                                                                           \
1426*5c23704eSSong Gao     int i, j, ofs;                                                          \
1427*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                  \
1428*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                  \
1429*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                  \
1430*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                           \
1431*5c23704eSSong Gao                                                                             \
1432*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                    \
1433*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
1434*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                         \
1435*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \
1436*5c23704eSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
1437*5c23704eSSong Gao                                                 BIT / 2 - 1);               \
1438*5c23704eSSong Gao         }                                                                   \
1439*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
1440*5c23704eSSong Gao     }                                                                       \
1441*5c23704eSSong Gao }
1442*5c23704eSSong Gao 
1443*5c23704eSSong Gao VSSRAN(vssran_b_h, 16, B, H, UH)
1444*5c23704eSSong Gao VSSRAN(vssran_h_w, 32, H, W, UW)
1445*5c23704eSSong Gao VSSRAN(vssran_w_d, 64, W, D, UD)
1446*5c23704eSSong Gao 
/*
 * SSRLNU: generate do_ssrlnu_<E>, a logical right shift followed by
 * unsigned saturation.
 *
 *   E  - suffix of the generated function
 *   T1 - unsigned type the shift is performed in (also the return type)
 *   T2 - unsigned type holding the saturation bound
 *   T3 - signed type of the incoming element
 *
 * The element is reinterpreted as T1, shifted right by sa, and clamped to
 * (1 << sh) - 1.  A shift count of 0 simply yields the reinterpreted value.
 */
#define SSRLNU(E, T1, T2, T3) \
static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \
{ \
    const T2 limit = (1ull << sh) - 1; \
    const T1 shifted = (T1)e2 >> sa; \
    \
    return shifted > limit ? limit : shifted; \
}

SSRLNU(B, uint16_t, uint8_t,  int16_t)
SSRLNU(H, uint32_t, uint16_t, int32_t)
SSRLNU(W, uint64_t, uint32_t, int64_t)
1468*5c23704eSSong Gao 
1469*5c23704eSSong Gao #define VSSRLNU(NAME, BIT, E1, E2, E3)                                      \
1470*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
1471*5c23704eSSong Gao {                                                                           \
1472*5c23704eSSong Gao     int i, j, ofs;                                                          \
1473*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                  \
1474*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                  \
1475*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                  \
1476*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                           \
1477*5c23704eSSong Gao                                                                             \
1478*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                    \
1479*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
1480*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                         \
1481*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \
1482*5c23704eSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
1483*5c23704eSSong Gao                                                 BIT / 2);                   \
1484*5c23704eSSong Gao         }                                                                   \
1485*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
1486*5c23704eSSong Gao     }                                                                       \
1487*5c23704eSSong Gao }
1488*5c23704eSSong Gao 
1489*5c23704eSSong Gao VSSRLNU(vssrln_bu_h, 16, B, H, UH)
1490*5c23704eSSong Gao VSSRLNU(vssrln_hu_w, 32, H, W, UW)
1491*5c23704eSSong Gao VSSRLNU(vssrln_wu_d, 64, W, D, UD)
1492*5c23704eSSong Gao 
/*
 * Define do_ssranu_<E>(e2, sa, sh).
 *
 * A negative e2 produces 0.  Any other e2 is shifted right by sa and
 * the result is clamped to (1 << sh) - 1, the largest unsigned value
 * that fits in sh bits.
 *
 * T1: unsigned return type (same width as the source element).
 * T2: unsigned type used to hold the sh-bit upper bound.
 * T3: signed type of the source element e2.
 */
#define SSRANU(E, T1, T2, T3)                    \
static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \
{                                                \
    const T2 limit = (1ull << sh) - 1;           \
    T1 val;                                      \
                                                 \
    if (e2 < 0) {                                \
        val = 0;                                 \
    } else {                                     \
        val = e2 >> sa;                          \
    }                                            \
    return val > limit ? limit : val;            \
}

SSRANU(B, uint16_t, uint8_t, int16_t)
SSRANU(H, uint32_t, uint16_t, int32_t)
SSRANU(W, uint64_t, uint32_t, int64_t)
1517*5c23704eSSong Gao 
1518*5c23704eSSong Gao #define VSSRANU(NAME, BIT, E1, E2, E3)                                         \
1519*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1520*5c23704eSSong Gao {                                                                              \
1521*5c23704eSSong Gao     int i, j, ofs;                                                             \
1522*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                     \
1523*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                     \
1524*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                     \
1525*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                              \
1526*5c23704eSSong Gao                                                                                \
1527*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                       \
1528*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
1529*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                            \
1530*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i),    \
1531*5c23704eSSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
1532*5c23704eSSong Gao                                                     BIT / 2);                  \
1533*5c23704eSSong Gao         }                                                                      \
1534*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
1535*5c23704eSSong Gao     }                                                                          \
1536*5c23704eSSong Gao }
1537*5c23704eSSong Gao 
1538*5c23704eSSong Gao VSSRANU(vssran_bu_h, 16, B, H, UH)
1539*5c23704eSSong Gao VSSRANU(vssran_hu_w, 32, H, W, UW)
1540*5c23704eSSong Gao VSSRANU(vssran_wu_d, 64, W, D, UD)
1541*5c23704eSSong Gao 
/*
 * Define the immediate-shift gvec helper NAME.
 *
 * For every 16-byte chunk, each E2 element of Vj and of the old Vd is
 * passed to do_ssrlns_<E1>() with shift count imm and a result width
 * of BIT / 2 - 1.  Results derived from Vj land in the first
 * LSX_LEN / BIT E1 slots of the chunk, results derived from Vd in the
 * following LSX_LEN / BIT slots.  The output is assembled in a
 * zero-initialised temporary and copied to Vd at the end, so Vd is
 * only read before it is overwritten.
 */
#define VSSRLNI(NAME, BIT, E1, E2)                                     \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)     \
{                                                                      \
    VReg *Vd = (VReg *)vd;                                             \
    VReg *Vj = (VReg *)vj;                                             \
    VReg out = {};                                                     \
    int chunks = simd_oprsz(desc) / 16;                                \
    int per_chunk = LSX_LEN / BIT;                                     \
    int c, n;                                                          \
                                                                       \
    for (c = 0; c < chunks; c++) {                                     \
        int src = per_chunk * c;                                       \
        int dst_j = per_chunk * 2 * c;                                 \
        int dst_d = per_chunk * (2 * c + 1);                           \
                                                                       \
        for (n = 0; n < per_chunk; n++) {                              \
            out.E1(dst_j + n) = do_ssrlns_ ## E1(Vj->E2(src + n),      \
                                                 imm, BIT / 2 - 1);    \
            out.E1(dst_d + n) = do_ssrlns_ ## E1(Vd->E2(src + n),      \
                                                 imm, BIT / 2 - 1);    \
        }                                                              \
    }                                                                  \
    *Vd = out;                                                         \
}
1562*5c23704eSSong Gao 
1563*5c23704eSSong Gao static void do_vssrlni_q(VReg *Vd, VReg *Vj,
1564*5c23704eSSong Gao                          uint64_t imm, int idx, Int128 mask)
1565*5c23704eSSong Gao {
1566*5c23704eSSong Gao     Int128 shft_res1, shft_res2;
1567*5c23704eSSong Gao 
1568*5c23704eSSong Gao     if (imm == 0) {
1569*5c23704eSSong Gao         shft_res1 = Vj->Q(idx);
1570*5c23704eSSong Gao         shft_res2 = Vd->Q(idx);
1571*5c23704eSSong Gao     } else {
1572*5c23704eSSong Gao         shft_res1 = int128_urshift(Vj->Q(idx), imm);
1573*5c23704eSSong Gao         shft_res2 = int128_urshift(Vd->Q(idx), imm);
1574*5c23704eSSong Gao     }
1575*5c23704eSSong Gao 
1576*5c23704eSSong Gao     if (int128_ult(mask, shft_res1)) {
1577*5c23704eSSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
1578*5c23704eSSong Gao     }else {
1579*5c23704eSSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
1580*5c23704eSSong Gao     }
1581*5c23704eSSong Gao 
1582*5c23704eSSong Gao     if (int128_ult(mask, shft_res2)) {
1583*5c23704eSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
1584*5c23704eSSong Gao     }else {
1585*5c23704eSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
1586*5c23704eSSong Gao     }
1587*5c23704eSSong Gao }
1588*5c23704eSSong Gao 
HELPER(vssrlni_d_q)1589*5c23704eSSong Gao void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1590*5c23704eSSong Gao {
1591*5c23704eSSong Gao     int i;
1592*5c23704eSSong Gao     Int128 mask;
1593*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1594*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1595*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
1596*5c23704eSSong Gao 
1597*5c23704eSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
1598*5c23704eSSong Gao 
1599*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1600*5c23704eSSong Gao         do_vssrlni_q(Vd, Vj, imm, i, mask);
1601*5c23704eSSong Gao     }
1602*5c23704eSSong Gao }
1603*5c23704eSSong Gao 
1604*5c23704eSSong Gao VSSRLNI(vssrlni_b_h, 16, B, H)
1605*5c23704eSSong Gao VSSRLNI(vssrlni_h_w, 32, H, W)
1606*5c23704eSSong Gao VSSRLNI(vssrlni_w_d, 64, W, D)
1607*5c23704eSSong Gao 
/*
 * Define the immediate-shift gvec helper NAME.
 *
 * For every 16-byte chunk, each E2 element of Vj and of the old Vd is
 * passed to do_ssrans_<E1>() with shift count imm and a result width
 * of BIT / 2 - 1.  Results derived from Vj land in the first
 * LSX_LEN / BIT E1 slots of the chunk, results derived from Vd in the
 * following LSX_LEN / BIT slots.  The output is assembled in a
 * zero-initialised temporary and copied to Vd at the end, so Vd is
 * only read before it is overwritten.
 */
#define VSSRANI(NAME, BIT, E1, E2)                                     \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)     \
{                                                                      \
    VReg *Vd = (VReg *)vd;                                             \
    VReg *Vj = (VReg *)vj;                                             \
    VReg out = {};                                                     \
    int chunks = simd_oprsz(desc) / 16;                                \
    int per_chunk = LSX_LEN / BIT;                                     \
    int c, n;                                                          \
                                                                       \
    for (c = 0; c < chunks; c++) {                                     \
        int src = per_chunk * c;                                       \
        int dst_j = per_chunk * 2 * c;                                 \
        int dst_d = per_chunk * (2 * c + 1);                           \
                                                                       \
        for (n = 0; n < per_chunk; n++) {                              \
            out.E1(dst_j + n) = do_ssrans_ ## E1(Vj->E2(src + n),      \
                                                 imm, BIT / 2 - 1);    \
            out.E1(dst_d + n) = do_ssrans_ ## E1(Vd->E2(src + n),      \
                                                 imm, BIT / 2 - 1);    \
        }                                                              \
    }                                                                  \
    *Vd = out;                                                         \
}
1628*5c23704eSSong Gao 
/*
 * Handle 128-bit element idx for vssrani_d_q.
 *
 * Vj->Q(idx) and Vd->Q(idx) are each shifted right (signed) by imm.
 * A result above mask is replaced by mask, a result below -min is
 * replaced by min, and the low 64 bits of what remains are written to
 * Vd->D(2 * idx) (from Vj) and Vd->D(2 * idx + 1) (from Vd).
 */
static void do_vssrani_d_q(VReg *Vd, VReg *Vj,
                           uint64_t imm, int idx, Int128 mask, Int128 min)
{
    Int128 lower = int128_neg(min);
    Int128 res_j = Vj->Q(idx);
    Int128 res_d = Vd->Q(idx);

    if (imm != 0) {
        res_j = int128_rshift(res_j, imm);
        res_d = int128_rshift(res_d, imm);
    }

    if (int128_gt(res_j, mask)) {
        res_j = mask;
    } else if (int128_lt(res_j, lower)) {
        res_j = min;
    }

    if (int128_gt(res_d, mask)) {
        res_d = mask;
    } else if (int128_lt(res_d, lower)) {
        res_d = min;
    }

    /* Only the low halves are stored; both inputs were read above. */
    Vd->D(idx * 2) = int128_getlo(res_j);
    Vd->D(idx * 2 + 1) = int128_getlo(res_d);
}
1658*5c23704eSSong Gao 
HELPER(vssrani_d_q)1659*5c23704eSSong Gao void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1660*5c23704eSSong Gao {
1661*5c23704eSSong Gao     int i;
1662*5c23704eSSong Gao     Int128 mask, min;
1663*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1664*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1665*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
1666*5c23704eSSong Gao 
1667*5c23704eSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
1668*5c23704eSSong Gao     min  = int128_lshift(int128_one(), 63);
1669*5c23704eSSong Gao 
1670*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1671*5c23704eSSong Gao         do_vssrani_d_q(Vd, Vj, imm, i, mask, min);
1672*5c23704eSSong Gao     }
1673*5c23704eSSong Gao }
1674*5c23704eSSong Gao 
1675*5c23704eSSong Gao 
1676*5c23704eSSong Gao VSSRANI(vssrani_b_h, 16, B, H)
1677*5c23704eSSong Gao VSSRANI(vssrani_h_w, 32, H, W)
1678*5c23704eSSong Gao VSSRANI(vssrani_w_d, 64, W, D)
1679*5c23704eSSong Gao 
/*
 * Define the immediate-shift gvec helper NAME.
 *
 * For every 16-byte chunk, each E2 element of Vj and of the old Vd is
 * passed to do_ssrlnu_<E1>() with shift count imm and a result width
 * of BIT / 2.  Results derived from Vj land in the first LSX_LEN / BIT
 * E1 slots of the chunk, results derived from Vd in the following
 * LSX_LEN / BIT slots.  The output is assembled in a zero-initialised
 * temporary and copied to Vd at the end, so Vd is only read before it
 * is overwritten.
 */
#define VSSRLNUI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)     \
{                                                                      \
    VReg *Vd = (VReg *)vd;                                             \
    VReg *Vj = (VReg *)vj;                                             \
    VReg out = {};                                                     \
    int chunks = simd_oprsz(desc) / 16;                                \
    int per_chunk = LSX_LEN / BIT;                                     \
    int c, n;                                                          \
                                                                       \
    for (c = 0; c < chunks; c++) {                                     \
        int src = per_chunk * c;                                       \
        int dst_j = per_chunk * 2 * c;                                 \
        int dst_d = per_chunk * (2 * c + 1);                           \
                                                                       \
        for (n = 0; n < per_chunk; n++) {                              \
            out.E1(dst_j + n) = do_ssrlnu_ ## E1(Vj->E2(src + n),      \
                                                 imm, BIT / 2);        \
            out.E1(dst_d + n) = do_ssrlnu_ ## E1(Vd->E2(src + n),      \
                                                 imm, BIT / 2);        \
        }                                                              \
    }                                                                  \
    *Vd = out;                                                         \
}
1700*5c23704eSSong Gao 
HELPER(vssrlni_du_q)1701*5c23704eSSong Gao void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1702*5c23704eSSong Gao {
1703*5c23704eSSong Gao     int i;
1704*5c23704eSSong Gao     Int128 mask;
1705*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1706*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1707*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
1708*5c23704eSSong Gao 
1709*5c23704eSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
1710*5c23704eSSong Gao 
1711*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1712*5c23704eSSong Gao         do_vssrlni_q(Vd, Vj, imm, i, mask);
1713*5c23704eSSong Gao     }
1714*5c23704eSSong Gao }
1715*5c23704eSSong Gao 
1716*5c23704eSSong Gao VSSRLNUI(vssrlni_bu_h, 16, B, H)
1717*5c23704eSSong Gao VSSRLNUI(vssrlni_hu_w, 32, H, W)
1718*5c23704eSSong Gao VSSRLNUI(vssrlni_wu_d, 64, W, D)
1719*5c23704eSSong Gao 
/*
 * Define the immediate-shift gvec helper NAME.
 *
 * For every 16-byte chunk, each E2 element of Vj and of the old Vd is
 * passed to do_ssranu_<E1>() with shift count imm and a result width
 * of BIT / 2.  Results derived from Vj land in the first LSX_LEN / BIT
 * E1 slots of the chunk, results derived from Vd in the following
 * LSX_LEN / BIT slots.  The output is assembled in a zero-initialised
 * temporary and copied to Vd at the end, so Vd is only read before it
 * is overwritten.
 */
#define VSSRANUI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)     \
{                                                                      \
    VReg *Vd = (VReg *)vd;                                             \
    VReg *Vj = (VReg *)vj;                                             \
    VReg out = {};                                                     \
    int chunks = simd_oprsz(desc) / 16;                                \
    int per_chunk = LSX_LEN / BIT;                                     \
    int c, n;                                                          \
                                                                       \
    for (c = 0; c < chunks; c++) {                                     \
        int src = per_chunk * c;                                       \
        int dst_j = per_chunk * 2 * c;                                 \
        int dst_d = per_chunk * (2 * c + 1);                           \
                                                                       \
        for (n = 0; n < per_chunk; n++) {                              \
            out.E1(dst_j + n) = do_ssranu_ ## E1(Vj->E2(src + n),      \
                                                 imm, BIT / 2);        \
            out.E1(dst_d + n) = do_ssranu_ ## E1(Vd->E2(src + n),      \
                                                 imm, BIT / 2);        \
        }                                                              \
    }                                                                  \
    *Vd = out;                                                         \
}
1740*5c23704eSSong Gao 
/*
 * Handle 128-bit element idx for vssrani_du_q.
 *
 * A negative source (Vj->Q(idx) or Vd->Q(idx)) yields zero; any other
 * source is shifted right (signed) by imm.  Each result is then clamped
 * to mask using an unsigned comparison, and its low 64 bits are written
 * to Vd->D(2 * idx) (from Vj) and Vd->D(2 * idx + 1) (from Vd).
 */
static void do_vssrani_du_q(VReg *Vd, VReg *Vj,
                            uint64_t imm, int idx, Int128 mask)
{
    Int128 src_j = Vj->Q(idx);
    Int128 src_d = Vd->Q(idx);
    Int128 res_j, res_d;

    if (int128_lt(src_j, int128_zero())) {
        res_j = int128_zero();
    } else if (imm == 0) {
        res_j = src_j;
    } else {
        res_j = int128_rshift(src_j, imm);
    }

    if (int128_lt(src_d, int128_zero())) {
        res_d = int128_zero();
    } else if (imm == 0) {
        res_d = src_d;
    } else {
        res_d = int128_rshift(src_d, imm);
    }

    if (int128_ult(mask, res_j)) {
        res_j = mask;
    }
    if (int128_ult(mask, res_d)) {
        res_d = mask;
    }

    /* Both inputs were captured above, so Vd can be overwritten now. */
    Vd->D(idx * 2) = int128_getlo(res_j);
    Vd->D(idx * 2 + 1) = int128_getlo(res_d);
}
1774*5c23704eSSong Gao 
HELPER(vssrani_du_q)1775*5c23704eSSong Gao void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1776*5c23704eSSong Gao {
1777*5c23704eSSong Gao     int i;
1778*5c23704eSSong Gao     Int128 mask;
1779*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
1780*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
1781*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
1782*5c23704eSSong Gao 
1783*5c23704eSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
1784*5c23704eSSong Gao 
1785*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1786*5c23704eSSong Gao         do_vssrani_du_q(Vd, Vj, imm, i, mask);
1787*5c23704eSSong Gao     }
1788*5c23704eSSong Gao }
1789*5c23704eSSong Gao 
1790*5c23704eSSong Gao VSSRANUI(vssrani_bu_h, 16, B, H)
1791*5c23704eSSong Gao VSSRANUI(vssrani_hu_w, 32, H, W)
1792*5c23704eSSong Gao VSSRANUI(vssrani_wu_d, 64, W, D)
1793*5c23704eSSong Gao 
/*
 * Define do_ssrlrns_<E1>(e2, sa, sh).
 *
 * The value returned by do_vsrlr_<E2>(e2, sa) is clamped to
 * (1 << sh) - 1.
 *
 * T1: unsigned type of the intermediate result and of the return value.
 * T2: type of the e2 parameter.
 * T3: not referenced by the generated function.
 */
#define SSRLRNS(E1, E2, T1, T2, T3)                \
static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \
{                                                  \
    const T1 limit = (1ull << sh) - 1;             \
    T1 val = do_vsrlr_ ## E2(e2, sa);              \
                                                   \
    return val > limit ? limit : val;              \
}

SSRLRNS(B, H, uint16_t, int16_t, uint8_t)
SSRLRNS(H, W, uint32_t, int32_t, uint16_t)
SSRLRNS(W, D, uint64_t, int64_t, uint32_t)
1812*5c23704eSSong Gao 
1813*5c23704eSSong Gao #define VSSRLRN(NAME, BIT, E1, E2, E3)                                         \
1814*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1815*5c23704eSSong Gao {                                                                              \
1816*5c23704eSSong Gao     int i, j, ofs;                                                             \
1817*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                     \
1818*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                     \
1819*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                     \
1820*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                              \
1821*5c23704eSSong Gao                                                                                \
1822*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                       \
1823*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
1824*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                            \
1825*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i),   \
1826*5c23704eSSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
1827*5c23704eSSong Gao                                                     BIT / 2 - 1);              \
1828*5c23704eSSong Gao         }                                                                      \
1829*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
1830*5c23704eSSong Gao     }                                                                          \
1831*5c23704eSSong Gao }
1832*5c23704eSSong Gao 
1833*5c23704eSSong Gao VSSRLRN(vssrlrn_b_h, 16, B, H, UH)
1834*5c23704eSSong Gao VSSRLRN(vssrlrn_h_w, 32, H, W, UW)
1835*5c23704eSSong Gao VSSRLRN(vssrlrn_w_d, 64, W, D, UD)
1836*5c23704eSSong Gao 
/*
 * Define do_ssrarns_<E1>(e2, sa, sh).
 *
 * The value returned by do_vsrar_<E2>(e2, sa) is clamped to the signed
 * range [-(1 << sh), (1 << sh) - 1].
 *
 * T1: signed type of the source element and of the return value.
 * T2: signed type holding the upper bound (1 << sh) - 1.
 *
 * The lower bound is written as ~mask rather than -(mask + 1).  The two
 * are equal, but "mask + 1" overflows a signed int when T2 is int32_t
 * and mask is INT32_MAX (the W instantiation), which is only defined
 * when the compiler is told to wrap on signed overflow.
 */
#define SSRARNS(E1, E2, T1, T2)                    \
static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \
{                                                  \
    T1 shft_res;                                   \
    T2 mask;                                       \
                                                   \
    shft_res = do_vsrar_ ## E2(e2, sa);            \
    mask = (1ll << sh) - 1;                        \
    if (shft_res > mask) {                         \
        return mask;                               \
    } else if (shft_res < ~mask) {                 \
        return ~mask;                              \
    } else {                                       \
        return shft_res;                           \
    }                                              \
}

SSRARNS(B, H, int16_t, int8_t)
SSRARNS(H, W, int32_t, int16_t)
SSRARNS(W, D, int64_t, int32_t)
1857*5c23704eSSong Gao 
1858*5c23704eSSong Gao #define VSSRARN(NAME, BIT, E1, E2, E3)                                         \
1859*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1860*5c23704eSSong Gao {                                                                              \
1861*5c23704eSSong Gao     int i, j, ofs;                                                             \
1862*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                     \
1863*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                     \
1864*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                     \
1865*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                              \
1866*5c23704eSSong Gao                                                                                \
1867*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                       \
1868*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
1869*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                            \
1870*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i),   \
1871*5c23704eSSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
1872*5c23704eSSong Gao                                                     BIT/ 2 - 1);               \
1873*5c23704eSSong Gao         }                                                                      \
1874*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
1875*5c23704eSSong Gao     }                                                                          \
1876*5c23704eSSong Gao }
1877*5c23704eSSong Gao 
1878*5c23704eSSong Gao VSSRARN(vssrarn_b_h, 16, B, H, UH)
1879*5c23704eSSong Gao VSSRARN(vssrarn_h_w, 32, H, W, UW)
1880*5c23704eSSong Gao VSSRARN(vssrarn_w_d, 64, W, D, UD)
1881*5c23704eSSong Gao 
/*
 * Define do_ssrlrnu_<E1>(e2, sa, sh).
 *
 * The value returned by do_vsrlr_<E2>(e2, sa) is clamped to
 * (1 << sh) - 1.
 *
 * T1: unsigned type of the intermediate result and of the return value.
 * T2: unsigned type holding the upper bound.
 * T3: type of the e2 parameter.
 */
#define SSRLRNU(E1, E2, T1, T2, T3)                \
static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh) \
{                                                  \
    const T2 limit = (1ull << sh) - 1;             \
    T1 val = do_vsrlr_ ## E2(e2, sa);              \
                                                   \
    return val > limit ? limit : val;              \
}

SSRLRNU(B, H, uint16_t, uint8_t, int16_t)
SSRLRNU(H, W, uint32_t, uint16_t, int32_t)
SSRLRNU(W, D, uint64_t, uint32_t, int64_t)
1901*5c23704eSSong Gao 
1902*5c23704eSSong Gao #define VSSRLRNU(NAME, BIT, E1, E2, E3)                                        \
1903*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1904*5c23704eSSong Gao {                                                                              \
1905*5c23704eSSong Gao     int i, j, ofs;                                                             \
1906*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                     \
1907*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                     \
1908*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                     \
1909*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                              \
1910*5c23704eSSong Gao                                                                                \
1911*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                       \
1912*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
1913*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                            \
1914*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i),   \
1915*5c23704eSSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
1916*5c23704eSSong Gao                                                     BIT / 2);                  \
1917*5c23704eSSong Gao         }                                                                      \
1918*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
1919*5c23704eSSong Gao     }                                                                          \
1920*5c23704eSSong Gao }
1921*5c23704eSSong Gao 
1922*5c23704eSSong Gao VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH)
1923*5c23704eSSong Gao VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW)
1924*5c23704eSSong Gao VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD)
1925*5c23704eSSong Gao 
/*
 * Define do_ssrarnu_<E1>(e2, sa, sh).
 *
 * A negative e2 produces 0.  Otherwise the value returned by
 * do_vsrar_<E2>(e2, sa) is clamped to (1 << sh) - 1.
 *
 * T1: unsigned type of the intermediate result and of the return value.
 * T2: unsigned type holding the upper bound.
 * T3: signed type of the e2 parameter.
 */
#define SSRARNU(E1, E2, T1, T2, T3)                \
static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh) \
{                                                  \
    const T2 limit = (1ull << sh) - 1;             \
    T1 val = 0;                                    \
                                                   \
    if (e2 >= 0) {                                 \
        val = do_vsrar_ ## E2(e2, sa);             \
    }                                              \
    return val > limit ? limit : val;              \
}

SSRARNU(B, H, uint16_t, uint8_t, int16_t)
SSRARNU(H, W, uint32_t, uint16_t, int32_t)
SSRARNU(W, D, uint64_t, uint32_t, int64_t)
1948*5c23704eSSong Gao 
1949*5c23704eSSong Gao #define VSSRARNU(NAME, BIT, E1, E2, E3)                                      \
1950*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void  *vk, uint32_t desc)              \
1951*5c23704eSSong Gao {                                                                            \
1952*5c23704eSSong Gao     int i, j, ofs;                                                           \
1953*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                   \
1954*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                   \
1955*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                   \
1956*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                            \
1957*5c23704eSSong Gao                                                                              \
1958*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                                     \
1959*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                       \
1960*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                          \
1961*5c23704eSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \
1962*5c23704eSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,   \
1963*5c23704eSSong Gao                                                 BIT / 2);                    \
1964*5c23704eSSong Gao         }                                                                    \
1965*5c23704eSSong Gao         Vd->D(2 * i + 1) = 0;                                                \
1966*5c23704eSSong Gao     }                                                                        \
1967*5c23704eSSong Gao }
1968*5c23704eSSong Gao 
1969*5c23704eSSong Gao VSSRARNU(vssrarn_bu_h, 16, B, H, UH)
1970*5c23704eSSong Gao VSSRARNU(vssrarn_hu_w, 32, H, W, UW)
1971*5c23704eSSong Gao VSSRARNU(vssrarn_wu_d, 64, W, D, UD)
1972*5c23704eSSong Gao 
/*
 * VSSRLRNI - generate an immediate-shift narrowing helper built on
 * do_ssrlrns_<E1>().
 *
 * For each 16-byte lane, the BIT-wide elements of Vj are narrowed into the
 * low half of the result lane and the BIT-wide elements of the old Vd into
 * the high half.  Every element is shifted by imm and saturated with a
 * width argument of BIT / 2 - 1.  The result is assembled in a zeroed
 * temporary and copied to Vd at the end, because Vd is also a source.
 */
#define VSSRLRNI(NAME, BIT, E1, E2)                                       \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)        \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    const int per_lane = LSX_LEN / BIT;                                   \
    const int lanes = simd_oprsz(desc) / 16;                              \
    int lane, n;                                                          \
                                                                          \
    for (lane = 0; lane < lanes; lane++) {                                \
        const int src = per_lane * lane;                                  \
        const int dst_lo = per_lane * 2 * lane;                           \
        const int dst_hi = per_lane * (2 * lane + 1);                     \
                                                                          \
        for (n = 0; n < per_lane; n++) {                                  \
            result.E1(dst_lo + n) = do_ssrlrns_ ## E1(Vj->E2(src + n),    \
                                                      imm, BIT / 2 - 1);  \
            result.E1(dst_hi + n) = do_ssrlrns_ ## E1(Vd->E2(src + n),    \
                                                      imm, BIT / 2 - 1);  \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
1993*5c23704eSSong Gao 
1994*5c23704eSSong Gao static void do_vssrlrni_q(VReg *Vd, VReg * Vj,
1995*5c23704eSSong Gao                           uint64_t imm, int idx, Int128 mask)
1996*5c23704eSSong Gao {
1997*5c23704eSSong Gao     Int128 shft_res1, shft_res2, r1, r2;
1998*5c23704eSSong Gao     if (imm == 0) {
1999*5c23704eSSong Gao         shft_res1 = Vj->Q(idx);
2000*5c23704eSSong Gao         shft_res2 = Vd->Q(idx);
2001*5c23704eSSong Gao     } else {
2002*5c23704eSSong Gao         r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one());
2003*5c23704eSSong Gao         r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one());
2004*5c23704eSSong Gao         shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1));
2005*5c23704eSSong Gao         shft_res2 = (int128_add(int128_urshift(Vd->Q(idx), imm), r2));
2006*5c23704eSSong Gao     }
2007*5c23704eSSong Gao 
2008*5c23704eSSong Gao     if (int128_ult(mask, shft_res1)) {
2009*5c23704eSSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
2010*5c23704eSSong Gao     }else {
2011*5c23704eSSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
2012*5c23704eSSong Gao     }
2013*5c23704eSSong Gao 
2014*5c23704eSSong Gao     if (int128_ult(mask, shft_res2)) {
2015*5c23704eSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
2016*5c23704eSSong Gao     }else {
2017*5c23704eSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
2018*5c23704eSSong Gao     }
2019*5c23704eSSong Gao }
2020*5c23704eSSong Gao 
HELPER(vssrlrni_d_q)2021*5c23704eSSong Gao void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2022*5c23704eSSong Gao {
2023*5c23704eSSong Gao     int i;
2024*5c23704eSSong Gao     Int128 mask;
2025*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2026*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2027*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2028*5c23704eSSong Gao 
2029*5c23704eSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
2030*5c23704eSSong Gao 
2031*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2032*5c23704eSSong Gao         do_vssrlrni_q(Vd, Vj, imm, i, mask);
2033*5c23704eSSong Gao     }
2034*5c23704eSSong Gao }
2035*5c23704eSSong Gao 
/*
 * Instantiate VSSRLRNI for 16-, 32- and 64-bit source elements; the
 * results are stored through the B, H and W accessors respectively.
 */
VSSRLRNI(vssrlrni_b_h, 16, B, H)
VSSRLRNI(vssrlrni_h_w, 32, H, W)
VSSRLRNI(vssrlrni_w_d, 64, W, D)
2039*5c23704eSSong Gao 
/*
 * VSSRARNI - generate an immediate-shift narrowing helper built on
 * do_ssrarns_<E1>().
 *
 * For each 16-byte lane, the BIT-wide elements of Vj are narrowed into the
 * low half of the result lane and the BIT-wide elements of the old Vd into
 * the high half.  Every element is shifted by imm and saturated with a
 * width argument of BIT / 2 - 1.  The result is assembled in a zeroed
 * temporary and copied to Vd at the end, because Vd is also a source.
 */
#define VSSRARNI(NAME, BIT, E1, E2)                                       \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)        \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    const int per_lane = LSX_LEN / BIT;                                   \
    const int lanes = simd_oprsz(desc) / 16;                              \
    int lane, n;                                                          \
                                                                          \
    for (lane = 0; lane < lanes; lane++) {                                \
        const int src = per_lane * lane;                                  \
        const int dst_lo = per_lane * 2 * lane;                           \
        const int dst_hi = per_lane * (2 * lane + 1);                     \
                                                                          \
        for (n = 0; n < per_lane; n++) {                                  \
            result.E1(dst_lo + n) = do_ssrarns_ ## E1(Vj->E2(src + n),    \
                                                      imm, BIT / 2 - 1);  \
            result.E1(dst_hi + n) = do_ssrarns_ ## E1(Vd->E2(src + n),    \
                                                      imm, BIT / 2 - 1);  \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
2060*5c23704eSSong Gao 
/*
 * Narrow one 128-bit lane with a rounding arithmetic right shift and
 * signed saturation.
 *
 *   Vd:    destination register; its 128-bit element idx is also the
 *          second source
 *   Vj:    first source register
 *   imm:   shift amount; 0 means no shift and no rounding
 *   idx:   index of the 128-bit lane to process
 *   mask1: upper bound of the result
 *   mask2: magnitude of the lower bound; a result below -mask2 stores the
 *          low 64 bits of mask2
 *
 * The value derived from Vj goes to doubleword 2 * idx of Vd and the one
 * derived from the old Vd goes to doubleword 2 * idx + 1.  Both sources
 * are read before Vd is written.
 */
static void do_vssrarni_d_q(VReg *Vd, VReg *Vj,
                            uint64_t imm, int idx, Int128 mask1, Int128 mask2)
{
    Int128 res[2];
    const Int128 lower = int128_neg(mask2);
    int k;

    res[0] = Vj->Q(idx);
    res[1] = Vd->Q(idx);

    if (imm != 0) {
        for (k = 0; k < 2; k++) {
            /* Rounding term: the last bit shifted out. */
            Int128 rnd = int128_and(int128_rshift(res[k], imm - 1),
                                    int128_one());

            res[k] = int128_add(int128_rshift(res[k], imm), rnd);
        }
    }

    for (k = 0; k < 2; k++) {
        Int128 out = res[k];

        if (int128_gt(res[k], mask1)) {
            out = mask1;
        } else if (int128_lt(res[k], lower)) {
            out = mask2;
        }
        Vd->D(idx * 2 + k) = int128_getlo(out);
    }
}
2091*5c23704eSSong Gao 
HELPER(vssrarni_d_q)2092*5c23704eSSong Gao void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2093*5c23704eSSong Gao {
2094*5c23704eSSong Gao     int i;
2095*5c23704eSSong Gao     Int128 mask1, mask2;
2096*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2097*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2098*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2099*5c23704eSSong Gao 
2100*5c23704eSSong Gao     mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one());
2101*5c23704eSSong Gao     mask2  = int128_lshift(int128_one(), 63);
2102*5c23704eSSong Gao 
2103*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2104*5c23704eSSong Gao         do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2);
2105*5c23704eSSong Gao     }
2106*5c23704eSSong Gao }
2107*5c23704eSSong Gao 
/*
 * Instantiate VSSRARNI for 16-, 32- and 64-bit source elements; the
 * results are stored through the B, H and W accessors respectively.
 */
VSSRARNI(vssrarni_b_h, 16, B, H)
VSSRARNI(vssrarni_h_w, 32, H, W)
VSSRARNI(vssrarni_w_d, 64, W, D)
2111*5c23704eSSong Gao 
/*
 * VSSRLRNUI - generate an immediate-shift narrowing helper built on
 * do_ssrlrnu_<E1>().
 *
 * For each 16-byte lane, the BIT-wide elements of Vj are narrowed into the
 * low half of the result lane and the BIT-wide elements of the old Vd into
 * the high half.  Every element is shifted by imm and saturated with a
 * width argument of BIT / 2.  The result is assembled in a zeroed
 * temporary and copied to Vd at the end, because Vd is also a source.
 */
#define VSSRLRNUI(NAME, BIT, E1, E2)                                      \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)        \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    const int per_lane = LSX_LEN / BIT;                                   \
    const int lanes = simd_oprsz(desc) / 16;                              \
    int lane, n;                                                          \
                                                                          \
    for (lane = 0; lane < lanes; lane++) {                                \
        const int src = per_lane * lane;                                  \
        const int dst_lo = per_lane * 2 * lane;                           \
        const int dst_hi = per_lane * (2 * lane + 1);                     \
                                                                          \
        for (n = 0; n < per_lane; n++) {                                  \
            result.E1(dst_lo + n) = do_ssrlrnu_ ## E1(Vj->E2(src + n),    \
                                                      imm, BIT / 2);      \
            result.E1(dst_hi + n) = do_ssrlrnu_ ## E1(Vd->E2(src + n),    \
                                                      imm, BIT / 2);      \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
2132*5c23704eSSong Gao 
HELPER(vssrlrni_du_q)2133*5c23704eSSong Gao void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2134*5c23704eSSong Gao {
2135*5c23704eSSong Gao     int i;
2136*5c23704eSSong Gao     Int128 mask;
2137*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2138*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2139*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2140*5c23704eSSong Gao 
2141*5c23704eSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
2142*5c23704eSSong Gao 
2143*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2144*5c23704eSSong Gao         do_vssrlrni_q(Vd, Vj, imm, i, mask);
2145*5c23704eSSong Gao     }
2146*5c23704eSSong Gao }
2147*5c23704eSSong Gao 
/*
 * Instantiate VSSRLRNUI for 16-, 32- and 64-bit source elements; the
 * results are stored through the B, H and W accessors respectively.
 */
VSSRLRNUI(vssrlrni_bu_h, 16, B, H)
VSSRLRNUI(vssrlrni_hu_w, 32, H, W)
VSSRLRNUI(vssrlrni_wu_d, 64, W, D)
2151*5c23704eSSong Gao 
/*
 * VSSRARNUI - generate an immediate-shift narrowing helper built on
 * do_ssrarnu_<E1>().
 *
 * For each 16-byte lane, the BIT-wide elements of Vj are narrowed into the
 * low half of the result lane and the BIT-wide elements of the old Vd into
 * the high half.  Every element is shifted by imm and saturated with a
 * width argument of BIT / 2.  The result is assembled in a zeroed
 * temporary and copied to Vd at the end, because Vd is also a source.
 */
#define VSSRARNUI(NAME, BIT, E1, E2)                                      \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)        \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    const int per_lane = LSX_LEN / BIT;                                   \
    const int lanes = simd_oprsz(desc) / 16;                              \
    int lane, n;                                                          \
                                                                          \
    for (lane = 0; lane < lanes; lane++) {                                \
        const int src = per_lane * lane;                                  \
        const int dst_lo = per_lane * 2 * lane;                           \
        const int dst_hi = per_lane * (2 * lane + 1);                     \
                                                                          \
        for (n = 0; n < per_lane; n++) {                                  \
            result.E1(dst_lo + n) = do_ssrarnu_ ## E1(Vj->E2(src + n),    \
                                                      imm, BIT / 2);      \
            result.E1(dst_hi + n) = do_ssrarnu_ ## E1(Vd->E2(src + n),    \
                                                      imm, BIT / 2);      \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
2172*5c23704eSSong Gao 
/*
 * Compute one unsigned-saturated 64-bit result from a signed 128-bit
 * source: a negative source yields 0; otherwise the source is shifted
 * right arithmetically by imm (adding the last bit shifted out as the
 * rounding term when imm != 0) and clamped to max.
 */
static uint64_t do_vssrarni_du_q_elem(Int128 src, uint64_t imm, Int128 max)
{
    Int128 res = src;

    if (int128_lt(src, int128_zero())) {
        return 0;
    }

    if (imm != 0) {
        Int128 rnd = int128_and(int128_rshift(src, imm - 1), int128_one());

        res = int128_add(int128_rshift(src, imm), rnd);
    }

    /* res is never negative here, so only the upper bound can be hit. */
    return int128_getlo(int128_gt(res, max) ? max : res);
}

/*
 * Narrow one 128-bit lane with a rounding arithmetic right shift and
 * unsigned saturation.
 *
 *   Vd:    destination register; its 128-bit element idx is also the
 *          second source
 *   Vj:    first source register
 *   imm:   shift amount; 0 means no shift and no rounding
 *   idx:   index of the 128-bit lane to process
 *   mask1: upper bound of the result
 *   mask2: unused; kept so existing callers need no change.  The lower
 *          bound check it used to feed could never trigger, because
 *          negative sources are forced to zero and non-negative sources
 *          stay non-negative after the shift.
 *
 * The value derived from Vj goes to doubleword 2 * idx of Vd and the one
 * derived from the old Vd goes to doubleword 2 * idx + 1.
 */
static void do_vssrarni_du_q(VReg *Vd, VReg *Vj,
                             uint64_t imm, int idx, Int128 mask1, Int128 mask2)
{
    /* Read both sources before writing: Vd is an input as well. */
    Int128 src_j = Vj->Q(idx);
    Int128 src_d = Vd->Q(idx);

    Vd->D(idx * 2) = do_vssrarni_du_q_elem(src_j, imm, mask1);
    Vd->D(idx * 2 + 1) = do_vssrarni_du_q_elem(src_d, imm, mask1);
}
2211*5c23704eSSong Gao 
HELPER(vssrarni_du_q)2212*5c23704eSSong Gao void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2213*5c23704eSSong Gao {
2214*5c23704eSSong Gao     int i;
2215*5c23704eSSong Gao     Int128 mask1, mask2;
2216*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2217*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2218*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2219*5c23704eSSong Gao 
2220*5c23704eSSong Gao     mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one());
2221*5c23704eSSong Gao     mask2  = int128_lshift(int128_one(), 64);
2222*5c23704eSSong Gao 
2223*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2224*5c23704eSSong Gao         do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2);
2225*5c23704eSSong Gao     }
2226*5c23704eSSong Gao }
2227*5c23704eSSong Gao 
/*
 * Instantiate VSSRARNUI for 16-, 32- and 64-bit source elements; the
 * results are stored through the B, H and W accessors respectively.
 */
VSSRARNUI(vssrarni_bu_h, 16, B, H)
VSSRARNUI(vssrarni_hu_w, 32, H, W)
VSSRARNUI(vssrarni_wu_d, 64, W, D)
2231*5c23704eSSong Gao 
2232*5c23704eSSong Gao #define DO_2OP(NAME, BIT, E, DO_OP)                  \
2233*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
2234*5c23704eSSong Gao {                                                    \
2235*5c23704eSSong Gao     int i;                                           \
2236*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                           \
2237*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                           \
2238*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                    \
2239*5c23704eSSong Gao                                                      \
2240*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++)          \
2241*5c23704eSSong Gao     {                                                \
2242*5c23704eSSong Gao         Vd->E(i) = DO_OP(Vj->E(i));                  \
2243*5c23704eSSong Gao     }                                                \
2244*5c23704eSSong Gao }
2245*5c23704eSSong Gao 
2246*5c23704eSSong Gao DO_2OP(vclo_b, 8, UB, DO_CLO_B)
2247*5c23704eSSong Gao DO_2OP(vclo_h, 16, UH, DO_CLO_H)
2248*5c23704eSSong Gao DO_2OP(vclo_w, 32, UW, DO_CLO_W)
2249*5c23704eSSong Gao DO_2OP(vclo_d, 64, UD, DO_CLO_D)
2250*5c23704eSSong Gao DO_2OP(vclz_b, 8, UB, DO_CLZ_B)
2251*5c23704eSSong Gao DO_2OP(vclz_h, 16, UH, DO_CLZ_H)
2252*5c23704eSSong Gao DO_2OP(vclz_w, 32, UW, DO_CLZ_W)
2253*5c23704eSSong Gao DO_2OP(vclz_d, 64, UD, DO_CLZ_D)
2254*5c23704eSSong Gao 
2255*5c23704eSSong Gao #define VPCNT(NAME, BIT, E, FN)                      \
2256*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
2257*5c23704eSSong Gao {                                                    \
2258*5c23704eSSong Gao     int i;                                           \
2259*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                           \
2260*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                           \
2261*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                    \
2262*5c23704eSSong Gao                                                      \
2263*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++)          \
2264*5c23704eSSong Gao     {                                                \
2265*5c23704eSSong Gao         Vd->E(i) = FN(Vj->E(i));                     \
2266*5c23704eSSong Gao     }                                                \
2267*5c23704eSSong Gao }
2268*5c23704eSSong Gao 
2269*5c23704eSSong Gao VPCNT(vpcnt_b, 8, UB, ctpop8)
2270*5c23704eSSong Gao VPCNT(vpcnt_h, 16, UH, ctpop16)
2271*5c23704eSSong Gao VPCNT(vpcnt_w, 32, UW, ctpop32)
2272*5c23704eSSong Gao VPCNT(vpcnt_d, 64, UD, ctpop64)
2273*5c23704eSSong Gao 
2274*5c23704eSSong Gao #define DO_BIT(NAME, BIT, E, DO_OP)                            \
2275*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2276*5c23704eSSong Gao {                                                              \
2277*5c23704eSSong Gao     int i;                                                     \
2278*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                     \
2279*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                     \
2280*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                     \
2281*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                              \
2282*5c23704eSSong Gao                                                                \
2283*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
2284*5c23704eSSong Gao         Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT);              \
2285*5c23704eSSong Gao     }                                                          \
2286*5c23704eSSong Gao }
2287*5c23704eSSong Gao 
2288*5c23704eSSong Gao DO_BIT(vbitclr_b, 8, UB, DO_BITCLR)
2289*5c23704eSSong Gao DO_BIT(vbitclr_h, 16, UH, DO_BITCLR)
2290*5c23704eSSong Gao DO_BIT(vbitclr_w, 32, UW, DO_BITCLR)
2291*5c23704eSSong Gao DO_BIT(vbitclr_d, 64, UD, DO_BITCLR)
2292*5c23704eSSong Gao DO_BIT(vbitset_b, 8, UB, DO_BITSET)
2293*5c23704eSSong Gao DO_BIT(vbitset_h, 16, UH, DO_BITSET)
2294*5c23704eSSong Gao DO_BIT(vbitset_w, 32, UW, DO_BITSET)
2295*5c23704eSSong Gao DO_BIT(vbitset_d, 64, UD, DO_BITSET)
2296*5c23704eSSong Gao DO_BIT(vbitrev_b, 8, UB, DO_BITREV)
2297*5c23704eSSong Gao DO_BIT(vbitrev_h, 16, UH, DO_BITREV)
2298*5c23704eSSong Gao DO_BIT(vbitrev_w, 32, UW, DO_BITREV)
2299*5c23704eSSong Gao DO_BIT(vbitrev_d, 64, UD, DO_BITREV)
2300*5c23704eSSong Gao 
2301*5c23704eSSong Gao #define DO_BITI(NAME, BIT, E, DO_OP)                               \
2302*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2303*5c23704eSSong Gao {                                                                  \
2304*5c23704eSSong Gao     int i;                                                         \
2305*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                         \
2306*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                         \
2307*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
2308*5c23704eSSong Gao                                                                    \
2309*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
2310*5c23704eSSong Gao         Vd->E(i) = DO_OP(Vj->E(i), imm);                           \
2311*5c23704eSSong Gao     }                                                              \
2312*5c23704eSSong Gao }
2313*5c23704eSSong Gao 
2314*5c23704eSSong Gao DO_BITI(vbitclri_b, 8, UB, DO_BITCLR)
2315*5c23704eSSong Gao DO_BITI(vbitclri_h, 16, UH, DO_BITCLR)
2316*5c23704eSSong Gao DO_BITI(vbitclri_w, 32, UW, DO_BITCLR)
2317*5c23704eSSong Gao DO_BITI(vbitclri_d, 64, UD, DO_BITCLR)
2318*5c23704eSSong Gao DO_BITI(vbitseti_b, 8, UB, DO_BITSET)
2319*5c23704eSSong Gao DO_BITI(vbitseti_h, 16, UH, DO_BITSET)
2320*5c23704eSSong Gao DO_BITI(vbitseti_w, 32, UW, DO_BITSET)
2321*5c23704eSSong Gao DO_BITI(vbitseti_d, 64, UD, DO_BITSET)
2322*5c23704eSSong Gao DO_BITI(vbitrevi_b, 8, UB, DO_BITREV)
2323*5c23704eSSong Gao DO_BITI(vbitrevi_h, 16, UH, DO_BITREV)
2324*5c23704eSSong Gao DO_BITI(vbitrevi_w, 32, UW, DO_BITREV)
2325*5c23704eSSong Gao DO_BITI(vbitrevi_d, 64, UD, DO_BITREV)
2326*5c23704eSSong Gao 
2327*5c23704eSSong Gao #define VFRSTP(NAME, BIT, MASK, E)                             \
2328*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2329*5c23704eSSong Gao {                                                              \
2330*5c23704eSSong Gao     int i, j, m, ofs;                                          \
2331*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                     \
2332*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                     \
2333*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                     \
2334*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                              \
2335*5c23704eSSong Gao                                                                \
2336*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                       \
2337*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                         \
2338*5c23704eSSong Gao         m = Vk->E(i * ofs) & MASK;                             \
2339*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                            \
2340*5c23704eSSong Gao             if (Vj->E(j + ofs * i) < 0) {                      \
2341*5c23704eSSong Gao                 break;                                         \
2342*5c23704eSSong Gao             }                                                  \
2343*5c23704eSSong Gao         }                                                      \
2344*5c23704eSSong Gao         Vd->E(m + i * ofs) = j;                                \
2345*5c23704eSSong Gao     }                                                          \
2346*5c23704eSSong Gao }
2347*5c23704eSSong Gao 
2348*5c23704eSSong Gao VFRSTP(vfrstp_b, 8, 0xf, B)
2349*5c23704eSSong Gao VFRSTP(vfrstp_h, 16, 0x7, H)
2350*5c23704eSSong Gao 
2351*5c23704eSSong Gao #define VFRSTPI(NAME, BIT, E)                                      \
2352*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2353*5c23704eSSong Gao {                                                                  \
2354*5c23704eSSong Gao     int i, j, m, ofs;                                              \
2355*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                         \
2356*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                         \
2357*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
2358*5c23704eSSong Gao                                                                    \
2359*5c23704eSSong Gao     ofs = LSX_LEN / BIT;                                           \
2360*5c23704eSSong Gao     m = imm % ofs;                                                 \
2361*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                             \
2362*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                \
2363*5c23704eSSong Gao             if (Vj->E(j + ofs * i) < 0) {                          \
2364*5c23704eSSong Gao                 break;                                             \
2365*5c23704eSSong Gao             }                                                      \
2366*5c23704eSSong Gao         }                                                          \
2367*5c23704eSSong Gao         Vd->E(m + i * ofs) = j;                                    \
2368*5c23704eSSong Gao     }                                                              \
2369*5c23704eSSong Gao }
2370*5c23704eSSong Gao 
2371*5c23704eSSong Gao VFRSTPI(vfrstpi_b, 8,  B)
2372*5c23704eSSong Gao VFRSTPI(vfrstpi_h, 16, H)
2373*5c23704eSSong Gao 
/*
 * Transfer the pending softfloat exception flags into FCSR0.
 *
 *   env:  CPU state holding fp_status and fcsr0
 *   pc:   host return address forwarded to do_raise_exception()
 *   mask: softfloat flag bits to ignore
 *
 * The softfloat flags are read and then cleared.  Any flags left after
 * masking are converted with ieee_ex_to_loongarch() and recorded via
 * UPDATE_FP_CAUSE.  If one of them is enabled in FCSR0, EXCCODE_FPE is
 * raised; otherwise they are recorded via UPDATE_FP_FLAGS.
 */
static void vec_update_fcsr0_mask(CPULoongArchState *env,
                                  uintptr_t pc, int mask)
{
    int pending = get_float_exception_flags(&env->fp_status) & ~mask;
    int cause = 0;

    set_float_exception_flags(0, &env->fp_status);

    if (pending != 0) {
        cause = ieee_ex_to_loongarch(pending);
        UPDATE_FP_CAUSE(env->fcsr0, cause);
    }

    if ((GET_FP_ENABLES(env->fcsr0) & cause) == 0) {
        UPDATE_FP_FLAGS(env->fcsr0, cause);
        return;
    }

    do_raise_exception(env, EXCCODE_FPE, pc);
}
2394*5c23704eSSong Gao 
/*
 * Unmasked variant of vec_update_fcsr0_mask(): every pending softfloat
 * exception flag is taken into account.
 */
static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc)
{
    const int no_mask = 0;

    vec_update_fcsr0_mask(env, pc, no_mask);
}
2399*5c23704eSSong Gao 
/*
 * Reset the cause field of FCSR0 to zero.  The vector FP helpers in this
 * file call this once before processing their elements.
 */
static inline void vec_clear_cause(CPULoongArchState *env)
{
    const int no_cause = 0;

    SET_FP_CAUSE(env->fcsr0, no_cause);
}
2404*5c23704eSSong Gao 
2405*5c23704eSSong Gao #define DO_3OP_F(NAME, BIT, E, FN)                          \
2406*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk,             \
2407*5c23704eSSong Gao                   CPULoongArchState *env, uint32_t desc)    \
2408*5c23704eSSong Gao {                                                           \
2409*5c23704eSSong Gao     int i;                                                  \
2410*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                  \
2411*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                  \
2412*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                  \
2413*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                           \
2414*5c23704eSSong Gao                                                             \
2415*5c23704eSSong Gao     vec_clear_cause(env);                                   \
2416*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {               \
2417*5c23704eSSong Gao         Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \
2418*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());                     \
2419*5c23704eSSong Gao     }                                                       \
2420*5c23704eSSong Gao }
2421*5c23704eSSong Gao 
2422*5c23704eSSong Gao DO_3OP_F(vfadd_s, 32, UW, float32_add)
2423*5c23704eSSong Gao DO_3OP_F(vfadd_d, 64, UD, float64_add)
2424*5c23704eSSong Gao DO_3OP_F(vfsub_s, 32, UW, float32_sub)
2425*5c23704eSSong Gao DO_3OP_F(vfsub_d, 64, UD, float64_sub)
2426*5c23704eSSong Gao DO_3OP_F(vfmul_s, 32, UW, float32_mul)
2427*5c23704eSSong Gao DO_3OP_F(vfmul_d, 64, UD, float64_mul)
2428*5c23704eSSong Gao DO_3OP_F(vfdiv_s, 32, UW, float32_div)
2429*5c23704eSSong Gao DO_3OP_F(vfdiv_d, 64, UD, float64_div)
2430*5c23704eSSong Gao DO_3OP_F(vfmax_s, 32, UW, float32_maxnum)
2431*5c23704eSSong Gao DO_3OP_F(vfmax_d, 64, UD, float64_maxnum)
2432*5c23704eSSong Gao DO_3OP_F(vfmin_s, 32, UW, float32_minnum)
2433*5c23704eSSong Gao DO_3OP_F(vfmin_d, 64, UD, float64_minnum)
2434*5c23704eSSong Gao DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag)
2435*5c23704eSSong Gao DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag)
2436*5c23704eSSong Gao DO_3OP_F(vfmina_s, 32, UW, float32_minnummag)
2437*5c23704eSSong Gao DO_3OP_F(vfmina_d, 64, UD, float64_minnummag)
2438*5c23704eSSong Gao 
2439*5c23704eSSong Gao #define DO_4OP_F(NAME, BIT, E, FN, flags)                                    \
2440*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, void *va,                    \
2441*5c23704eSSong Gao                   CPULoongArchState *env, uint32_t desc)                     \
2442*5c23704eSSong Gao {                                                                            \
2443*5c23704eSSong Gao     int i;                                                                   \
2444*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                   \
2445*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                   \
2446*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                   \
2447*5c23704eSSong Gao     VReg *Va = (VReg *)va;                                                   \
2448*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                            \
2449*5c23704eSSong Gao                                                                              \
2450*5c23704eSSong Gao     vec_clear_cause(env);                                                    \
2451*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                                \
2452*5c23704eSSong Gao         Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \
2453*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());                                      \
2454*5c23704eSSong Gao     }                                                                        \
2455*5c23704eSSong Gao }
2456*5c23704eSSong Gao 
2457*5c23704eSSong Gao DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0)
2458*5c23704eSSong Gao DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0)
2459*5c23704eSSong Gao DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c)
2460*5c23704eSSong Gao DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c)
2461*5c23704eSSong Gao DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result)
2462*5c23704eSSong Gao DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result)
2463*5c23704eSSong Gao DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd,
2464*5c23704eSSong Gao          float_muladd_negate_c | float_muladd_negate_result)
2465*5c23704eSSong Gao DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd,
2466*5c23704eSSong Gao          float_muladd_negate_c | float_muladd_negate_result)
2467*5c23704eSSong Gao 
/*
 * DO_2OP_F - define HELPER(NAME), an element-wise one-source floating-point
 * operation.
 *
 *   NAME  helper name
 *   BIT   element width in bits; oprsz / (BIT / 8) elements are processed
 *   E     VReg element accessor
 *   FN    per-element routine invoked as FN(env, vj_elem)
 *
 * Only the FCSR0 cause field is cleared here; this macro does not update
 * FCSR0 after the elements, that is left to FN.
 */
#define DO_2OP_F(NAME, BIT, E, FN)                       \
void HELPER(NAME)(void *vd, void *vj,                    \
                  CPULoongArchState *env, uint32_t desc) \
{                                                        \
    VReg *dst = (VReg *)vd;                              \
    VReg *src = (VReg *)vj;                              \
    const int bytes = simd_oprsz(desc);                  \
    const int nelem = bytes / (BIT / 8);                 \
    int idx;                                             \
                                                         \
    vec_clear_cause(env);                                \
    for (idx = 0; idx < nelem; idx++) {                  \
        dst->E(idx) = FN(env, src->E(idx));              \
    }                                                    \
}
2482*5c23704eSSong Gao 
2483*5c23704eSSong Gao #define FLOGB(BIT, T)                                            \
2484*5c23704eSSong Gao static T do_flogb_## BIT(CPULoongArchState *env, T fj)           \
2485*5c23704eSSong Gao {                                                                \
2486*5c23704eSSong Gao     T fp, fd;                                                    \
2487*5c23704eSSong Gao     float_status *status = &env->fp_status;                      \
2488*5c23704eSSong Gao     FloatRoundMode old_mode = get_float_rounding_mode(status);   \
2489*5c23704eSSong Gao                                                                  \
2490*5c23704eSSong Gao     set_float_rounding_mode(float_round_down, status);           \
2491*5c23704eSSong Gao     fp = float ## BIT ##_log2(fj, status);                       \
2492*5c23704eSSong Gao     fd = float ## BIT ##_round_to_int(fp, status);               \
2493*5c23704eSSong Gao     set_float_rounding_mode(old_mode, status);                   \
2494*5c23704eSSong Gao     vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact);     \
2495*5c23704eSSong Gao     return fd;                                                   \
2496*5c23704eSSong Gao }
2497*5c23704eSSong Gao 
2498*5c23704eSSong Gao FLOGB(32, uint32_t)
2499*5c23704eSSong Gao FLOGB(64, uint64_t)
2500*5c23704eSSong Gao 
2501*5c23704eSSong Gao #define FCLASS(NAME, BIT, E, FN)                         \
2502*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj,                    \
2503*5c23704eSSong Gao                   CPULoongArchState *env, uint32_t desc) \
2504*5c23704eSSong Gao {                                                        \
2505*5c23704eSSong Gao     int i;                                               \
2506*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                               \
2507*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                               \
2508*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                        \
2509*5c23704eSSong Gao                                                          \
2510*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {            \
2511*5c23704eSSong Gao         Vd->E(i) = FN(env, Vj->E(i));                    \
2512*5c23704eSSong Gao     }                                                    \
2513*5c23704eSSong Gao }
2514*5c23704eSSong Gao 
2515*5c23704eSSong Gao FCLASS(vfclass_s, 32, UW, helper_fclass_s)
2516*5c23704eSSong Gao FCLASS(vfclass_d, 64, UD, helper_fclass_d)
2517*5c23704eSSong Gao 
2518*5c23704eSSong Gao #define FSQRT(BIT, T)                                  \
2519*5c23704eSSong Gao static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \
2520*5c23704eSSong Gao {                                                      \
2521*5c23704eSSong Gao     T fd;                                              \
2522*5c23704eSSong Gao     fd = float ## BIT ##_sqrt(fj, &env->fp_status);    \
2523*5c23704eSSong Gao     vec_update_fcsr0(env, GETPC());                    \
2524*5c23704eSSong Gao     return fd;                                         \
2525*5c23704eSSong Gao }
2526*5c23704eSSong Gao 
2527*5c23704eSSong Gao FSQRT(32, uint32_t)
2528*5c23704eSSong Gao FSQRT(64, uint64_t)
2529*5c23704eSSong Gao 
2530*5c23704eSSong Gao #define FRECIP(BIT, T)                                                  \
2531*5c23704eSSong Gao static T do_frecip_## BIT(CPULoongArchState *env, T fj)                 \
2532*5c23704eSSong Gao {                                                                       \
2533*5c23704eSSong Gao     T fd;                                                               \
2534*5c23704eSSong Gao     fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \
2535*5c23704eSSong Gao     vec_update_fcsr0(env, GETPC());                                     \
2536*5c23704eSSong Gao     return fd;                                                          \
2537*5c23704eSSong Gao }
2538*5c23704eSSong Gao 
2539*5c23704eSSong Gao FRECIP(32, uint32_t)
2540*5c23704eSSong Gao FRECIP(64, uint64_t)
2541*5c23704eSSong Gao 
2542*5c23704eSSong Gao #define FRSQRT(BIT, T)                                                  \
2543*5c23704eSSong Gao static T do_frsqrt_## BIT(CPULoongArchState *env, T fj)                 \
2544*5c23704eSSong Gao {                                                                       \
2545*5c23704eSSong Gao     T fd, fp;                                                           \
2546*5c23704eSSong Gao     fp = float ## BIT ##_sqrt(fj, &env->fp_status);                     \
2547*5c23704eSSong Gao     fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \
2548*5c23704eSSong Gao     vec_update_fcsr0(env, GETPC());                                     \
2549*5c23704eSSong Gao     return fd;                                                          \
2550*5c23704eSSong Gao }
2551*5c23704eSSong Gao 
2552*5c23704eSSong Gao FRSQRT(32, uint32_t)
2553*5c23704eSSong Gao FRSQRT(64, uint64_t)
2554*5c23704eSSong Gao 
2555*5c23704eSSong Gao DO_2OP_F(vflogb_s, 32, UW, do_flogb_32)
2556*5c23704eSSong Gao DO_2OP_F(vflogb_d, 64, UD, do_flogb_64)
2557*5c23704eSSong Gao DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32)
2558*5c23704eSSong Gao DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64)
2559*5c23704eSSong Gao DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32)
2560*5c23704eSSong Gao DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64)
2561*5c23704eSSong Gao DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32)
2562*5c23704eSSong Gao DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64)
2563*5c23704eSSong Gao 
/*
 * Widen a half-precision value to single precision.  The second argument
 * of float16_to_float32() is fixed to true (softfloat's "ieee" selector).
 */
static uint32_t float16_cvt_float32(uint16_t h, float_status *status)
{
    const bool ieee = true;

    return float16_to_float32(h, ieee, status);
}
/* Widen a single-precision value to double precision. */
static uint64_t float32_cvt_float64(uint32_t s, float_status *status)
{
    uint64_t widened = float32_to_float64(s, status);

    return widened;
}
2572*5c23704eSSong Gao 
/*
 * Narrow a single-precision value to half precision.  The second argument
 * of float32_to_float16() is fixed to true (softfloat's "ieee" selector).
 */
static uint16_t float32_cvt_float16(uint32_t s, float_status *status)
{
    const bool ieee = true;

    return float32_to_float16(s, ieee, status);
}
/* Narrow a double-precision value to single precision. */
static uint32_t float64_cvt_float32(uint64_t d, float_status *status)
{
    uint32_t narrowed = float64_to_float32(d, status);

    return narrowed;
}
2581*5c23704eSSong Gao 
HELPER(vfcvtl_s_h)2582*5c23704eSSong Gao void HELPER(vfcvtl_s_h)(void *vd, void *vj,
2583*5c23704eSSong Gao                         CPULoongArchState *env, uint32_t desc)
2584*5c23704eSSong Gao {
2585*5c23704eSSong Gao     int i, j, ofs;
2586*5c23704eSSong Gao     VReg temp = {};
2587*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2588*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2589*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2590*5c23704eSSong Gao 
2591*5c23704eSSong Gao     ofs = LSX_LEN / 32;
2592*5c23704eSSong Gao     vec_clear_cause(env);
2593*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2594*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2595*5c23704eSSong Gao             temp.UW(j + ofs * i) =float16_cvt_float32(Vj->UH(j + ofs * 2 * i),
2596*5c23704eSSong Gao                                                       &env->fp_status);
2597*5c23704eSSong Gao         }
2598*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2599*5c23704eSSong Gao     }
2600*5c23704eSSong Gao     *Vd = temp;
2601*5c23704eSSong Gao }
2602*5c23704eSSong Gao 
HELPER(vfcvtl_d_s)2603*5c23704eSSong Gao void HELPER(vfcvtl_d_s)(void *vd, void *vj,
2604*5c23704eSSong Gao                         CPULoongArchState *env, uint32_t desc)
2605*5c23704eSSong Gao {
2606*5c23704eSSong Gao     int i, j, ofs;
2607*5c23704eSSong Gao     VReg temp = {};
2608*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2609*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2610*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2611*5c23704eSSong Gao 
2612*5c23704eSSong Gao     ofs = LSX_LEN / 64;
2613*5c23704eSSong Gao     vec_clear_cause(env);
2614*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2615*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2616*5c23704eSSong Gao             temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i),
2617*5c23704eSSong Gao                                                        &env->fp_status);
2618*5c23704eSSong Gao         }
2619*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2620*5c23704eSSong Gao     }
2621*5c23704eSSong Gao     *Vd = temp;
2622*5c23704eSSong Gao }
2623*5c23704eSSong Gao 
HELPER(vfcvth_s_h)2624*5c23704eSSong Gao void HELPER(vfcvth_s_h)(void *vd, void *vj,
2625*5c23704eSSong Gao                         CPULoongArchState *env, uint32_t desc)
2626*5c23704eSSong Gao {
2627*5c23704eSSong Gao     int i, j, ofs;
2628*5c23704eSSong Gao     VReg temp = {};
2629*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2630*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2631*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2632*5c23704eSSong Gao 
2633*5c23704eSSong Gao     ofs = LSX_LEN / 32;
2634*5c23704eSSong Gao     vec_clear_cause(env);
2635*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2636*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2637*5c23704eSSong Gao             temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)),
2638*5c23704eSSong Gao                                                        &env->fp_status);
2639*5c23704eSSong Gao         }
2640*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2641*5c23704eSSong Gao     }
2642*5c23704eSSong Gao     *Vd = temp;
2643*5c23704eSSong Gao }
2644*5c23704eSSong Gao 
HELPER(vfcvth_d_s)2645*5c23704eSSong Gao void HELPER(vfcvth_d_s)(void *vd, void *vj,
2646*5c23704eSSong Gao                         CPULoongArchState *env, uint32_t desc)
2647*5c23704eSSong Gao {
2648*5c23704eSSong Gao     int i, j, ofs;
2649*5c23704eSSong Gao     VReg temp = {};
2650*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2651*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2652*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2653*5c23704eSSong Gao 
2654*5c23704eSSong Gao     ofs = LSX_LEN / 64;
2655*5c23704eSSong Gao     vec_clear_cause(env);
2656*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2657*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2658*5c23704eSSong Gao             temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)),
2659*5c23704eSSong Gao                                                         &env->fp_status);
2660*5c23704eSSong Gao         }
2661*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2662*5c23704eSSong Gao     }
2663*5c23704eSSong Gao     *Vd = temp;
2664*5c23704eSSong Gao }
2665*5c23704eSSong Gao 
HELPER(vfcvt_h_s)2666*5c23704eSSong Gao void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk,
2667*5c23704eSSong Gao                        CPULoongArchState *env, uint32_t desc)
2668*5c23704eSSong Gao {
2669*5c23704eSSong Gao     int i, j, ofs;
2670*5c23704eSSong Gao     VReg temp = {};
2671*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2672*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2673*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
2674*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2675*5c23704eSSong Gao 
2676*5c23704eSSong Gao     ofs = LSX_LEN / 32;
2677*5c23704eSSong Gao     vec_clear_cause(env);
2678*5c23704eSSong Gao     for(i = 0; i < oprsz / 16; i++) {
2679*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2680*5c23704eSSong Gao             temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i),
2681*5c23704eSSong Gao                                                                  &env->fp_status);
2682*5c23704eSSong Gao             temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i),
2683*5c23704eSSong Gao                                                            &env->fp_status);
2684*5c23704eSSong Gao         }
2685*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2686*5c23704eSSong Gao     }
2687*5c23704eSSong Gao     *Vd = temp;
2688*5c23704eSSong Gao }
2689*5c23704eSSong Gao 
HELPER(vfcvt_s_d)2690*5c23704eSSong Gao void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk,
2691*5c23704eSSong Gao                        CPULoongArchState *env, uint32_t desc)
2692*5c23704eSSong Gao {
2693*5c23704eSSong Gao     int i, j, ofs;
2694*5c23704eSSong Gao     VReg temp = {};
2695*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2696*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2697*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
2698*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2699*5c23704eSSong Gao 
2700*5c23704eSSong Gao     ofs = LSX_LEN / 64;
2701*5c23704eSSong Gao     vec_clear_cause(env);
2702*5c23704eSSong Gao     for(i = 0; i < oprsz / 16; i++) {
2703*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2704*5c23704eSSong Gao             temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i),
2705*5c23704eSSong Gao                                                                  &env->fp_status);
2706*5c23704eSSong Gao             temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i),
2707*5c23704eSSong Gao                                                            &env->fp_status);
2708*5c23704eSSong Gao         }
2709*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2710*5c23704eSSong Gao     }
2711*5c23704eSSong Gao     *Vd = temp;
2712*5c23704eSSong Gao }
2713*5c23704eSSong Gao 
HELPER(vfrint_s)2714*5c23704eSSong Gao void HELPER(vfrint_s)(void *vd, void *vj,
2715*5c23704eSSong Gao                       CPULoongArchState *env, uint32_t desc)
2716*5c23704eSSong Gao {
2717*5c23704eSSong Gao     int i;
2718*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2719*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2720*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2721*5c23704eSSong Gao 
2722*5c23704eSSong Gao     vec_clear_cause(env);
2723*5c23704eSSong Gao     for (i = 0; i < oprsz / 4; i++) {
2724*5c23704eSSong Gao         Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status);
2725*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2726*5c23704eSSong Gao     }
2727*5c23704eSSong Gao }
2728*5c23704eSSong Gao 
HELPER(vfrint_d)2729*5c23704eSSong Gao void HELPER(vfrint_d)(void *vd, void *vj,
2730*5c23704eSSong Gao                       CPULoongArchState *env, uint32_t desc)
2731*5c23704eSSong Gao {
2732*5c23704eSSong Gao     int i;
2733*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2734*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2735*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2736*5c23704eSSong Gao 
2737*5c23704eSSong Gao     vec_clear_cause(env);
2738*5c23704eSSong Gao     for (i = 0; i < oprsz / 8; i++) {
2739*5c23704eSSong Gao         Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status);
2740*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2741*5c23704eSSong Gao     }
2742*5c23704eSSong Gao }
2743*5c23704eSSong Gao 
2744*5c23704eSSong Gao #define FCVT_2OP(NAME, BIT, E, MODE)                                        \
2745*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj,                                       \
2746*5c23704eSSong Gao                   CPULoongArchState *env, uint32_t desc)                    \
2747*5c23704eSSong Gao {                                                                           \
2748*5c23704eSSong Gao     int i;                                                                  \
2749*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                  \
2750*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                  \
2751*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                           \
2752*5c23704eSSong Gao                                                                             \
2753*5c23704eSSong Gao     vec_clear_cause(env);                                                   \
2754*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                               \
2755*5c23704eSSong Gao         FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \
2756*5c23704eSSong Gao         set_float_rounding_mode(MODE, &env->fp_status);                     \
2757*5c23704eSSong Gao         Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \
2758*5c23704eSSong Gao         set_float_rounding_mode(old_mode, &env->fp_status);                 \
2759*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());                                     \
2760*5c23704eSSong Gao     }                                                                       \
2761*5c23704eSSong Gao }
2762*5c23704eSSong Gao 
2763*5c23704eSSong Gao FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even)
2764*5c23704eSSong Gao FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even)
2765*5c23704eSSong Gao FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero)
2766*5c23704eSSong Gao FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero)
2767*5c23704eSSong Gao FCVT_2OP(vfrintrp_s, 32, UW, float_round_up)
2768*5c23704eSSong Gao FCVT_2OP(vfrintrp_d, 64, UD, float_round_up)
2769*5c23704eSSong Gao FCVT_2OP(vfrintrm_s, 32, UW, float_round_down)
2770*5c23704eSSong Gao FCVT_2OP(vfrintrm_d, 64, UD, float_round_down)
2771*5c23704eSSong Gao 
/*
 * FTINT - define do_ftint<NAME>(): convert one floating-point element to
 * an integer under the fixed rounding mode MODE.
 *
 *   NAME        suffix of the generated function name
 *   FMT1, FMT2  select the softfloat routine FMT1_to_FMT2()
 *   T1, T2      raw source and result types
 *   MODE        FloatRoundMode forced for the conversion
 *
 * A NaN input that raised float_flag_invalid yields 0.
 *
 * The conversion is done inline rather than through do_<FMT1>_to_<FMT2>()
 * so that the caller's rounding mode can be restored BEFORE FCSR0 is
 * updated.  vec_update_fcsr0() can end in do_raise_exception(); with the
 * restore placed after that call, an enabled FP exception would leave MODE
 * installed in env->fp_status (NOTE(review): this assumes
 * do_raise_exception() does not return - confirm).
 *
 * NOTE(review): GETPC() is evaluated in this static routine rather than in
 * the HELPER() entry point - confirm that is the intended return address.
 */
#define FTINT(NAME, FMT1, FMT2, T1, T2,  MODE)                          \
static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj)               \
{                                                                       \
    T2 fd;                                                              \
    float_status *fpst = &env->fp_status;                               \
    FloatRoundMode old_mode = get_float_rounding_mode(fpst);            \
                                                                        \
    set_float_rounding_mode(MODE, fpst);                                \
    fd = FMT1 ##_to_## FMT2(fj, fpst);                                  \
    /* Restore first: the FCSR0 update below may raise an exception. */ \
    set_float_rounding_mode(old_mode, fpst);                            \
                                                                        \
    if (get_float_exception_flags(fpst) & float_flag_invalid) {         \
        if (FMT1 ##_is_any_nan(fj)) {                                   \
            fd = 0;                                                     \
        }                                                               \
    }                                                                   \
    vec_update_fcsr0(env, GETPC());                                     \
    return fd;                                                          \
}
2783*5c23704eSSong Gao 
2784*5c23704eSSong Gao #define DO_FTINT(FMT1, FMT2, T1, T2)                                         \
2785*5c23704eSSong Gao static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj)            \
2786*5c23704eSSong Gao {                                                                            \
2787*5c23704eSSong Gao     T2 fd;                                                                   \
2788*5c23704eSSong Gao                                                                              \
2789*5c23704eSSong Gao     fd = FMT1 ##_to_## FMT2(fj, &env->fp_status);                            \
2790*5c23704eSSong Gao     if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \
2791*5c23704eSSong Gao         if (FMT1 ##_is_any_nan(fj)) {                                        \
2792*5c23704eSSong Gao             fd = 0;                                                          \
2793*5c23704eSSong Gao         }                                                                    \
2794*5c23704eSSong Gao     }                                                                        \
2795*5c23704eSSong Gao     vec_update_fcsr0(env, GETPC());                                          \
2796*5c23704eSSong Gao     return fd;                                                               \
2797*5c23704eSSong Gao }
2798*5c23704eSSong Gao 
DO_FTINT(float32,int32,uint32_t,uint32_t)2799*5c23704eSSong Gao DO_FTINT(float32, int32, uint32_t, uint32_t)
2800*5c23704eSSong Gao DO_FTINT(float64, int64, uint64_t, uint64_t)
2801*5c23704eSSong Gao DO_FTINT(float32, uint32, uint32_t, uint32_t)
2802*5c23704eSSong Gao DO_FTINT(float64, uint64, uint64_t, uint64_t)
2803*5c23704eSSong Gao DO_FTINT(float64, int32, uint64_t, uint32_t)
2804*5c23704eSSong Gao DO_FTINT(float32, int64, uint32_t, uint64_t)
2805*5c23704eSSong Gao 
2806*5c23704eSSong Gao FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even)
2807*5c23704eSSong Gao FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even)
2808*5c23704eSSong Gao FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up)
2809*5c23704eSSong Gao FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up)
2810*5c23704eSSong Gao FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero)
2811*5c23704eSSong Gao FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero)
2812*5c23704eSSong Gao FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down)
2813*5c23704eSSong Gao FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down)
2814*5c23704eSSong Gao 
2815*5c23704eSSong Gao DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s)
2816*5c23704eSSong Gao DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d)
2817*5c23704eSSong Gao DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s)
2818*5c23704eSSong Gao DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d)
2819*5c23704eSSong Gao DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s)
2820*5c23704eSSong Gao DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d)
2821*5c23704eSSong Gao DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s)
2822*5c23704eSSong Gao DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d)
2823*5c23704eSSong Gao DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32)
2824*5c23704eSSong Gao DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64)
2825*5c23704eSSong Gao 
2826*5c23704eSSong Gao FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero)
2827*5c23704eSSong Gao FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero)
2828*5c23704eSSong Gao 
2829*5c23704eSSong Gao DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s)
2830*5c23704eSSong Gao DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d)
2831*5c23704eSSong Gao DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32)
2832*5c23704eSSong Gao DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64)
2833*5c23704eSSong Gao 
2834*5c23704eSSong Gao FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down)
2835*5c23704eSSong Gao FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up)
2836*5c23704eSSong Gao FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero)
2837*5c23704eSSong Gao FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even)
2838*5c23704eSSong Gao 
2839*5c23704eSSong Gao #define FTINT_W_D(NAME, FN)                                               \
2840*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk,                           \
2841*5c23704eSSong Gao                   CPULoongArchState *env, uint32_t desc)                  \
2842*5c23704eSSong Gao {                                                                         \
2843*5c23704eSSong Gao     int i, j, ofs;                                                        \
2844*5c23704eSSong Gao     VReg temp = {};                                                       \
2845*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                \
2846*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                \
2847*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                                \
2848*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                         \
2849*5c23704eSSong Gao                                                                           \
2850*5c23704eSSong Gao     ofs = LSX_LEN / 64;                                                   \
2851*5c23704eSSong Gao     vec_clear_cause(env);                                                 \
2852*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
2853*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {                                       \
2854*5c23704eSSong Gao             temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \
2855*5c23704eSSong Gao             temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i));       \
2856*5c23704eSSong Gao         }                                                                 \
2857*5c23704eSSong Gao     }                                                                     \
2858*5c23704eSSong Gao     *Vd = temp;                                                           \
2859*5c23704eSSong Gao }
2860*5c23704eSSong Gao 
2861*5c23704eSSong Gao FTINT_W_D(vftint_w_d, do_float64_to_int32)
2862*5c23704eSSong Gao FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d)
2863*5c23704eSSong Gao FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d)
2864*5c23704eSSong Gao FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d)
2865*5c23704eSSong Gao FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d)
2866*5c23704eSSong Gao 
/*
 * Scalar float32 -> int64 converters, one per explicit rounding mode
 * (down, up, to-zero, nearest-even).  The "l" and "h" flavours are created
 * with identical arguments; they are referenced further down as
 * do_ftint<name> by the FTINTL_L_S and FTINTH_L_S instantiations.
 * NOTE(review): FTINT is defined earlier in the file; the do_ftint<name>
 * naming is inferred from those later uses -- confirm against the macro.
 */
2867*5c23704eSSong Gao FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2868*5c23704eSSong Gao FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2869*5c23704eSSong Gao FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2870*5c23704eSSong Gao FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2871*5c23704eSSong Gao FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2872*5c23704eSSong Gao FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2873*5c23704eSSong Gao FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2874*5c23704eSSong Gao FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2875*5c23704eSSong Gao 
/*
 * FTINTL_L_S - define HELPER(NAME), which widens 32-bit source elements of
 * Vj into 64-bit results in Vd using the scalar converter FN.
 *
 * NAME: helper name handed to HELPER().
 * FN:   converter invoked as FN(env, <32-bit element>).
 *
 * With ofs = LSX_LEN / 64, for every 16-byte group i (oprsz / 16 groups)
 * and every j in [0, ofs):
 *     D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i))
 * i.e. the lower-indexed words of each group are consumed (this reading
 * assumes LSX_LEN is 128 -- TODO confirm against vec.h).
 *
 * The result is assembled in a temporary and copied over *Vd as a whole,
 * so the helper works when vd and vj alias.  The temporary is
 * zero-initialised, like in FTINTH_L_S: the loop fills only
 * ofs * (oprsz / 16) doublewords, and the whole-register copy must not
 * leak indeterminate stack contents into the remaining bytes of Vd.
 */
#define FTINTL_L_S(NAME, FN)                                        \
void HELPER(NAME)(void *vd, void *vj,                               \
                  CPULoongArchState *env, uint32_t desc)            \
{                                                                   \
    int i, j, ofs;                                                  \
    VReg temp = {};                                                 \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
    int oprsz = simd_oprsz(desc);                                   \
                                                                    \
    ofs = LSX_LEN / 64;                                             \
    vec_clear_cause(env);                                           \
    for (i = 0; i < oprsz / 16; i++) {                              \
        for (j = 0; j < ofs; j++) {                                 \
            temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i)); \
        }                                                           \
    }                                                               \
    *Vd = temp;                                                     \
}
2895*5c23704eSSong Gao 
/*
 * FTINTL_L_S helpers: one using do_float32_to_int64 and one for each of
 * the do_ftintr{m,p,z,ne}l_l_s converters.
 */
2896*5c23704eSSong Gao FTINTL_L_S(vftintl_l_s, do_float32_to_int64)
2897*5c23704eSSong Gao FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s)
2898*5c23704eSSong Gao FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s)
2899*5c23704eSSong Gao FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s)
2900*5c23704eSSong Gao FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s)
2901*5c23704eSSong Gao 
/*
 * FTINTH_L_S - define HELPER(NAME), which widens 32-bit source elements of
 * Vj into 64-bit results in Vd using the scalar converter FN.
 *
 * NAME: helper name handed to HELPER().
 * FN:   converter invoked as FN(env, <32-bit element>).
 *
 * With half = LSX_LEN / 64, 16-byte group grp contributes the words at
 * indices half * (2 * grp + 1) + n, n in [0, half); the results land in
 * doublewords half * grp + n.  The output is built in a zeroed temporary
 * and copied to *Vd at the end, so vd may alias vj.
 */
#define FTINTH_L_S(NAME, FN)                                          \
void HELPER(NAME)(void *vd, void *vj,                                 \
                  CPULoongArchState *env, uint32_t desc)              \
{                                                                     \
    VReg *dst = (VReg *)vd;                                           \
    VReg *src = (VReg *)vj;                                           \
    VReg result = {};                                                 \
    const int half = LSX_LEN / 64;                                    \
    const int groups = simd_oprsz(desc) / 16;                         \
    int grp, n;                                                       \
                                                                      \
    vec_clear_cause(env);                                             \
    for (grp = 0; grp < groups; grp++) {                              \
        const int hi = half * (2 * grp + 1);                          \
        for (n = 0; n < half; n++) {                                  \
            result.D(half * grp + n) = FN(env, src->UW(hi + n));      \
        }                                                             \
    }                                                                 \
    *dst = result;                                                    \
}
2921*5c23704eSSong Gao 
/*
 * FTINTH_L_S helpers: one using do_float32_to_int64 and one for each of
 * the do_ftintr{m,p,z,ne}h_l_s converters.
 */
2922*5c23704eSSong Gao FTINTH_L_S(vftinth_l_s, do_float32_to_int64)
2923*5c23704eSSong Gao FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s)
2924*5c23704eSSong Gao FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s)
2925*5c23704eSSong Gao FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s)
2926*5c23704eSSong Gao FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s)
2927*5c23704eSSong Gao 
/*
 * FFINT - define the static scalar converter do_ffint_<NAME>.
 *
 * FMT1/FMT2 select the conversion routine FMT1_to_FMT2(); T1 is the
 * argument type and T2 the return type.  After converting with
 * env->fp_status, vec_update_fcsr0() is called with the helper's GETPC().
 */
#define FFINT(NAME, FMT1, FMT2, T1, T2)                             \
static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj)          \
{                                                                   \
    const T2 converted = FMT1 ## _to_ ## FMT2(fj, &env->fp_status); \
                                                                    \
    vec_update_fcsr0(env, GETPC());                                 \
    return converted;                                               \
}
2937*5c23704eSSong Gao 
/*
 * Scalar integer -> float converters: do_ffint_s_w and do_ffint_s_wu
 * (32-bit signed/unsigned to float32), do_ffint_d_l and do_ffint_d_lu
 * (64-bit signed/unsigned to float64).
 */
2938*5c23704eSSong Gao FFINT(s_w, int32, float32, int32_t, uint32_t)
2939*5c23704eSSong Gao FFINT(d_l, int64, float64, int64_t, uint64_t)
2940*5c23704eSSong Gao FFINT(s_wu, uint32, float32, uint32_t, uint32_t)
2941*5c23704eSSong Gao FFINT(d_lu, uint64, float64, uint64_t, uint64_t)
2942*5c23704eSSong Gao 
/*
 * Vector helpers built on the do_ffint_* converters; the third argument
 * names the element accessor (signed W/D or unsigned UW/UD).
 * NOTE(review): DO_2OP_F is defined earlier in the file and not visible
 * here; presumably it applies the converter element by element -- confirm.
 */
2943*5c23704eSSong Gao DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w)
2944*5c23704eSSong Gao DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l)
2945*5c23704eSSong Gao DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu)
2946*5c23704eSSong Gao DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu)
2947*5c23704eSSong Gao 
2948*5c23704eSSong Gao void HELPER(vffintl_d_w)(void *vd, void *vj,
2949*5c23704eSSong Gao                          CPULoongArchState *env, uint32_t desc)
2950*5c23704eSSong Gao {
2951*5c23704eSSong Gao     int i, j, ofs;
2952*5c23704eSSong Gao     VReg temp = {};
2953*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2954*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2955*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2956*5c23704eSSong Gao 
2957*5c23704eSSong Gao     ofs = LSX_LEN / 64;
2958*5c23704eSSong Gao     vec_clear_cause(env);
2959*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
2960*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2961*5c23704eSSong Gao             temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i),
2962*5c23704eSSong Gao                                                    &env->fp_status);
2963*5c23704eSSong Gao         }
2964*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2965*5c23704eSSong Gao     }
2966*5c23704eSSong Gao     *Vd = temp;
2967*5c23704eSSong Gao }
2968*5c23704eSSong Gao 
HELPER(vffinth_d_w)2969*5c23704eSSong Gao void HELPER(vffinth_d_w)(void *vd, void *vj,
2970*5c23704eSSong Gao                          CPULoongArchState *env, uint32_t desc)
2971*5c23704eSSong Gao {
2972*5c23704eSSong Gao     int i, j, ofs;
2973*5c23704eSSong Gao     VReg temp = {};
2974*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2975*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2976*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2977*5c23704eSSong Gao 
2978*5c23704eSSong Gao     ofs = LSX_LEN / 64;
2979*5c23704eSSong Gao     vec_clear_cause(env);
2980*5c23704eSSong Gao     for (i = 0; i < oprsz /16; i++) {
2981*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
2982*5c23704eSSong Gao             temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)),
2983*5c23704eSSong Gao                                                    &env->fp_status);
2984*5c23704eSSong Gao         }
2985*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
2986*5c23704eSSong Gao     }
2987*5c23704eSSong Gao     *Vd = temp;
2988*5c23704eSSong Gao }
2989*5c23704eSSong Gao 
HELPER(vffint_s_l)2990*5c23704eSSong Gao void HELPER(vffint_s_l)(void *vd, void *vj, void *vk,
2991*5c23704eSSong Gao                         CPULoongArchState *env, uint32_t desc)
2992*5c23704eSSong Gao {
2993*5c23704eSSong Gao     int i, j, ofs;
2994*5c23704eSSong Gao     VReg temp = {};
2995*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
2996*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
2997*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
2998*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
2999*5c23704eSSong Gao 
3000*5c23704eSSong Gao     ofs = LSX_LEN / 64;
3001*5c23704eSSong Gao     vec_clear_cause(env);
3002*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
3003*5c23704eSSong Gao         for (j = 0; j < ofs; j++) {
3004*5c23704eSSong Gao             temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i),
3005*5c23704eSSong Gao                                                              &env->fp_status);
3006*5c23704eSSong Gao             temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i),
3007*5c23704eSSong Gao                                                        &env->fp_status);
3008*5c23704eSSong Gao         }
3009*5c23704eSSong Gao         vec_update_fcsr0(env, GETPC());
3010*5c23704eSSong Gao     }
3011*5c23704eSSong Gao     *Vd = temp;
3012*5c23704eSSong Gao }
3013*5c23704eSSong Gao 
/*
 * VCMPI - define HELPER(NAME), an element-wise compare against an immediate.
 *
 * BIT:   element width in bits (sets the element count oprsz / (BIT / 8)).
 * E:     VReg element accessor; its element type is also the type the
 *        immediate is cast to before comparing.
 * DO_OP: two-argument comparison macro applied as DO_OP(element, imm).
 *
 * Each element of Vd receives DO_OP applied to the matching element of Vj.
 */
#define VCMPI(NAME, BIT, E, DO_OP)                                 \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{                                                                  \
    VReg *dst = (VReg *)vd;                                        \
    VReg *src = (VReg *)vj;                                        \
    typedef __typeof(dst->E(0)) TD;                                \
    const TD rhs = (TD)imm;                                        \
    const int count = simd_oprsz(desc) / (BIT / 8);                \
    int idx;                                                       \
                                                                   \
    for (idx = 0; idx < count; idx++) {                            \
        dst->E(idx) = DO_OP(src->E(idx), rhs);                     \
    }                                                              \
}
3027*5c23704eSSong Gao 
/*
 * Compare-with-immediate helpers.  VSEQ is instantiated for the signed
 * accessors B/H/W/D only; VSLE and VSLT are instantiated for both the
 * signed (B/H/W/D) and unsigned (UB/UH/UW/UD) accessors, which also decides
 * the type the immediate is cast to inside VCMPI.
 */
3028*5c23704eSSong Gao VCMPI(vseqi_b, 8, B, VSEQ)
3029*5c23704eSSong Gao VCMPI(vseqi_h, 16, H, VSEQ)
3030*5c23704eSSong Gao VCMPI(vseqi_w, 32, W, VSEQ)
3031*5c23704eSSong Gao VCMPI(vseqi_d, 64, D, VSEQ)
3032*5c23704eSSong Gao VCMPI(vslei_b, 8, B, VSLE)
3033*5c23704eSSong Gao VCMPI(vslei_h, 16, H, VSLE)
3034*5c23704eSSong Gao VCMPI(vslei_w, 32, W, VSLE)
3035*5c23704eSSong Gao VCMPI(vslei_d, 64, D, VSLE)
3036*5c23704eSSong Gao VCMPI(vslei_bu, 8, UB, VSLE)
3037*5c23704eSSong Gao VCMPI(vslei_hu, 16, UH, VSLE)
3038*5c23704eSSong Gao VCMPI(vslei_wu, 32, UW, VSLE)
3039*5c23704eSSong Gao VCMPI(vslei_du, 64, UD, VSLE)
3040*5c23704eSSong Gao VCMPI(vslti_b, 8, B, VSLT)
3041*5c23704eSSong Gao VCMPI(vslti_h, 16, H, VSLT)
3042*5c23704eSSong Gao VCMPI(vslti_w, 32, W, VSLT)
3043*5c23704eSSong Gao VCMPI(vslti_d, 64, D, VSLT)
3044*5c23704eSSong Gao VCMPI(vslti_bu, 8, UB, VSLT)
3045*5c23704eSSong Gao VCMPI(vslti_hu, 16, UH, VSLT)
3046*5c23704eSSong Gao VCMPI(vslti_wu, 32, UW, VSLT)
3047*5c23704eSSong Gao VCMPI(vslti_du, 64, UD, VSLT)
3048*5c23704eSSong Gao 
/*
 * Translate a softfloat comparison outcome into a vector compare result.
 *
 * Each FloatRelation value selects one FCMP_* bit (less -> FCMP_LT,
 * equal -> FCMP_EQ, greater -> FCMP_GT, unordered -> FCMP_UN).  Returns
 * all-ones when that bit is set in @flags and 0 otherwise; any other
 * relation value is a programming error.  @env is not used.
 */
static uint64_t vfcmp_common(CPULoongArchState *env,
                             FloatRelation cmp, uint32_t flags)
{
    uint32_t wanted = 0;

    switch (cmp) {
    case float_relation_less:
        wanted = FCMP_LT;
        break;
    case float_relation_equal:
        wanted = FCMP_EQ;
        break;
    case float_relation_greater:
        wanted = FCMP_GT;
        break;
    case float_relation_unordered:
        wanted = FCMP_UN;
        break;
    default:
        g_assert_not_reached();
    }

    return (flags & wanted) ? ~(uint64_t)0 : 0;
}
3077*5c23704eSSong Gao 
/*
 * VFCMP - define HELPER(NAME), an element-wise floating-point compare.
 *
 * BIT: element width in bits (sets the element count oprsz / (BIT / 8)).
 * E:   VReg element accessor used for sources and destination.
 * FN:  softfloat compare routine returning a FloatRelation.
 *
 * vd, vj and vk are register numbers indexing env->fpr[], not pointers.
 * Every result element is what vfcmp_common() returns for the comparison
 * outcome and the @flags mask; FCSR0 is updated after each element.
 *
 * The result is built in a temporary and copied over *Vd as a whole, so
 * the destination may be the same register as a source.  The temporary
 * is zero-initialised: the loop fills only the first oprsz bytes, and the
 * whole-register copy must not leak indeterminate stack contents into the
 * rest of Vd.
 */
#define VFCMP(NAME, BIT, E, FN)                                          \
void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz,                \
                  uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \
{                                                                        \
    int i;                                                               \
    VReg t = {};                                                         \
    VReg *Vd = &(env->fpr[vd].vreg);                                     \
    VReg *Vj = &(env->fpr[vj].vreg);                                     \
    VReg *Vk = &(env->fpr[vk].vreg);                                     \
                                                                         \
    vec_clear_cause(env);                                                \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                            \
        FloatRelation cmp;                                               \
        cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status);                   \
        t.E(i) = vfcmp_common(env, cmp, flags);                          \
        vec_update_fcsr0(env, GETPC());                                  \
    }                                                                    \
    *Vd = t;                                                             \
}
3097*5c23704eSSong Gao 
/*
 * Floating-point compare helpers for 32- and 64-bit elements: the *_c_*
 * helpers use the softfloat *_compare_quiet routines, the *_s_* helpers
 * the plain *_compare routines.
 */
3098*5c23704eSSong Gao VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet)
3099*5c23704eSSong Gao VFCMP(vfcmp_s_s, 32, UW, float32_compare)
3100*5c23704eSSong Gao VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet)
3101*5c23704eSSong Gao VFCMP(vfcmp_s_d, 64, UD, float64_compare)
3102*5c23704eSSong Gao 
HELPER(vbitseli_b)3103*5c23704eSSong Gao void HELPER(vbitseli_b)(void *vd, void *vj,  uint64_t imm, uint32_t desc)
3104*5c23704eSSong Gao {
3105*5c23704eSSong Gao     int i;
3106*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
3107*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
3108*5c23704eSSong Gao 
3109*5c23704eSSong Gao     for (i = 0; i < simd_oprsz(desc); i++) {
3110*5c23704eSSong Gao         Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm);
3111*5c23704eSSong Gao     }
3112*5c23704eSSong Gao }
3113*5c23704eSSong Gao 
/* Copy from target/arm/tcg/sve_helper.c */
/*
 * Report whether any (8 << esz)-bit element of m0 or m1 matches n.
 *
 * n is expanded with dup_const(esz, n) and XORed with each operand, so a
 * matching element becomes zero.  "(x - lsbs) & ~x" then sets the top bit
 * of an element when x holds a zero element, and masking with msbs keeps
 * only those top bits.
 * NOTE(review): dup_const() is defined outside this file; it is assumed to
 * replicate its value into every element of a 64-bit word -- confirm.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    const uint64_t lsbs = dup_const(esz, 1);
    const uint64_t msbs = lsbs << ((8 << esz) - 1);
    const uint64_t pattern = dup_const(esz, n);
    uint64_t diff0 = pattern ^ m0;
    uint64_t diff1 = pattern ^ m1;

    diff0 = (diff0 - lsbs) & ~diff0;
    diff1 = (diff1 - lsbs) & ~diff1;
    return ((diff0 | diff1) & msbs) != 0;
}
3129*5c23704eSSong Gao 
/*
 * SETANYEQZ - define HELPER(NAME), which sets condition flag cd & 0x7 to
 * the result of do_match2(0, ...) over the source register.
 *
 * MO is the element-size code handed to do_match2().  Doublewords 0 and 1
 * are always tested; doublewords 2 and 3 are tested as well when oprsz
 * is 32.  The flag ends up true if either test returned true.
 */
#define SETANYEQZ(NAME, MO)                                     \
void HELPER(NAME)(CPULoongArchState *env,                       \
                  uint32_t oprsz, uint32_t cd, uint32_t vj)     \
{                                                               \
    const VReg *src = &(env->fpr[vj].vreg);                     \
    bool found = do_match2(0, src->D(0), src->D(1), MO);        \
                                                                \
    if (!found && oprsz == 32) {                                \
        found = do_match2(0, src->D(2), src->D(3), MO);         \
    }                                                           \
    env->cf[cd & 0x7] = found;                                  \
}
3142*5c23704eSSong Gao 
/* One SETANYEQZ helper per element size, MO_8 through MO_64. */
SETANYEQZ(vsetanyeqz_b,MO_8)3143*5c23704eSSong Gao SETANYEQZ(vsetanyeqz_b, MO_8)
3144*5c23704eSSong Gao SETANYEQZ(vsetanyeqz_h, MO_16)
3145*5c23704eSSong Gao SETANYEQZ(vsetanyeqz_w, MO_32)
3146*5c23704eSSong Gao SETANYEQZ(vsetanyeqz_d, MO_64)
3147*5c23704eSSong Gao 
/*
 * SETALLNEZ - define HELPER(NAME), which sets condition flag cd & 0x7 to
 * the negated result of do_match2(0, ...) over the source register.
 *
 * MO is the element-size code handed to do_match2().  Doublewords 0 and 1
 * are always tested; doublewords 2 and 3 are tested as well when oprsz
 * is 32.  The flag ends up true only if no test returned true.
 */
#define SETALLNEZ(NAME, MO)                                     \
void HELPER(NAME)(CPULoongArchState *env,                       \
                  uint32_t oprsz, uint32_t cd, uint32_t vj)     \
{                                                               \
    const VReg *src = &(env->fpr[vj].vreg);                     \
    bool all_nz = !do_match2(0, src->D(0), src->D(1), MO);      \
                                                                \
    if (all_nz && oprsz == 32) {                                \
        all_nz = !do_match2(0, src->D(2), src->D(3), MO);       \
    }                                                           \
    env->cf[cd & 0x7] = all_nz;                                 \
}
3160*5c23704eSSong Gao 
/* One SETALLNEZ helper per element size, MO_8 through MO_64. */
3161*5c23704eSSong Gao SETALLNEZ(vsetallnez_b, MO_8)
3162*5c23704eSSong Gao SETALLNEZ(vsetallnez_h, MO_16)
3163*5c23704eSSong Gao SETALLNEZ(vsetallnez_w, MO_32)
3164*5c23704eSSong Gao SETALLNEZ(vsetallnez_d, MO_64)
3165*5c23704eSSong Gao 
/*
 * XVINSVE0 - define HELPER(NAME), which copies element 0 of Vj into the
 * element of Vd selected by (imm & MASK).  All other elements of Vd are
 * left untouched; desc is not used.
 */
#define XVINSVE0(NAME, E, MASK)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{                                                                  \
    const uint64_t slot = imm & MASK;                              \
    VReg *dst = (VReg *)vd;                                        \
    VReg *src = (VReg *)vj;                                        \
                                                                   \
    dst->E(slot) = src->E(0);                                      \
}
3173*5c23704eSSong Gao 
/* Word variant: index masked with 0x7; doubleword variant: with 0x3. */
3174*5c23704eSSong Gao XVINSVE0(xvinsve0_w, W, 0x7)
3175*5c23704eSSong Gao XVINSVE0(xvinsve0_d, D, 0x3)
3176*5c23704eSSong Gao 
/*
 * XVPICKVE - define HELPER(NAME), which moves the element of Vj selected
 * by (imm & MASK) into element 0 of Vd and then clears elements
 * 1 .. oprsz / (BIT / 8) - 1 of Vd.
 *
 * The selected element is read before anything is cleared, so the order
 * of operations matters when vd and vj alias.
 */
#define XVPICKVE(NAME, E, BIT, MASK)                               \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{                                                                  \
    VReg *dst = (VReg *)vd;                                        \
    VReg *src = (VReg *)vj;                                        \
    const int count = simd_oprsz(desc) / (BIT / 8);                \
    int idx = 0;                                                   \
                                                                   \
    dst->E(idx) = src->E(imm & MASK);                              \
    while (++idx < count) {                                        \
        dst->E(idx) = 0;                                           \
    }                                                              \
}
3190*5c23704eSSong Gao 
/* Word variant: index masked with 0x7; doubleword variant: with 0x3. */
3191*5c23704eSSong Gao XVPICKVE(xvpickve_w, W, 32, 0x7)
3192*5c23704eSSong Gao XVPICKVE(xvpickve_d, D, 64, 0x3)
3193*5c23704eSSong Gao 
/*
 * VPACKEV - define HELPER(NAME), which interleaves the even-indexed
 * elements of Vk and Vj.
 *
 * For every even index e below 2 * (oprsz / (BIT / 8)):
 *     result[e]     = Vk[e]
 *     result[e + 1] = Vj[e]
 * The result is built in a zeroed temporary and copied to *Vd at the end,
 * so vd may alias either source.
 */
#define VPACKEV(NAME, BIT, E)                                  \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    VReg *dst = (VReg *)vd;                                    \
    VReg *src_j = (VReg *)vj;                                  \
    VReg *src_k = (VReg *)vk;                                  \
    VReg result = {};                                          \
    const int pairs = simd_oprsz(desc) / (BIT / 8);            \
    int even;                                                  \
                                                               \
    for (even = 0; even < 2 * pairs; even += 2) {              \
        result.E(even) = src_k->E(even);                       \
        result.E(even + 1) = src_j->E(even);                   \
    }                                                          \
    *dst = result;                                             \
}
3210*5c23704eSSong Gao 
/*
 * BIT is passed as 16/32/64/128 for the B/H/W/D accessors because each
 * loop iteration of VPACKEV writes a pair of elements.
 */
3211*5c23704eSSong Gao VPACKEV(vpackev_b, 16, B)
3212*5c23704eSSong Gao VPACKEV(vpackev_h, 32, H)
3213*5c23704eSSong Gao VPACKEV(vpackev_w, 64, W)
3214*5c23704eSSong Gao VPACKEV(vpackev_d, 128, D)
3215*5c23704eSSong Gao 
/*
 * VPACKOD - define HELPER(NAME), which interleaves the odd-indexed
 * elements of Vk and Vj.
 *
 * For every even index e below 2 * (oprsz / (BIT / 8)):
 *     result[e]     = Vk[e + 1]
 *     result[e + 1] = Vj[e + 1]
 * The result is built in a zeroed temporary and copied to *Vd at the end,
 * so vd may alias either source.
 */
#define VPACKOD(NAME, BIT, E)                                  \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    VReg *dst = (VReg *)vd;                                    \
    VReg *src_j = (VReg *)vj;                                  \
    VReg *src_k = (VReg *)vk;                                  \
    VReg result = {};                                          \
    const int pairs = simd_oprsz(desc) / (BIT / 8);            \
    int even;                                                  \
                                                               \
    for (even = 0; even < 2 * pairs; even += 2) {              \
        result.E(even) = src_k->E(even + 1);                   \
        result.E(even + 1) = src_j->E(even + 1);               \
    }                                                          \
    *dst = result;                                             \
}
3232*5c23704eSSong Gao 
/*
 * BIT is passed as 16/32/64/128 for the B/H/W/D accessors because each
 * loop iteration of VPACKOD writes a pair of elements.
 */
3233*5c23704eSSong Gao VPACKOD(vpackod_b, 16, B)
3234*5c23704eSSong Gao VPACKOD(vpackod_h, 32, H)
3235*5c23704eSSong Gao VPACKOD(vpackod_w, 64, W)
3236*5c23704eSSong Gao VPACKOD(vpackod_d, 128, D)
3237*5c23704eSSong Gao 
/*
 * VPICKEV - define HELPER(NAME), which gathers the even-indexed elements
 * of Vk and Vj.
 *
 * With half = LSX_LEN / BIT and lo = 2 * half * grp for each 16-byte
 * group grp, for n in [0, half):
 *     result[lo + n]        = Vk[lo + 2 * n]
 *     result[lo + half + n] = Vj[lo + 2 * n]
 * The result is built in a zeroed temporary and copied to *Vd at the end,
 * so vd may alias either source.
 */
#define VPICKEV(NAME, BIT, E)                                  \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    VReg *dst = (VReg *)vd;                                    \
    VReg *src_j = (VReg *)vj;                                  \
    VReg *src_k = (VReg *)vk;                                  \
    VReg result = {};                                          \
    const int half = LSX_LEN / BIT;                            \
    const int groups = simd_oprsz(desc) / 16;                  \
    int grp, n;                                                \
                                                               \
    for (grp = 0; grp < groups; grp++) {                       \
        const int lo = 2 * half * grp;                         \
        for (n = 0; n < half; n++) {                           \
            result.E(lo + half + n) = src_j->E(lo + 2 * n);    \
            result.E(lo + n) = src_k->E(lo + 2 * n);           \
        }                                                      \
    }                                                          \
    *dst = result;                                             \
}
3257*5c23704eSSong Gao 
/*
 * BIT is passed as 16/32/64/128 for the B/H/W/D accessors, so that
 * LSX_LEN / BIT inside VPICKEV is the per-source element count of one
 * group (presumably half the elements of a 128-bit group -- confirm
 * LSX_LEN in vec.h).
 */
3258*5c23704eSSong Gao VPICKEV(vpickev_b, 16, B)
3259*5c23704eSSong Gao VPICKEV(vpickev_h, 32, H)
3260*5c23704eSSong Gao VPICKEV(vpickev_w, 64, W)
3261*5c23704eSSong Gao VPICKEV(vpickev_d, 128, D)
3262*5c23704eSSong Gao 
/*
 * VPICKOD - define HELPER(NAME), which gathers the odd-indexed elements
 * of Vk and Vj.
 *
 * With half = LSX_LEN / BIT and lo = 2 * half * grp for each 16-byte
 * group grp, for n in [0, half):
 *     result[lo + n]        = Vk[lo + 2 * n + 1]
 *     result[lo + half + n] = Vj[lo + 2 * n + 1]
 * The result is built in a zeroed temporary and copied to *Vd at the end,
 * so vd may alias either source.
 */
#define VPICKOD(NAME, BIT, E)                                   \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
{                                                               \
    VReg *dst = (VReg *)vd;                                     \
    VReg *src_j = (VReg *)vj;                                   \
    VReg *src_k = (VReg *)vk;                                   \
    VReg result = {};                                           \
    const int half = LSX_LEN / BIT;                             \
    const int groups = simd_oprsz(desc) / 16;                   \
    int grp, n;                                                 \
                                                                \
    for (grp = 0; grp < groups; grp++) {                        \
        const int lo = 2 * half * grp;                          \
        for (n = 0; n < half; n++) {                            \
            result.E(lo + half + n) = src_j->E(lo + 2 * n + 1); \
            result.E(lo + n) = src_k->E(lo + 2 * n + 1);        \
        }                                                       \
    }                                                           \
    *dst = result;                                              \
}
3282*5c23704eSSong Gao 
/*
 * BIT is passed as 16/32/64/128 for the B/H/W/D accessors, so that
 * LSX_LEN / BIT inside VPICKOD is the per-source element count of one
 * group (presumably half the elements of a 128-bit group -- confirm
 * LSX_LEN in vec.h).
 */
3283*5c23704eSSong Gao VPICKOD(vpickod_b, 16, B)
3284*5c23704eSSong Gao VPICKOD(vpickod_h, 32, H)
3285*5c23704eSSong Gao VPICKOD(vpickod_w, 64, W)
3286*5c23704eSSong Gao VPICKOD(vpickod_d, 128, D)
3287*5c23704eSSong Gao 
/*
 * VILVL - define HELPER(NAME), which interleaves the lower-indexed
 * elements of each group of Vk and Vj.
 *
 * With half = LSX_LEN / BIT and lo = 2 * half * grp for each 16-byte
 * group grp, for n in [0, half):
 *     result[lo + 2 * n]     = Vk[lo + n]
 *     result[lo + 2 * n + 1] = Vj[lo + n]
 * The result is built in a zeroed temporary and copied to *Vd at the end,
 * so vd may alias either source.
 */
#define VILVL(NAME, BIT, E)                                    \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    VReg *dst = (VReg *)vd;                                    \
    VReg *src_j = (VReg *)vj;                                  \
    VReg *src_k = (VReg *)vk;                                  \
    VReg result = {};                                          \
    const int half = LSX_LEN / BIT;                            \
    const int groups = simd_oprsz(desc) / 16;                  \
    int grp, n;                                                \
                                                               \
    for (grp = 0; grp < groups; grp++) {                       \
        const int lo = 2 * half * grp;                         \
        for (n = 0; n < half; n++) {                           \
            result.E(lo + 2 * n + 1) = src_j->E(lo + n);       \
            result.E(lo + 2 * n) = src_k->E(lo + n);           \
        }                                                      \
    }                                                          \
    *dst = result;                                             \
}
3307*5c23704eSSong Gao 
/*
 * BIT is passed as 16/32/64/128 for the B/H/W/D accessors, so that
 * LSX_LEN / BIT inside VILVL is the per-source element count of one
 * group (presumably half the elements of a 128-bit group -- confirm
 * LSX_LEN in vec.h).
 */
3308*5c23704eSSong Gao VILVL(vilvl_b, 16, B)
3309*5c23704eSSong Gao VILVL(vilvl_h, 32, H)
3310*5c23704eSSong Gao VILVL(vilvl_w, 64, W)
3311*5c23704eSSong Gao VILVL(vilvl_d, 128, D)
3312*5c23704eSSong Gao 
/*
 * VILVH - define HELPER(NAME), which interleaves the higher-indexed
 * elements of each group of Vk and Vj.
 *
 * With half = LSX_LEN / BIT and lo = 2 * half * grp for each 16-byte
 * group grp, for n in [0, half):
 *     result[lo + 2 * n]     = Vk[lo + half + n]
 *     result[lo + 2 * n + 1] = Vj[lo + half + n]
 * The result is built in a zeroed temporary and copied to *Vd at the end,
 * so vd may alias either source.
 */
#define VILVH(NAME, BIT, E)                                     \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
{                                                               \
    VReg *dst = (VReg *)vd;                                     \
    VReg *src_j = (VReg *)vj;                                   \
    VReg *src_k = (VReg *)vk;                                   \
    VReg result = {};                                           \
    const int half = LSX_LEN / BIT;                             \
    const int groups = simd_oprsz(desc) / 16;                   \
    int grp, n;                                                 \
                                                                \
    for (grp = 0; grp < groups; grp++) {                        \
        const int lo = 2 * half * grp;                          \
        for (n = 0; n < half; n++) {                            \
            result.E(lo + 2 * n + 1) = src_j->E(lo + half + n); \
            result.E(lo + 2 * n) = src_k->E(lo + half + n);     \
        }                                                       \
    }                                                           \
    *dst = result;                                              \
}
3332*5c23704eSSong Gao 
3333*5c23704eSSong Gao VILVH(vilvh_b, 16, B)
3334*5c23704eSSong Gao VILVH(vilvh_h, 32, H)
3335*5c23704eSSong Gao VILVH(vilvh_w, 64, W)
3336*5c23704eSSong Gao VILVH(vilvh_d, 128, D)
3337*5c23704eSSong Gao 
3338*5c23704eSSong Gao void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc)
3339*5c23704eSSong Gao {
3340*5c23704eSSong Gao     int i, j, m;
3341*5c23704eSSong Gao     VReg temp = {};
3342*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
3343*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
3344*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
3345*5c23704eSSong Gao     VReg *Va = (VReg *)va;
3346*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
3347*5c23704eSSong Gao 
3348*5c23704eSSong Gao     m = LSX_LEN / 8;
3349*5c23704eSSong Gao     for (i = 0; i < (oprsz / 16) * m; i++) {
3350*5c23704eSSong Gao         j = i < m ? 0 : 1;
3351*5c23704eSSong Gao         uint64_t k = (uint8_t)Va->B(i) % (2 * m);
3352*5c23704eSSong Gao         temp.B(i) = k < m ? Vk->B(k + j * m): Vj->B(k + (j - 1) * m);
3353*5c23704eSSong Gao     }
3354*5c23704eSSong Gao     *Vd = temp;
3355*5c23704eSSong Gao }
3356*5c23704eSSong Gao 
3357*5c23704eSSong Gao #define VSHUF(NAME, BIT, E)                                            \
3358*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
3359*5c23704eSSong Gao {                                                                      \
3360*5c23704eSSong Gao     int i, j, m;                                                       \
3361*5c23704eSSong Gao     VReg temp = {};                                                    \
3362*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                             \
3363*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                             \
3364*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;                                             \
3365*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                      \
3366*5c23704eSSong Gao                                                                        \
3367*5c23704eSSong Gao     m = LSX_LEN / BIT;                                                 \
3368*5c23704eSSong Gao     for (i = 0; i < (oprsz / 16) * m; i++) {                           \
3369*5c23704eSSong Gao         j = i < m ? 0 : 1;                                             \
3370*5c23704eSSong Gao         uint64_t k  = ((uint8_t)Vd->E(i)) % (2 * m);                   \
3371*5c23704eSSong Gao         temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \
3372*5c23704eSSong Gao     }                                                                  \
3373*5c23704eSSong Gao     *Vd = temp;                                                        \
3374*5c23704eSSong Gao }
3375*5c23704eSSong Gao 
3376*5c23704eSSong Gao VSHUF(vshuf_h, 16, H)
3377*5c23704eSSong Gao VSHUF(vshuf_w, 32, W)
3378*5c23704eSSong Gao VSHUF(vshuf_d, 64, D)
3379*5c23704eSSong Gao 
3380*5c23704eSSong Gao #define VSHUF4I(NAME, BIT, E)                                               \
3381*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)          \
3382*5c23704eSSong Gao {                                                                           \
3383*5c23704eSSong Gao     int i, j, max;                                                          \
3384*5c23704eSSong Gao     VReg temp = {};                                                         \
3385*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                                  \
3386*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                                  \
3387*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                           \
3388*5c23704eSSong Gao                                                                             \
3389*5c23704eSSong Gao     max = LSX_LEN / BIT;                                                    \
3390*5c23704eSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                               \
3391*5c23704eSSong Gao         j = i < max ? 1 : 2;                                                \
3392*5c23704eSSong Gao         temp.E(i) = Vj->E(SHF_POS(i - ((j -1)* max), imm) + (j - 1) * max); \
3393*5c23704eSSong Gao     }                                                                       \
3394*5c23704eSSong Gao     *Vd = temp;                                                             \
3395*5c23704eSSong Gao }
3396*5c23704eSSong Gao 
3397*5c23704eSSong Gao VSHUF4I(vshuf4i_b, 8, B)
3398*5c23704eSSong Gao VSHUF4I(vshuf4i_h, 16, H)
3399*5c23704eSSong Gao VSHUF4I(vshuf4i_w, 32, W)
3400*5c23704eSSong Gao 
HELPER(vshuf4i_d)3401*5c23704eSSong Gao void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3402*5c23704eSSong Gao {
3403*5c23704eSSong Gao     int i;
3404*5c23704eSSong Gao     VReg temp = {};
3405*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
3406*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
3407*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
3408*5c23704eSSong Gao 
3409*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
3410*5c23704eSSong Gao         temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i);
3411*5c23704eSSong Gao         temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i);
3412*5c23704eSSong Gao     }
3413*5c23704eSSong Gao     *Vd = temp;
3414*5c23704eSSong Gao }
3415*5c23704eSSong Gao 
HELPER(vperm_w)3416*5c23704eSSong Gao void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc)
3417*5c23704eSSong Gao {
3418*5c23704eSSong Gao     int i, m;
3419*5c23704eSSong Gao     VReg temp = {};
3420*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
3421*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
3422*5c23704eSSong Gao     VReg *Vk = (VReg *)vk;
3423*5c23704eSSong Gao 
3424*5c23704eSSong Gao     m = LASX_LEN / 32;
3425*5c23704eSSong Gao     for (i = 0; i < m ; i++) {
3426*5c23704eSSong Gao         uint64_t k = (uint8_t)Vk->W(i) % 8;
3427*5c23704eSSong Gao         temp.W(i) = Vj->W(k);
3428*5c23704eSSong Gao     }
3429*5c23704eSSong Gao     *Vd = temp;
3430*5c23704eSSong Gao }
3431*5c23704eSSong Gao 
HELPER(vpermi_w)3432*5c23704eSSong Gao void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3433*5c23704eSSong Gao {
3434*5c23704eSSong Gao     int i;
3435*5c23704eSSong Gao     VReg temp = {};
3436*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
3437*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
3438*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);
3439*5c23704eSSong Gao 
3440*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {
3441*5c23704eSSong Gao         temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i);
3442*5c23704eSSong Gao         temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i);
3443*5c23704eSSong Gao         temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i);
3444*5c23704eSSong Gao         temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i);
3445*5c23704eSSong Gao     }
3446*5c23704eSSong Gao     *Vd = temp;
3447*5c23704eSSong Gao }
3448*5c23704eSSong Gao 
HELPER(vpermi_d)3449*5c23704eSSong Gao void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3450*5c23704eSSong Gao {
3451*5c23704eSSong Gao     VReg temp = {};
3452*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
3453*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
3454*5c23704eSSong Gao 
3455*5c23704eSSong Gao     temp.D(0) = Vj->D(imm & 0x3);
3456*5c23704eSSong Gao     temp.D(1) = Vj->D((imm >> 2) & 0x3);
3457*5c23704eSSong Gao     temp.D(2) = Vj->D((imm >> 4) & 0x3);
3458*5c23704eSSong Gao     temp.D(3) = Vj->D((imm >> 6) & 0x3);
3459*5c23704eSSong Gao     *Vd = temp;
3460*5c23704eSSong Gao }
3461*5c23704eSSong Gao 
HELPER(vpermi_q)3462*5c23704eSSong Gao void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3463*5c23704eSSong Gao {
3464*5c23704eSSong Gao     int i;
3465*5c23704eSSong Gao     VReg temp;
3466*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;
3467*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;
3468*5c23704eSSong Gao 
3469*5c23704eSSong Gao     for (i = 0; i < 2; i++, imm >>= 4) {
3470*5c23704eSSong Gao         temp.Q(i) = (imm & 2 ? Vd: Vj)->Q(imm & 1);
3471*5c23704eSSong Gao     }
3472*5c23704eSSong Gao     *Vd = temp;
3473*5c23704eSSong Gao }
3474*5c23704eSSong Gao 
3475*5c23704eSSong Gao #define VEXTRINS(NAME, BIT, E, MASK)                               \
3476*5c23704eSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3477*5c23704eSSong Gao {                                                                  \
3478*5c23704eSSong Gao     int i, ins, extr, max;                                         \
3479*5c23704eSSong Gao     VReg *Vd = (VReg *)vd;                                         \
3480*5c23704eSSong Gao     VReg *Vj = (VReg *)vj;                                         \
3481*5c23704eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
3482*5c23704eSSong Gao                                                                    \
3483*5c23704eSSong Gao     max = LSX_LEN / BIT;                                           \
3484*5c23704eSSong Gao     ins = (imm >> 4) & MASK;                                       \
3485*5c23704eSSong Gao     extr = imm & MASK;                                             \
3486*5c23704eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                             \
3487*5c23704eSSong Gao         Vd->E(ins + i * max) = Vj->E(extr + i * max);              \
3488*5c23704eSSong Gao     }                                                              \
3489*5c23704eSSong Gao }
3490*5c23704eSSong Gao 
3491*5c23704eSSong Gao VEXTRINS(vextrins_b, 8, B, 0xf)
3492*5c23704eSSong Gao VEXTRINS(vextrins_h, 16, H, 0x7)
3493*5c23704eSSong Gao VEXTRINS(vextrins_w, 32, W, 0x3)
3494*5c23704eSSong Gao VEXTRINS(vextrins_d, 64, D, 0x1)
3495