xref: /openbmc/qemu/target/riscv/vector_helper.c (revision d9b7609a)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "cpu.h"
21 #include "exec/memop.h"
22 #include "exec/exec-all.h"
23 #include "exec/helper-proto.h"
24 #include "fpu/softfloat.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "internals.h"
27 #include <math.h>
28 
29 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
30                             target_ulong s2)
31 {
32     int vlmax, vl;
33     RISCVCPU *cpu = env_archcpu(env);
34     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
35     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
36     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
37     bool vill = FIELD_EX64(s2, VTYPE, VILL);
38     target_ulong reserved = FIELD_EX64(s2, VTYPE, RESERVED);
39 
40     if (lmul & 4) {
41         /* Fractional LMUL. */
42         if (lmul == 4 ||
43             cpu->cfg.elen >> (8 - lmul) < sew) {
44             vill = true;
45         }
46     }
47 
48     if ((sew > cpu->cfg.elen)
49         || vill
50         || (ediv != 0)
51         || (reserved != 0)) {
52         /* only set vill bit. */
53         env->vtype = FIELD_DP64(0, VTYPE, VILL, 1);
54         env->vl = 0;
55         env->vstart = 0;
56         return 0;
57     }
58 
59     vlmax = vext_get_vlmax(cpu, s2);
60     if (s1 <= vlmax) {
61         vl = s1;
62     } else {
63         vl = vlmax;
64     }
65     env->vl = vl;
66     env->vtype = s2;
67     env->vstart = 0;
68     return vl;
69 }
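/*
 * Worked example (illustrative, using the RVV formula VLMAX = LMUL * VLEN / SEW):
 * with VLEN = 128 bits, SEW = 16 and LMUL = 2, VLMAX is 16, so a requested
 * AVL (s1) of 20 is clamped to vl = 16, while an AVL of 10 yields vl = 10.
 */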
70 
71 /*
72  * Note that vector data is stored in host-endian 64-bit chunks,
73  * so addressing units smaller than that need a host-endian fixup.
74  */
75 #ifdef HOST_WORDS_BIGENDIAN
76 #define H1(x)   ((x) ^ 7)
77 #define H1_2(x) ((x) ^ 6)
78 #define H1_4(x) ((x) ^ 4)
79 #define H2(x)   ((x) ^ 3)
80 #define H4(x)   ((x) ^ 1)
81 #define H8(x)   ((x))
82 #else
83 #define H1(x)   (x)
84 #define H1_2(x) (x)
85 #define H1_4(x) (x)
86 #define H2(x)   (x)
87 #define H4(x)   (x)
88 #define H8(x)   (x)
89 #endif
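/*
 * For example, on a big-endian host, byte element 0 of a 64-bit chunk lives at
 * host byte offset 7, so H1(0) == 7, while 16-bit element 0 lives in host
 * halfword slot 3 (byte offset 6), so H2(0) == 3.  On little-endian hosts the
 * macros are the identity.
 */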
90 
91 static inline uint32_t vext_nf(uint32_t desc)
92 {
93     return FIELD_EX32(simd_data(desc), VDATA, NF);
94 }
95 
96 static inline uint32_t vext_vm(uint32_t desc)
97 {
98     return FIELD_EX32(simd_data(desc), VDATA, VM);
99 }
100 
101 /*
102  * Encode LMUL to lmul as following:
103  *     LMUL    vlmul    lmul
104  *      1       000       0
105  *      2       001       1
106  *      4       010       2
107  *      8       011       3
108  *      -       100       -
109  *     1/8      101      -3
110  *     1/4      110      -2
111  *     1/2      111      -1
112  */
113 static inline int32_t vext_lmul(uint32_t desc)
114 {
115     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
116 }
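/*
 * E.g. a vlmul field of 0b011 sign-extends to 3 (LMUL = 8), while 0b111
 * sign-extends to -1 (LMUL = 1/2), matching the table above.
 */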
117 
118 /*
119  * Get vector group length in bytes. Its range is [64, 2048].
120  *
121  * As simd_desc supports at most 256, the max vlen is 512 bits.
122  * So vlen in bytes is encoded as maxsz.
123  */
124 static inline uint32_t vext_maxsz(uint32_t desc)
125 {
126     return simd_maxsz(desc) << vext_lmul(desc);
127 }
128 
129 /*
130  * This function checks watchpoints before the real load operation.
131  *
132  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
133  * In user mode, there is no watchpoint support for now.
134  *
135  * It will trigger an exception if there is no mapping in the TLB
136  * and the page table walk can't fill the TLB entry. Then the guest
137  * software can return here after processing the exception, or never return.
138  */
139 static void probe_pages(CPURISCVState *env, target_ulong addr,
140                         target_ulong len, uintptr_t ra,
141                         MMUAccessType access_type)
142 {
143     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
144     target_ulong curlen = MIN(pagelen, len);
145 
146     probe_access(env, addr, curlen, access_type,
147                  cpu_mmu_index(env, false), ra);
148     if (len > curlen) {
149         addr += curlen;
150         curlen = len - curlen;
151         probe_access(env, addr, curlen, access_type,
152                      cpu_mmu_index(env, false), ra);
153     }
154 }
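/*
 * Example: with 4 KiB target pages, probing 16 bytes starting at 0x1ff8 issues
 * one probe for the 8 bytes up to the page boundary and a second probe for the
 * remaining 8 bytes at 0x2000, so both pages are validated up front.
 */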
155 
156 static inline void vext_set_elem_mask(void *v0, int index,
157                                       uint8_t value)
158 {
159     int idx = index / 64;
160     int pos = index % 64;
161     uint64_t old = ((uint64_t *)v0)[idx];
162     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
163 }
164 
165 /*
166  * Earlier designs (pre-0.9) had a varying number of bits
167  * per mask value (MLEN). In the 0.9 design, MLEN=1.
168  * (Section 4.5)
169  */
170 static inline int vext_elem_mask(void *v0, int index)
171 {
172     int idx = index / 64;
173     int pos = index % 64;
174     return (((uint64_t *)v0)[idx] >> pos) & 1;
175 }
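/*
 * E.g. mask element 70 is bit 6 of the second 64-bit word of v0:
 * idx = 70 / 64 = 1, pos = 70 % 64 = 6.
 */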
176 
177 /* element operations for load and store */
178 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
179                                uint32_t idx, void *vd, uintptr_t retaddr);
180 
181 #define GEN_VEXT_LD_ELEM(NAME, MTYPE, ETYPE, H, LDSUF)     \
182 static void NAME(CPURISCVState *env, abi_ptr addr,         \
183                  uint32_t idx, void *vd, uintptr_t retaddr)\
184 {                                                          \
185     MTYPE data;                                            \
186     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
187     data = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
188     *cur = data;                                           \
189 }                                                          \
190 
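/*
 * As an illustration, GEN_VEXT_LD_ELEM(ldb_h, int8_t, int16_t, H2, ldsb)
 * expands to a helper that loads a signed byte and sign-extends it into a
 * 16-bit vector element:
 *
 *     static void ldb_h(CPURISCVState *env, abi_ptr addr,
 *                       uint32_t idx, void *vd, uintptr_t retaddr)
 *     {
 *         int8_t data;
 *         int16_t *cur = ((int16_t *)vd + H2(idx));
 *         data = cpu_ldsb_data_ra(env, addr, retaddr);
 *         *cur = data;
 *     }
 */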
191 GEN_VEXT_LD_ELEM(ldb_b, int8_t,  int8_t,  H1, ldsb)
192 GEN_VEXT_LD_ELEM(ldb_h, int8_t,  int16_t, H2, ldsb)
193 GEN_VEXT_LD_ELEM(ldb_w, int8_t,  int32_t, H4, ldsb)
194 GEN_VEXT_LD_ELEM(ldb_d, int8_t,  int64_t, H8, ldsb)
195 GEN_VEXT_LD_ELEM(ldh_h, int16_t, int16_t, H2, ldsw)
196 GEN_VEXT_LD_ELEM(ldh_w, int16_t, int32_t, H4, ldsw)
197 GEN_VEXT_LD_ELEM(ldh_d, int16_t, int64_t, H8, ldsw)
198 GEN_VEXT_LD_ELEM(ldw_w, int32_t, int32_t, H4, ldl)
199 GEN_VEXT_LD_ELEM(ldw_d, int32_t, int64_t, H8, ldl)
200 GEN_VEXT_LD_ELEM(lde_b, int8_t,  int8_t,  H1, ldsb)
201 GEN_VEXT_LD_ELEM(lde_h, int16_t, int16_t, H2, ldsw)
202 GEN_VEXT_LD_ELEM(lde_w, int32_t, int32_t, H4, ldl)
203 GEN_VEXT_LD_ELEM(lde_d, int64_t, int64_t, H8, ldq)
204 GEN_VEXT_LD_ELEM(ldbu_b, uint8_t,  uint8_t,  H1, ldub)
205 GEN_VEXT_LD_ELEM(ldbu_h, uint8_t,  uint16_t, H2, ldub)
206 GEN_VEXT_LD_ELEM(ldbu_w, uint8_t,  uint32_t, H4, ldub)
207 GEN_VEXT_LD_ELEM(ldbu_d, uint8_t,  uint64_t, H8, ldub)
208 GEN_VEXT_LD_ELEM(ldhu_h, uint16_t, uint16_t, H2, lduw)
209 GEN_VEXT_LD_ELEM(ldhu_w, uint16_t, uint32_t, H4, lduw)
210 GEN_VEXT_LD_ELEM(ldhu_d, uint16_t, uint64_t, H8, lduw)
211 GEN_VEXT_LD_ELEM(ldwu_w, uint32_t, uint32_t, H4, ldl)
212 GEN_VEXT_LD_ELEM(ldwu_d, uint32_t, uint64_t, H8, ldl)
213 
214 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
215 static void NAME(CPURISCVState *env, abi_ptr addr,         \
216                  uint32_t idx, void *vd, uintptr_t retaddr)\
217 {                                                          \
218     ETYPE data = *((ETYPE *)vd + H(idx));                  \
219     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
220 }
221 
222 GEN_VEXT_ST_ELEM(stb_b, int8_t,  H1, stb)
223 GEN_VEXT_ST_ELEM(stb_h, int16_t, H2, stb)
224 GEN_VEXT_ST_ELEM(stb_w, int32_t, H4, stb)
225 GEN_VEXT_ST_ELEM(stb_d, int64_t, H8, stb)
226 GEN_VEXT_ST_ELEM(sth_h, int16_t, H2, stw)
227 GEN_VEXT_ST_ELEM(sth_w, int32_t, H4, stw)
228 GEN_VEXT_ST_ELEM(sth_d, int64_t, H8, stw)
229 GEN_VEXT_ST_ELEM(stw_w, int32_t, H4, stl)
230 GEN_VEXT_ST_ELEM(stw_d, int64_t, H8, stl)
231 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
232 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
233 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
234 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
235 
236 /*
237  *** stride: access vector element from strided memory
238  */
239 static void
240 vext_ldst_stride(void *vd, void *v0, target_ulong base,
241                  target_ulong stride, CPURISCVState *env,
242                  uint32_t desc, uint32_t vm,
243                  vext_ldst_elem_fn *ldst_elem,
244                  uint32_t esz, uint32_t msz, uintptr_t ra,
245                  MMUAccessType access_type)
246 {
247     uint32_t i, k;
248     uint32_t nf = vext_nf(desc);
249     uint32_t vlmax = vext_maxsz(desc) / esz;
250 
251     /* probe every access */
252     for (i = 0; i < env->vl; i++) {
253         if (!vm && !vext_elem_mask(v0, i)) {
254             continue;
255         }
256         probe_pages(env, base + stride * i, nf * msz, ra, access_type);
257     }
258     /* do real access */
259     for (i = 0; i < env->vl; i++) {
260         k = 0;
261         if (!vm && !vext_elem_mask(v0, i)) {
262             continue;
263         }
264         while (k < nf) {
265             target_ulong addr = base + stride * i + k * msz;
266             ldst_elem(env, addr, i + k * vlmax, vd, ra);
267             k++;
268         }
269     }
270 }
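/*
 * Data layout: for segment field k of element i, the guest address is
 * base + stride * i + k * msz and the destination register index is
 * i + k * vlmax, i.e. each field of the segment lands in its own
 * vector register group.
 */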
271 
272 #define GEN_VEXT_LD_STRIDE(NAME, MTYPE, ETYPE, LOAD_FN)                 \
273 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
274                   target_ulong stride, CPURISCVState *env,              \
275                   uint32_t desc)                                        \
276 {                                                                       \
277     uint32_t vm = vext_vm(desc);                                        \
278     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
279                      sizeof(ETYPE), sizeof(MTYPE),                      \
280                      GETPC(), MMU_DATA_LOAD);                           \
281 }
282 
283 GEN_VEXT_LD_STRIDE(vlsb_v_b,  int8_t,   int8_t,   ldb_b)
284 GEN_VEXT_LD_STRIDE(vlsb_v_h,  int8_t,   int16_t,  ldb_h)
285 GEN_VEXT_LD_STRIDE(vlsb_v_w,  int8_t,   int32_t,  ldb_w)
286 GEN_VEXT_LD_STRIDE(vlsb_v_d,  int8_t,   int64_t,  ldb_d)
287 GEN_VEXT_LD_STRIDE(vlsh_v_h,  int16_t,  int16_t,  ldh_h)
288 GEN_VEXT_LD_STRIDE(vlsh_v_w,  int16_t,  int32_t,  ldh_w)
289 GEN_VEXT_LD_STRIDE(vlsh_v_d,  int16_t,  int64_t,  ldh_d)
290 GEN_VEXT_LD_STRIDE(vlsw_v_w,  int32_t,  int32_t,  ldw_w)
291 GEN_VEXT_LD_STRIDE(vlsw_v_d,  int32_t,  int64_t,  ldw_d)
292 GEN_VEXT_LD_STRIDE(vlse_v_b,  int8_t,   int8_t,   lde_b)
293 GEN_VEXT_LD_STRIDE(vlse_v_h,  int16_t,  int16_t,  lde_h)
294 GEN_VEXT_LD_STRIDE(vlse_v_w,  int32_t,  int32_t,  lde_w)
295 GEN_VEXT_LD_STRIDE(vlse_v_d,  int64_t,  int64_t,  lde_d)
296 GEN_VEXT_LD_STRIDE(vlsbu_v_b, uint8_t,  uint8_t,  ldbu_b)
297 GEN_VEXT_LD_STRIDE(vlsbu_v_h, uint8_t,  uint16_t, ldbu_h)
298 GEN_VEXT_LD_STRIDE(vlsbu_v_w, uint8_t,  uint32_t, ldbu_w)
299 GEN_VEXT_LD_STRIDE(vlsbu_v_d, uint8_t,  uint64_t, ldbu_d)
300 GEN_VEXT_LD_STRIDE(vlshu_v_h, uint16_t, uint16_t, ldhu_h)
301 GEN_VEXT_LD_STRIDE(vlshu_v_w, uint16_t, uint32_t, ldhu_w)
302 GEN_VEXT_LD_STRIDE(vlshu_v_d, uint16_t, uint64_t, ldhu_d)
303 GEN_VEXT_LD_STRIDE(vlswu_v_w, uint32_t, uint32_t, ldwu_w)
304 GEN_VEXT_LD_STRIDE(vlswu_v_d, uint32_t, uint64_t, ldwu_d)
305 
306 #define GEN_VEXT_ST_STRIDE(NAME, MTYPE, ETYPE, STORE_FN)                \
307 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
308                   target_ulong stride, CPURISCVState *env,              \
309                   uint32_t desc)                                        \
310 {                                                                       \
311     uint32_t vm = vext_vm(desc);                                        \
312     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
313                      sizeof(ETYPE), sizeof(MTYPE),                      \
314                      GETPC(), MMU_DATA_STORE);                          \
315 }
316 
317 GEN_VEXT_ST_STRIDE(vssb_v_b, int8_t,  int8_t,  stb_b)
318 GEN_VEXT_ST_STRIDE(vssb_v_h, int8_t,  int16_t, stb_h)
319 GEN_VEXT_ST_STRIDE(vssb_v_w, int8_t,  int32_t, stb_w)
320 GEN_VEXT_ST_STRIDE(vssb_v_d, int8_t,  int64_t, stb_d)
321 GEN_VEXT_ST_STRIDE(vssh_v_h, int16_t, int16_t, sth_h)
322 GEN_VEXT_ST_STRIDE(vssh_v_w, int16_t, int32_t, sth_w)
323 GEN_VEXT_ST_STRIDE(vssh_v_d, int16_t, int64_t, sth_d)
324 GEN_VEXT_ST_STRIDE(vssw_v_w, int32_t, int32_t, stw_w)
325 GEN_VEXT_ST_STRIDE(vssw_v_d, int32_t, int64_t, stw_d)
326 GEN_VEXT_ST_STRIDE(vsse_v_b, int8_t,  int8_t,  ste_b)
327 GEN_VEXT_ST_STRIDE(vsse_v_h, int16_t, int16_t, ste_h)
328 GEN_VEXT_ST_STRIDE(vsse_v_w, int32_t, int32_t, ste_w)
329 GEN_VEXT_ST_STRIDE(vsse_v_d, int64_t, int64_t, ste_d)
330 
331 /*
332  *** unit-stride: access elements stored contiguously in memory
333  */
334 
335 /* unmasked unit-stride load and store operation */
336 static void
337 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
338              vext_ldst_elem_fn *ldst_elem, uint32_t esz, uint32_t msz,
339              uintptr_t ra, MMUAccessType access_type)
340 {
341     uint32_t i, k;
342     uint32_t nf = vext_nf(desc);
343     uint32_t vlmax = vext_maxsz(desc) / esz;
344 
345     /* probe every access */
346     probe_pages(env, base, env->vl * nf * msz, ra, access_type);
347     /* load or store bytes from/to guest memory */
348     for (i = 0; i < env->vl; i++) {
349         k = 0;
350         while (k < nf) {
351             target_ulong addr = base + (i * nf + k) * msz;
352             ldst_elem(env, addr, i + k * vlmax, vd, ra);
353             k++;
354         }
355     }
356 }
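/*
 * Here the segments are contiguous, so field k of element i is accessed at
 * base + (i * nf + k) * msz; the register layout is the same as for the
 * strided case above.
 */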
357 
358 /*
359  * A masked unit-stride load or store operation is handled as a special case
360  * of the strided operation, with stride = NF * sizeof(MTYPE).
361  */
362 
363 #define GEN_VEXT_LD_US(NAME, MTYPE, ETYPE, LOAD_FN)                     \
364 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
365                          CPURISCVState *env, uint32_t desc)             \
366 {                                                                       \
367     uint32_t stride = vext_nf(desc) * sizeof(MTYPE);                    \
368     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
369                      sizeof(ETYPE), sizeof(MTYPE),                      \
370                      GETPC(), MMU_DATA_LOAD);                           \
371 }                                                                       \
372                                                                         \
373 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
374                   CPURISCVState *env, uint32_t desc)                    \
375 {                                                                       \
376     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
377                  sizeof(ETYPE), sizeof(MTYPE), GETPC(), MMU_DATA_LOAD); \
378 }
379 
380 GEN_VEXT_LD_US(vlb_v_b,  int8_t,   int8_t,   ldb_b)
381 GEN_VEXT_LD_US(vlb_v_h,  int8_t,   int16_t,  ldb_h)
382 GEN_VEXT_LD_US(vlb_v_w,  int8_t,   int32_t,  ldb_w)
383 GEN_VEXT_LD_US(vlb_v_d,  int8_t,   int64_t,  ldb_d)
384 GEN_VEXT_LD_US(vlh_v_h,  int16_t,  int16_t,  ldh_h)
385 GEN_VEXT_LD_US(vlh_v_w,  int16_t,  int32_t,  ldh_w)
386 GEN_VEXT_LD_US(vlh_v_d,  int16_t,  int64_t,  ldh_d)
387 GEN_VEXT_LD_US(vlw_v_w,  int32_t,  int32_t,  ldw_w)
388 GEN_VEXT_LD_US(vlw_v_d,  int32_t,  int64_t,  ldw_d)
389 GEN_VEXT_LD_US(vle_v_b,  int8_t,   int8_t,   lde_b)
390 GEN_VEXT_LD_US(vle_v_h,  int16_t,  int16_t,  lde_h)
391 GEN_VEXT_LD_US(vle_v_w,  int32_t,  int32_t,  lde_w)
392 GEN_VEXT_LD_US(vle_v_d,  int64_t,  int64_t,  lde_d)
393 GEN_VEXT_LD_US(vlbu_v_b, uint8_t,  uint8_t,  ldbu_b)
394 GEN_VEXT_LD_US(vlbu_v_h, uint8_t,  uint16_t, ldbu_h)
395 GEN_VEXT_LD_US(vlbu_v_w, uint8_t,  uint32_t, ldbu_w)
396 GEN_VEXT_LD_US(vlbu_v_d, uint8_t,  uint64_t, ldbu_d)
397 GEN_VEXT_LD_US(vlhu_v_h, uint16_t, uint16_t, ldhu_h)
398 GEN_VEXT_LD_US(vlhu_v_w, uint16_t, uint32_t, ldhu_w)
399 GEN_VEXT_LD_US(vlhu_v_d, uint16_t, uint64_t, ldhu_d)
400 GEN_VEXT_LD_US(vlwu_v_w, uint32_t, uint32_t, ldwu_w)
401 GEN_VEXT_LD_US(vlwu_v_d, uint32_t, uint64_t, ldwu_d)
402 
403 #define GEN_VEXT_ST_US(NAME, MTYPE, ETYPE, STORE_FN)                    \
404 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
405                          CPURISCVState *env, uint32_t desc)             \
406 {                                                                       \
407     uint32_t stride = vext_nf(desc) * sizeof(MTYPE);                    \
408     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,  \
409                      sizeof(ETYPE), sizeof(MTYPE),                      \
410                      GETPC(), MMU_DATA_STORE);                          \
411 }                                                                       \
412                                                                         \
413 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
414                   CPURISCVState *env, uint32_t desc)                    \
415 {                                                                       \
416     vext_ldst_us(vd, base, env, desc, STORE_FN,                         \
417                  sizeof(ETYPE), sizeof(MTYPE), GETPC(), MMU_DATA_STORE);\
418 }
419 
420 GEN_VEXT_ST_US(vsb_v_b, int8_t,  int8_t,  stb_b)
421 GEN_VEXT_ST_US(vsb_v_h, int8_t,  int16_t, stb_h)
422 GEN_VEXT_ST_US(vsb_v_w, int8_t,  int32_t, stb_w)
423 GEN_VEXT_ST_US(vsb_v_d, int8_t,  int64_t, stb_d)
424 GEN_VEXT_ST_US(vsh_v_h, int16_t, int16_t, sth_h)
425 GEN_VEXT_ST_US(vsh_v_w, int16_t, int32_t, sth_w)
426 GEN_VEXT_ST_US(vsh_v_d, int16_t, int64_t, sth_d)
427 GEN_VEXT_ST_US(vsw_v_w, int32_t, int32_t, stw_w)
428 GEN_VEXT_ST_US(vsw_v_d, int32_t, int64_t, stw_d)
429 GEN_VEXT_ST_US(vse_v_b, int8_t,  int8_t,  ste_b)
430 GEN_VEXT_ST_US(vse_v_h, int16_t, int16_t, ste_h)
431 GEN_VEXT_ST_US(vse_v_w, int32_t, int32_t, ste_w)
432 GEN_VEXT_ST_US(vse_v_d, int64_t, int64_t, ste_d)
433 
434 /*
435  *** index: access vector element from indexed memory
436  */
437 typedef target_ulong vext_get_index_addr(target_ulong base,
438         uint32_t idx, void *vs2);
439 
440 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
441 static target_ulong NAME(target_ulong base,            \
442                          uint32_t idx, void *vs2)      \
443 {                                                      \
444     return (base + *((ETYPE *)vs2 + H(idx)));          \
445 }
446 
447 GEN_VEXT_GET_INDEX_ADDR(idx_b, int8_t,  H1)
448 GEN_VEXT_GET_INDEX_ADDR(idx_h, int16_t, H2)
449 GEN_VEXT_GET_INDEX_ADDR(idx_w, int32_t, H4)
450 GEN_VEXT_GET_INDEX_ADDR(idx_d, int64_t, H8)
451 
452 static inline void
453 vext_ldst_index(void *vd, void *v0, target_ulong base,
454                 void *vs2, CPURISCVState *env, uint32_t desc,
455                 vext_get_index_addr get_index_addr,
456                 vext_ldst_elem_fn *ldst_elem,
457                 uint32_t esz, uint32_t msz, uintptr_t ra,
458                 MMUAccessType access_type)
459 {
460     uint32_t i, k;
461     uint32_t nf = vext_nf(desc);
462     uint32_t vm = vext_vm(desc);
463     uint32_t vlmax = vext_maxsz(desc) / esz;
464 
465     /* probe every access */
466     for (i = 0; i < env->vl; i++) {
467         if (!vm && !vext_elem_mask(v0, i)) {
468             continue;
469         }
470         probe_pages(env, get_index_addr(base, i, vs2), nf * msz, ra,
471                     access_type);
472     }
473     /* load or store bytes from/to guest memory */
474     for (i = 0; i < env->vl; i++) {
475         k = 0;
476         if (!vm && !vext_elem_mask(v0, i)) {
477             continue;
478         }
479         while (k < nf) {
480             abi_ptr addr = get_index_addr(base, i, vs2) + k * msz;
481             ldst_elem(env, addr, i + k * vlmax, vd, ra);
482             k++;
483         }
484     }
485 }
486 
487 #define GEN_VEXT_LD_INDEX(NAME, MTYPE, ETYPE, INDEX_FN, LOAD_FN)           \
488 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
489                   void *vs2, CPURISCVState *env, uint32_t desc)            \
490 {                                                                          \
491     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
492                     LOAD_FN, sizeof(ETYPE), sizeof(MTYPE),                 \
493                     GETPC(), MMU_DATA_LOAD);                               \
494 }
495 
496 GEN_VEXT_LD_INDEX(vlxb_v_b,  int8_t,   int8_t,   idx_b, ldb_b)
497 GEN_VEXT_LD_INDEX(vlxb_v_h,  int8_t,   int16_t,  idx_h, ldb_h)
498 GEN_VEXT_LD_INDEX(vlxb_v_w,  int8_t,   int32_t,  idx_w, ldb_w)
499 GEN_VEXT_LD_INDEX(vlxb_v_d,  int8_t,   int64_t,  idx_d, ldb_d)
500 GEN_VEXT_LD_INDEX(vlxh_v_h,  int16_t,  int16_t,  idx_h, ldh_h)
501 GEN_VEXT_LD_INDEX(vlxh_v_w,  int16_t,  int32_t,  idx_w, ldh_w)
502 GEN_VEXT_LD_INDEX(vlxh_v_d,  int16_t,  int64_t,  idx_d, ldh_d)
503 GEN_VEXT_LD_INDEX(vlxw_v_w,  int32_t,  int32_t,  idx_w, ldw_w)
504 GEN_VEXT_LD_INDEX(vlxw_v_d,  int32_t,  int64_t,  idx_d, ldw_d)
505 GEN_VEXT_LD_INDEX(vlxe_v_b,  int8_t,   int8_t,   idx_b, lde_b)
506 GEN_VEXT_LD_INDEX(vlxe_v_h,  int16_t,  int16_t,  idx_h, lde_h)
507 GEN_VEXT_LD_INDEX(vlxe_v_w,  int32_t,  int32_t,  idx_w, lde_w)
508 GEN_VEXT_LD_INDEX(vlxe_v_d,  int64_t,  int64_t,  idx_d, lde_d)
509 GEN_VEXT_LD_INDEX(vlxbu_v_b, uint8_t,  uint8_t,  idx_b, ldbu_b)
510 GEN_VEXT_LD_INDEX(vlxbu_v_h, uint8_t,  uint16_t, idx_h, ldbu_h)
511 GEN_VEXT_LD_INDEX(vlxbu_v_w, uint8_t,  uint32_t, idx_w, ldbu_w)
512 GEN_VEXT_LD_INDEX(vlxbu_v_d, uint8_t,  uint64_t, idx_d, ldbu_d)
513 GEN_VEXT_LD_INDEX(vlxhu_v_h, uint16_t, uint16_t, idx_h, ldhu_h)
514 GEN_VEXT_LD_INDEX(vlxhu_v_w, uint16_t, uint32_t, idx_w, ldhu_w)
515 GEN_VEXT_LD_INDEX(vlxhu_v_d, uint16_t, uint64_t, idx_d, ldhu_d)
516 GEN_VEXT_LD_INDEX(vlxwu_v_w, uint32_t, uint32_t, idx_w, ldwu_w)
517 GEN_VEXT_LD_INDEX(vlxwu_v_d, uint32_t, uint64_t, idx_d, ldwu_d)
518 
519 #define GEN_VEXT_ST_INDEX(NAME, MTYPE, ETYPE, INDEX_FN, STORE_FN)\
520 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
521                   void *vs2, CPURISCVState *env, uint32_t desc)  \
522 {                                                                \
523     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
524                     STORE_FN, sizeof(ETYPE), sizeof(MTYPE),      \
525                     GETPC(), MMU_DATA_STORE);                    \
526 }
527 
528 GEN_VEXT_ST_INDEX(vsxb_v_b, int8_t,  int8_t,  idx_b, stb_b)
529 GEN_VEXT_ST_INDEX(vsxb_v_h, int8_t,  int16_t, idx_h, stb_h)
530 GEN_VEXT_ST_INDEX(vsxb_v_w, int8_t,  int32_t, idx_w, stb_w)
531 GEN_VEXT_ST_INDEX(vsxb_v_d, int8_t,  int64_t, idx_d, stb_d)
532 GEN_VEXT_ST_INDEX(vsxh_v_h, int16_t, int16_t, idx_h, sth_h)
533 GEN_VEXT_ST_INDEX(vsxh_v_w, int16_t, int32_t, idx_w, sth_w)
534 GEN_VEXT_ST_INDEX(vsxh_v_d, int16_t, int64_t, idx_d, sth_d)
535 GEN_VEXT_ST_INDEX(vsxw_v_w, int32_t, int32_t, idx_w, stw_w)
536 GEN_VEXT_ST_INDEX(vsxw_v_d, int32_t, int64_t, idx_d, stw_d)
537 GEN_VEXT_ST_INDEX(vsxe_v_b, int8_t,  int8_t,  idx_b, ste_b)
538 GEN_VEXT_ST_INDEX(vsxe_v_h, int16_t, int16_t, idx_h, ste_h)
539 GEN_VEXT_ST_INDEX(vsxe_v_w, int32_t, int32_t, idx_w, ste_w)
540 GEN_VEXT_ST_INDEX(vsxe_v_d, int64_t, int64_t, idx_d, ste_d)
541 
542 /*
543  *** unit-stride fault-only-first load instructions
544  */
545 static inline void
546 vext_ldff(void *vd, void *v0, target_ulong base,
547           CPURISCVState *env, uint32_t desc,
548           vext_ldst_elem_fn *ldst_elem,
549           uint32_t esz, uint32_t msz, uintptr_t ra)
550 {
551     void *host;
552     uint32_t i, k, vl = 0;
553     uint32_t nf = vext_nf(desc);
554     uint32_t vm = vext_vm(desc);
555     uint32_t vlmax = vext_maxsz(desc) / esz;
556     target_ulong addr, offset, remain;
557 
558     /* probe every access */
559     for (i = 0; i < env->vl; i++) {
560         if (!vm && !vext_elem_mask(v0, i)) {
561             continue;
562         }
563         addr = base + nf * i * msz;
564         if (i == 0) {
565             probe_pages(env, addr, nf * msz, ra, MMU_DATA_LOAD);
566         } else {
567             /* if it triggers an exception, no need to check watchpoint */
568             remain = nf * msz;
569             while (remain > 0) {
570                 offset = -(addr | TARGET_PAGE_MASK);
571                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
572                                          cpu_mmu_index(env, false));
573                 if (host) {
574 #ifdef CONFIG_USER_ONLY
575                     if (page_check_range(addr, nf * msz, PAGE_READ) < 0) {
576                         vl = i;
577                         goto ProbeSuccess;
578                     }
579 #else
580                     probe_pages(env, addr, nf * msz, ra, MMU_DATA_LOAD);
581 #endif
582                 } else {
583                     vl = i;
584                     goto ProbeSuccess;
585                 }
586                 if (remain <= offset) {
587                     break;
588                 }
589                 remain -= offset;
590                 addr += offset;
591             }
592         }
593     }
594 ProbeSuccess:
595     /* load bytes from guest memory */
596     if (vl != 0) {
597         env->vl = vl;
598     }
599     for (i = 0; i < env->vl; i++) {
600         k = 0;
601         if (!vm && !vext_elem_mask(v0, i)) {
602             continue;
603         }
604         while (k < nf) {
605             target_ulong addr = base + (i * nf + k) * msz;
606             ldst_elem(env, addr, i + k * vlmax, vd, ra);
607             k++;
608         }
609     }
610 }
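/*
 * Fault-only-first semantics as implemented above: element 0 is probed with
 * probe_pages and may trap; for any later element whose page cannot be
 * accessed, vl is shrunk to that element index instead of raising an
 * exception, and only the first vl elements are then loaded.
 */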
611 
612 #define GEN_VEXT_LDFF(NAME, MTYPE, ETYPE, LOAD_FN)               \
613 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
614                   CPURISCVState *env, uint32_t desc)             \
615 {                                                                \
616     vext_ldff(vd, v0, base, env, desc, LOAD_FN,                  \
617               sizeof(ETYPE), sizeof(MTYPE), GETPC());            \
618 }
619 
620 GEN_VEXT_LDFF(vlbff_v_b,  int8_t,   int8_t,   ldb_b)
621 GEN_VEXT_LDFF(vlbff_v_h,  int8_t,   int16_t,  ldb_h)
622 GEN_VEXT_LDFF(vlbff_v_w,  int8_t,   int32_t,  ldb_w)
623 GEN_VEXT_LDFF(vlbff_v_d,  int8_t,   int64_t,  ldb_d)
624 GEN_VEXT_LDFF(vlhff_v_h,  int16_t,  int16_t,  ldh_h)
625 GEN_VEXT_LDFF(vlhff_v_w,  int16_t,  int32_t,  ldh_w)
626 GEN_VEXT_LDFF(vlhff_v_d,  int16_t,  int64_t,  ldh_d)
627 GEN_VEXT_LDFF(vlwff_v_w,  int32_t,  int32_t,  ldw_w)
628 GEN_VEXT_LDFF(vlwff_v_d,  int32_t,  int64_t,  ldw_d)
629 GEN_VEXT_LDFF(vleff_v_b,  int8_t,   int8_t,   lde_b)
630 GEN_VEXT_LDFF(vleff_v_h,  int16_t,  int16_t,  lde_h)
631 GEN_VEXT_LDFF(vleff_v_w,  int32_t,  int32_t,  lde_w)
632 GEN_VEXT_LDFF(vleff_v_d,  int64_t,  int64_t,  lde_d)
633 GEN_VEXT_LDFF(vlbuff_v_b, uint8_t,  uint8_t,  ldbu_b)
634 GEN_VEXT_LDFF(vlbuff_v_h, uint8_t,  uint16_t, ldbu_h)
635 GEN_VEXT_LDFF(vlbuff_v_w, uint8_t,  uint32_t, ldbu_w)
636 GEN_VEXT_LDFF(vlbuff_v_d, uint8_t,  uint64_t, ldbu_d)
637 GEN_VEXT_LDFF(vlhuff_v_h, uint16_t, uint16_t, ldhu_h)
638 GEN_VEXT_LDFF(vlhuff_v_w, uint16_t, uint32_t, ldhu_w)
639 GEN_VEXT_LDFF(vlhuff_v_d, uint16_t, uint64_t, ldhu_d)
640 GEN_VEXT_LDFF(vlwuff_v_w, uint32_t, uint32_t, ldwu_w)
641 GEN_VEXT_LDFF(vlwuff_v_d, uint32_t, uint64_t, ldwu_d)
642 
643 #define DO_SWAP(N, M) (M)
644 #define DO_AND(N, M)  (N & M)
645 #define DO_XOR(N, M)  (N ^ M)
646 #define DO_OR(N, M)   (N | M)
647 #define DO_ADD(N, M)  (N + M)
648 
649 /* Signed min/max */
650 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
651 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
652 
653 /* Unsigned min/max */
654 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
655 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
656 
657 /*
658  *** Vector Integer Arithmetic Instructions
659  */
660 
661 /* expand macro args before macro */
662 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
663 
664 /* (TD, T1, T2, TX1, TX2) */
665 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
666 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
667 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
668 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
669 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
670 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
671 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
672 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
673 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
674 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
675 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
676 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
677 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
678 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
679 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
680 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
681 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
682 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
683 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
684 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
685 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
686 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
687 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
688 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
689 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
690 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
691 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
692 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
693 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
694 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
695 
696 /* operation of two vector elements */
697 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
698 
699 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
700 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
701 {                                                               \
702     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
703     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
704     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
705 }
706 #define DO_SUB(N, M) (N - M)
707 #define DO_RSUB(N, M) (M - N)
708 
709 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
710 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
711 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
712 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
713 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
714 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
715 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
716 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
717 
718 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
719                        CPURISCVState *env, uint32_t desc,
720                        uint32_t esz, uint32_t dsz,
721                        opivv2_fn *fn)
722 {
723     uint32_t vm = vext_vm(desc);
724     uint32_t vl = env->vl;
725     uint32_t i;
726 
727     for (i = 0; i < vl; i++) {
728         if (!vm && !vext_elem_mask(v0, i)) {
729             continue;
730         }
731         fn(vd, vs1, vs2, i);
732     }
733 }
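/*
 * Masked-off elements are simply skipped, so the previous contents of the
 * corresponding destination elements are left undisturbed.
 */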
734 
735 /* generate the helpers for OPIVV */
736 #define GEN_VEXT_VV(NAME, ESZ, DSZ)                       \
737 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
738                   void *vs2, CPURISCVState *env,          \
739                   uint32_t desc)                          \
740 {                                                         \
741     do_vext_vv(vd, v0, vs1, vs2, env, desc, ESZ, DSZ,     \
742                do_##NAME);                                \
743 }
744 
745 GEN_VEXT_VV(vadd_vv_b, 1, 1)
746 GEN_VEXT_VV(vadd_vv_h, 2, 2)
747 GEN_VEXT_VV(vadd_vv_w, 4, 4)
748 GEN_VEXT_VV(vadd_vv_d, 8, 8)
749 GEN_VEXT_VV(vsub_vv_b, 1, 1)
750 GEN_VEXT_VV(vsub_vv_h, 2, 2)
751 GEN_VEXT_VV(vsub_vv_w, 4, 4)
752 GEN_VEXT_VV(vsub_vv_d, 8, 8)
753 
754 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
755 
756 /*
757  * (T1)s1 gives the real operand type.
758  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
759  */
760 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
761 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
762 {                                                                   \
763     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
764     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
765 }
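/*
 * For instance, with OPIVX2(vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD) we get
 * T1 = int8_t and TX1 = int16_t, so (TX1)(T1)s1 first truncates the scalar
 * to a signed byte and then sign-extends it to the 16-bit operand width.
 */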
766 
767 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
768 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
769 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
770 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
771 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
772 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
773 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
774 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
775 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
776 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
777 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
778 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
779 
780 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
781                        CPURISCVState *env, uint32_t desc,
782                        uint32_t esz, uint32_t dsz,
783                        opivx2_fn fn)
784 {
785     uint32_t vm = vext_vm(desc);
786     uint32_t vl = env->vl;
787     uint32_t i;
788 
789     for (i = 0; i < vl; i++) {
790         if (!vm && !vext_elem_mask(v0, i)) {
791             continue;
792         }
793         fn(vd, s1, vs2, i);
794     }
795 }
796 
797 /* generate the helpers for OPIVX */
798 #define GEN_VEXT_VX(NAME, ESZ, DSZ)                       \
799 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
800                   void *vs2, CPURISCVState *env,          \
801                   uint32_t desc)                          \
802 {                                                         \
803     do_vext_vx(vd, v0, s1, vs2, env, desc, ESZ, DSZ,      \
804                do_##NAME);                                \
805 }
806 
807 GEN_VEXT_VX(vadd_vx_b, 1, 1)
808 GEN_VEXT_VX(vadd_vx_h, 2, 2)
809 GEN_VEXT_VX(vadd_vx_w, 4, 4)
810 GEN_VEXT_VX(vadd_vx_d, 8, 8)
811 GEN_VEXT_VX(vsub_vx_b, 1, 1)
812 GEN_VEXT_VX(vsub_vx_h, 2, 2)
813 GEN_VEXT_VX(vsub_vx_w, 4, 4)
814 GEN_VEXT_VX(vsub_vx_d, 8, 8)
815 GEN_VEXT_VX(vrsub_vx_b, 1, 1)
816 GEN_VEXT_VX(vrsub_vx_h, 2, 2)
817 GEN_VEXT_VX(vrsub_vx_w, 4, 4)
818 GEN_VEXT_VX(vrsub_vx_d, 8, 8)
819 
820 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
821 {
822     intptr_t oprsz = simd_oprsz(desc);
823     intptr_t i;
824 
825     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
826         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
827     }
828 }
829 
830 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
831 {
832     intptr_t oprsz = simd_oprsz(desc);
833     intptr_t i;
834 
835     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
836         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
837     }
838 }
839 
840 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
841 {
842     intptr_t oprsz = simd_oprsz(desc);
843     intptr_t i;
844 
845     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
846         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
847     }
848 }
849 
850 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
851 {
852     intptr_t oprsz = simd_oprsz(desc);
853     intptr_t i;
854 
855     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
856         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
857     }
858 }
859 
860 /* Vector Widening Integer Add/Subtract */
861 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
862 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
863 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
864 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
865 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
866 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
867 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
868 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
869 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
870 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
871 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
872 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
873 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
874 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
875 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
876 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
877 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
878 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
879 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
880 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
881 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
882 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
883 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
884 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
885 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
886 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
887 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
888 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
889 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
890 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
891 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
892 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
893 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
894 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
895 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
896 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
897 GEN_VEXT_VV(vwaddu_vv_b, 1, 2)
898 GEN_VEXT_VV(vwaddu_vv_h, 2, 4)
899 GEN_VEXT_VV(vwaddu_vv_w, 4, 8)
900 GEN_VEXT_VV(vwsubu_vv_b, 1, 2)
901 GEN_VEXT_VV(vwsubu_vv_h, 2, 4)
902 GEN_VEXT_VV(vwsubu_vv_w, 4, 8)
903 GEN_VEXT_VV(vwadd_vv_b, 1, 2)
904 GEN_VEXT_VV(vwadd_vv_h, 2, 4)
905 GEN_VEXT_VV(vwadd_vv_w, 4, 8)
906 GEN_VEXT_VV(vwsub_vv_b, 1, 2)
907 GEN_VEXT_VV(vwsub_vv_h, 2, 4)
908 GEN_VEXT_VV(vwsub_vv_w, 4, 8)
909 GEN_VEXT_VV(vwaddu_wv_b, 1, 2)
910 GEN_VEXT_VV(vwaddu_wv_h, 2, 4)
911 GEN_VEXT_VV(vwaddu_wv_w, 4, 8)
912 GEN_VEXT_VV(vwsubu_wv_b, 1, 2)
913 GEN_VEXT_VV(vwsubu_wv_h, 2, 4)
914 GEN_VEXT_VV(vwsubu_wv_w, 4, 8)
915 GEN_VEXT_VV(vwadd_wv_b, 1, 2)
916 GEN_VEXT_VV(vwadd_wv_h, 2, 4)
917 GEN_VEXT_VV(vwadd_wv_w, 4, 8)
918 GEN_VEXT_VV(vwsub_wv_b, 1, 2)
919 GEN_VEXT_VV(vwsub_wv_h, 2, 4)
920 GEN_VEXT_VV(vwsub_wv_w, 4, 8)
921 
922 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
923 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
924 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
925 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
926 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
927 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
928 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
929 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
930 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
931 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
932 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
933 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
934 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
935 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
936 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
937 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
938 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
939 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
940 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
941 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
942 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
943 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
944 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
945 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
946 GEN_VEXT_VX(vwaddu_vx_b, 1, 2)
947 GEN_VEXT_VX(vwaddu_vx_h, 2, 4)
948 GEN_VEXT_VX(vwaddu_vx_w, 4, 8)
949 GEN_VEXT_VX(vwsubu_vx_b, 1, 2)
950 GEN_VEXT_VX(vwsubu_vx_h, 2, 4)
951 GEN_VEXT_VX(vwsubu_vx_w, 4, 8)
952 GEN_VEXT_VX(vwadd_vx_b, 1, 2)
953 GEN_VEXT_VX(vwadd_vx_h, 2, 4)
954 GEN_VEXT_VX(vwadd_vx_w, 4, 8)
955 GEN_VEXT_VX(vwsub_vx_b, 1, 2)
956 GEN_VEXT_VX(vwsub_vx_h, 2, 4)
957 GEN_VEXT_VX(vwsub_vx_w, 4, 8)
958 GEN_VEXT_VX(vwaddu_wx_b, 1, 2)
959 GEN_VEXT_VX(vwaddu_wx_h, 2, 4)
960 GEN_VEXT_VX(vwaddu_wx_w, 4, 8)
961 GEN_VEXT_VX(vwsubu_wx_b, 1, 2)
962 GEN_VEXT_VX(vwsubu_wx_h, 2, 4)
963 GEN_VEXT_VX(vwsubu_wx_w, 4, 8)
964 GEN_VEXT_VX(vwadd_wx_b, 1, 2)
965 GEN_VEXT_VX(vwadd_wx_h, 2, 4)
966 GEN_VEXT_VX(vwadd_wx_w, 4, 8)
967 GEN_VEXT_VX(vwsub_wx_b, 1, 2)
968 GEN_VEXT_VX(vwsub_wx_h, 2, 4)
969 GEN_VEXT_VX(vwsub_wx_w, 4, 8)
970 
971 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
972 #define DO_VADC(N, M, C) (N + M + C)
973 #define DO_VSBC(N, M, C) (N - M - C)
974 
975 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
976 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
977                   CPURISCVState *env, uint32_t desc)          \
978 {                                                             \
979     uint32_t vl = env->vl;                                    \
980     uint32_t i;                                               \
981                                                               \
982     for (i = 0; i < vl; i++) {                                \
983         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
984         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
985         uint8_t carry = vext_elem_mask(v0, i);                \
986                                                               \
987         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
988     }                                                         \
989 }
990 
991 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
992 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
993 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
994 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
995 
996 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
997 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
998 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
999 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1000 
1001 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1002 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1003                   CPURISCVState *env, uint32_t desc)                     \
1004 {                                                                        \
1005     uint32_t vl = env->vl;                                               \
1006     uint32_t i;                                                          \
1007                                                                          \
1008     for (i = 0; i < vl; i++) {                                           \
1009         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1010         uint8_t carry = vext_elem_mask(v0, i);                           \
1011                                                                          \
1012         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1013     }                                                                    \
1014 }
1015 
1016 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1017 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1018 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1019 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1020 
1021 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1022 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1023 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1024 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1025 
1026 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1027                           (__typeof(N))(N + M) < N)
1028 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
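/*
 * Carry/borrow detection relies on unsigned wrap-around: an unsigned sum
 * N + M overflows exactly when the truncated result is smaller than N
 * (e.g. for uint8_t, 200 + 100 wraps to 44 < 200), and with a carry-in the
 * test becomes N + M + 1 <= N.  Likewise a borrow occurs when N < M
 * (or N <= M with a borrow-in).
 */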
1029 
1030 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1031 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1032                   CPURISCVState *env, uint32_t desc)          \
1033 {                                                             \
1034     uint32_t vl = env->vl;                                    \
1035     uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);        \
1036     uint32_t i;                                               \
1037                                                               \
1038     for (i = 0; i < vl; i++) {                                \
1039         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1040         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1041         uint8_t carry = vext_elem_mask(v0, i);                \
1042                                                               \
1043         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1044     }                                                         \
1045     for (; i < vlmax; i++) {                                  \
1046         vext_set_elem_mask(vd, i, 0);                         \
1047     }                                                         \
1048 }
1049 
1050 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1051 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1052 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1053 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1054 
1055 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1056 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1057 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1058 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1059 
1060 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1061 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1062                   void *vs2, CPURISCVState *env, uint32_t desc) \
1063 {                                                               \
1064     uint32_t vl = env->vl;                                      \
1065     uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);          \
1066     uint32_t i;                                                 \
1067                                                                 \
1068     for (i = 0; i < vl; i++) {                                  \
1069         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1070         uint8_t carry = vext_elem_mask(v0, i);                  \
1071                                                                 \
1072         vext_set_elem_mask(vd, i,                               \
1073                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1074     }                                                           \
1075     for (; i < vlmax; i++) {                                    \
1076         vext_set_elem_mask(vd, i, 0);                           \
1077     }                                                           \
1078 }
1079 
1080 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1081 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1082 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1083 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1084 
1085 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1086 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1087 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1088 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1089 
1090 /* Vector Bitwise Logical Instructions */
1091 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1092 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1093 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1094 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1095 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1096 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1097 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1098 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1099 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1100 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1101 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1102 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1103 GEN_VEXT_VV(vand_vv_b, 1, 1)
1104 GEN_VEXT_VV(vand_vv_h, 2, 2)
1105 GEN_VEXT_VV(vand_vv_w, 4, 4)
1106 GEN_VEXT_VV(vand_vv_d, 8, 8)
1107 GEN_VEXT_VV(vor_vv_b, 1, 1)
1108 GEN_VEXT_VV(vor_vv_h, 2, 2)
1109 GEN_VEXT_VV(vor_vv_w, 4, 4)
1110 GEN_VEXT_VV(vor_vv_d, 8, 8)
1111 GEN_VEXT_VV(vxor_vv_b, 1, 1)
1112 GEN_VEXT_VV(vxor_vv_h, 2, 2)
1113 GEN_VEXT_VV(vxor_vv_w, 4, 4)
1114 GEN_VEXT_VV(vxor_vv_d, 8, 8)
1115 
1116 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1117 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1118 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1119 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1120 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1121 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1122 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1123 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1124 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1125 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1126 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1127 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1128 GEN_VEXT_VX(vand_vx_b, 1, 1)
1129 GEN_VEXT_VX(vand_vx_h, 2, 2)
1130 GEN_VEXT_VX(vand_vx_w, 4, 4)
1131 GEN_VEXT_VX(vand_vx_d, 8, 8)
1132 GEN_VEXT_VX(vor_vx_b, 1, 1)
1133 GEN_VEXT_VX(vor_vx_h, 2, 2)
1134 GEN_VEXT_VX(vor_vx_w, 4, 4)
1135 GEN_VEXT_VX(vor_vx_d, 8, 8)
1136 GEN_VEXT_VX(vxor_vx_b, 1, 1)
1137 GEN_VEXT_VX(vxor_vx_h, 2, 2)
1138 GEN_VEXT_VX(vxor_vx_w, 4, 4)
1139 GEN_VEXT_VX(vxor_vx_d, 8, 8)
1140 
1141 /* Vector Single-Width Bit Shift Instructions */
1142 #define DO_SLL(N, M)  (N << (M))
1143 #define DO_SRL(N, M)  (N >> (M))
1144 
1145 /* generate the helpers for shift instructions with two vector operands */
1146 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1147 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1148                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1149 {                                                                         \
1150     uint32_t vm = vext_vm(desc);                                          \
1151     uint32_t vl = env->vl;                                                \
1152     uint32_t i;                                                           \
1153                                                                           \
1154     for (i = 0; i < vl; i++) {                                            \
1155         if (!vm && !vext_elem_mask(v0, i)) {                              \
1156             continue;                                                     \
1157         }                                                                 \
1158         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1159         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1160         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1161     }                                                                     \
1162 }
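/*
 * The shift flavour is chosen by the signedness of TS2: an unsigned TS2
 * makes DO_SRL a logical shift (vsrl/vnsrl), while a signed TS2 yields an
 * arithmetic shift (vsra/vnsra).  MASK truncates the shift amount to
 * log2(SEW) bits.
 */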
1163 
1164 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1165 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1166 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1167 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1168 
1169 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1170 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1171 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1172 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1173 
1174 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1175 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1176 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1177 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1178 
1179 /* generate the helpers for shift instructions with one vector and one scalar */
1180 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1181 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1182         void *vs2, CPURISCVState *env, uint32_t desc)       \
1183 {                                                           \
1184     uint32_t vm = vext_vm(desc);                            \
1185     uint32_t vl = env->vl;                                  \
1186     uint32_t i;                                             \
1187                                                             \
1188     for (i = 0; i < vl; i++) {                              \
1189         if (!vm && !vext_elem_mask(v0, i)) {                \
1190             continue;                                       \
1191         }                                                   \
1192         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1193         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1194     }                                                       \
1195 }
1196 
1197 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1198 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1199 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1200 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1201 
1202 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1203 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1204 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1205 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1206 
1207 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1208 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1209 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1210 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1211 
1212 /* Vector Narrowing Integer Right Shift Instructions */
1213 GEN_VEXT_SHIFT_VV(vnsrl_vv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1214 GEN_VEXT_SHIFT_VV(vnsrl_vv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1215 GEN_VEXT_SHIFT_VV(vnsrl_vv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1216 GEN_VEXT_SHIFT_VV(vnsra_vv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1217 GEN_VEXT_SHIFT_VV(vnsra_vv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1218 GEN_VEXT_SHIFT_VV(vnsra_vv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1219 GEN_VEXT_SHIFT_VX(vnsrl_vx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1220 GEN_VEXT_SHIFT_VX(vnsrl_vx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1221 GEN_VEXT_SHIFT_VX(vnsrl_vx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1222 GEN_VEXT_SHIFT_VX(vnsra_vx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1223 GEN_VEXT_SHIFT_VX(vnsra_vx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1224 GEN_VEXT_SHIFT_VX(vnsra_vx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1225 
1226 /* Vector Integer Comparison Instructions */
1227 #define DO_MSEQ(N, M) (N == M)
1228 #define DO_MSNE(N, M) (N != M)
1229 #define DO_MSLT(N, M) (N < M)
1230 #define DO_MSLE(N, M) (N <= M)
1231 #define DO_MSGT(N, M) (N > M)
1232 
1233 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1234 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1235                   CPURISCVState *env, uint32_t desc)          \
1236 {                                                             \
1237     uint32_t vm = vext_vm(desc);                              \
1238     uint32_t vl = env->vl;                                    \
1239     uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);        \
1240     uint32_t i;                                               \
1241                                                               \
1242     for (i = 0; i < vl; i++) {                                \
1243         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1244         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1245         if (!vm && !vext_elem_mask(v0, i)) {                  \
1246             continue;                                         \
1247         }                                                     \
1248         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1249     }                                                         \
1250     for (; i < vlmax; i++) {                                  \
1251         vext_set_elem_mask(vd, i, 0);                         \
1252     }                                                         \
1253 }
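/*
 * The comparisons produce a mask register: each active element yields one
 * result bit, inactive elements are skipped (their old mask bit is left
 * unchanged), and the tail bits from vl up to vlmax are cleared by the
 * second loop.
 */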
1254 
1255 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1256 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1257 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1258 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1259 
1260 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1261 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1262 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1263 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1264 
1265 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1266 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1267 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1268 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1269 
1270 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1271 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1272 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1273 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1274 
1275 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1276 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1277 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1278 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1279 
1280 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1281 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1282 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1283 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1284 
1285 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1286 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1287                   CPURISCVState *env, uint32_t desc)                \
1288 {                                                                   \
1289     uint32_t vm = vext_vm(desc);                                    \
1290     uint32_t vl = env->vl;                                          \
1291     uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);              \
1292     uint32_t i;                                                     \
1293                                                                     \
1294     for (i = 0; i < vl; i++) {                                      \
1295         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1296         if (!vm && !vext_elem_mask(v0, i)) {                        \
1297             continue;                                               \
1298         }                                                           \
1299         vext_set_elem_mask(vd, i,                                   \
1300                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1301     }                                                               \
1302     for (; i < vlmax; i++) {                                        \
1303         vext_set_elem_mask(vd, i, 0);                               \
1304     }                                                               \
1305 }
1306 
1307 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1308 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1309 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1310 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1311 
1312 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1313 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1314 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1315 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1316 
1317 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1318 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1319 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1320 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1321 
1322 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1323 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1324 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1325 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1326 
1327 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1328 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1329 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1330 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1331 
1332 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1333 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1334 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1335 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1336 
1337 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1338 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1339 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1340 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1341 
1342 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1343 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1344 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1345 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1346 
1347 /* Vector Integer Min/Max Instructions */
1348 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1349 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1350 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1351 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1352 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1353 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1354 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1355 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1356 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1357 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1358 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1359 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1360 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1361 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1362 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1363 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1364 GEN_VEXT_VV(vminu_vv_b, 1, 1)
1365 GEN_VEXT_VV(vminu_vv_h, 2, 2)
1366 GEN_VEXT_VV(vminu_vv_w, 4, 4)
1367 GEN_VEXT_VV(vminu_vv_d, 8, 8)
1368 GEN_VEXT_VV(vmin_vv_b, 1, 1)
1369 GEN_VEXT_VV(vmin_vv_h, 2, 2)
1370 GEN_VEXT_VV(vmin_vv_w, 4, 4)
1371 GEN_VEXT_VV(vmin_vv_d, 8, 8)
1372 GEN_VEXT_VV(vmaxu_vv_b, 1, 1)
1373 GEN_VEXT_VV(vmaxu_vv_h, 2, 2)
1374 GEN_VEXT_VV(vmaxu_vv_w, 4, 4)
1375 GEN_VEXT_VV(vmaxu_vv_d, 8, 8)
1376 GEN_VEXT_VV(vmax_vv_b, 1, 1)
1377 GEN_VEXT_VV(vmax_vv_h, 2, 2)
1378 GEN_VEXT_VV(vmax_vv_w, 4, 4)
1379 GEN_VEXT_VV(vmax_vv_d, 8, 8)
1380 
1381 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1382 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1383 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1384 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1385 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1386 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1387 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1388 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1389 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1390 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1391 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1392 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1393 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1394 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1395 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1396 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1397 GEN_VEXT_VX(vminu_vx_b, 1, 1)
1398 GEN_VEXT_VX(vminu_vx_h, 2, 2)
1399 GEN_VEXT_VX(vminu_vx_w, 4, 4)
1400 GEN_VEXT_VX(vminu_vx_d, 8, 8)
1401 GEN_VEXT_VX(vmin_vx_b, 1, 1)
1402 GEN_VEXT_VX(vmin_vx_h, 2, 2)
1403 GEN_VEXT_VX(vmin_vx_w, 4, 4)
1404 GEN_VEXT_VX(vmin_vx_d, 8, 8)
1405 GEN_VEXT_VX(vmaxu_vx_b, 1, 1)
1406 GEN_VEXT_VX(vmaxu_vx_h, 2, 2)
1407 GEN_VEXT_VX(vmaxu_vx_w, 4, 4)
1408 GEN_VEXT_VX(vmaxu_vx_d, 8, 8)
1409 GEN_VEXT_VX(vmax_vx_b, 1, 1)
1410 GEN_VEXT_VX(vmax_vx_h, 2, 2)
1411 GEN_VEXT_VX(vmax_vx_w, 4, 4)
1412 GEN_VEXT_VX(vmax_vx_d, 8, 8)
1413 
1414 /* Vector Single-Width Integer Multiply Instructions */
1415 #define DO_MUL(N, M) (N * M)
1416 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1417 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1418 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1419 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1420 GEN_VEXT_VV(vmul_vv_b, 1, 1)
1421 GEN_VEXT_VV(vmul_vv_h, 2, 2)
1422 GEN_VEXT_VV(vmul_vv_w, 4, 4)
1423 GEN_VEXT_VV(vmul_vv_d, 8, 8)
1424 
1425 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1426 {
1427     return (int16_t)s2 * (int16_t)s1 >> 8;
1428 }
1429 
1430 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1431 {
1432     return (int32_t)s2 * (int32_t)s1 >> 16;
1433 }
1434 
1435 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1436 {
1437     return (int64_t)s2 * (int64_t)s1 >> 32;
1438 }
1439 
1440 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1441 {
1442     uint64_t hi_64, lo_64;
1443 
1444     muls64(&lo_64, &hi_64, s1, s2);
1445     return hi_64;
1446 }
1447 
1448 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1449 {
1450     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1451 }
1452 
1453 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1454 {
1455     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1456 }
1457 
1458 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1459 {
1460     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1461 }
1462 
1463 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1464 {
1465     uint64_t hi_64, lo_64;
1466 
1467     mulu64(&lo_64, &hi_64, s2, s1);
1468     return hi_64;
1469 }
1470 
1471 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1472 {
1473     return (int16_t)s2 * (uint16_t)s1 >> 8;
1474 }
1475 
1476 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1477 {
1478     return (int32_t)s2 * (uint32_t)s1 >> 16;
1479 }
1480 
1481 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1482 {
1483     return (int64_t)s2 * (uint64_t)s1 >> 32;
1484 }
1485 
1486 /*
1487  * Let  A = signed operand,
1488  *      B = unsigned operand,
1489  *      P = mulu64(A, B), the product of the unsigned
1490  *          interpretation of A with B,
1491  *      SP = the true signed product A * B.
1492  *
1493  * IF A < 0, the unsigned interpretation of A is A + 2 ** 64, so
1494  *      P  = (A + 2 ** 64) * B
1495  *         = A * B + 2 ** 64 * B
1496  *      SP = P - 2 ** 64 * B
1497  *      and the high 64 bits of SP are HI(P) - B.
1498  * ELSE
1499  *      SP = P.
1500  *
1501  * THEREFORE
1502  *      HI_P -= (A < 0 ? B : 0)
1503  */
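/*
 * 8-bit illustration of the same identity: A = -2 (bit pattern 0xfe,
 * unsigned 254), B = 3.  The unsigned product is 254 * 3 = 762 = 0x02fa,
 * so HI(P) = 2; subtracting B gives 2 - 3 = -1, which is indeed the high
 * byte of the signed product -6 = 0xfffa.
 */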
1504 
1505 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1506 {
1507     uint64_t hi_64, lo_64;
1508 
1509     mulu64(&lo_64, &hi_64, s2, s1);
1510 
1511     hi_64 -= s2 < 0 ? s1 : 0;
1512     return hi_64;
1513 }
1514 
1515 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1516 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1517 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1518 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1519 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1520 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1521 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1522 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1523 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1524 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1525 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1526 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1527 GEN_VEXT_VV(vmulh_vv_b, 1, 1)
1528 GEN_VEXT_VV(vmulh_vv_h, 2, 2)
1529 GEN_VEXT_VV(vmulh_vv_w, 4, 4)
1530 GEN_VEXT_VV(vmulh_vv_d, 8, 8)
1531 GEN_VEXT_VV(vmulhu_vv_b, 1, 1)
1532 GEN_VEXT_VV(vmulhu_vv_h, 2, 2)
1533 GEN_VEXT_VV(vmulhu_vv_w, 4, 4)
1534 GEN_VEXT_VV(vmulhu_vv_d, 8, 8)
1535 GEN_VEXT_VV(vmulhsu_vv_b, 1, 1)
1536 GEN_VEXT_VV(vmulhsu_vv_h, 2, 2)
1537 GEN_VEXT_VV(vmulhsu_vv_w, 4, 4)
1538 GEN_VEXT_VV(vmulhsu_vv_d, 8, 8)
1539 
1540 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1541 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1542 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1543 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1544 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1545 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1546 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1547 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1548 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1549 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1550 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1551 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1552 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1553 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1554 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1555 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1556 GEN_VEXT_VX(vmul_vx_b, 1, 1)
1557 GEN_VEXT_VX(vmul_vx_h, 2, 2)
1558 GEN_VEXT_VX(vmul_vx_w, 4, 4)
1559 GEN_VEXT_VX(vmul_vx_d, 8, 8)
1560 GEN_VEXT_VX(vmulh_vx_b, 1, 1)
1561 GEN_VEXT_VX(vmulh_vx_h, 2, 2)
1562 GEN_VEXT_VX(vmulh_vx_w, 4, 4)
1563 GEN_VEXT_VX(vmulh_vx_d, 8, 8)
1564 GEN_VEXT_VX(vmulhu_vx_b, 1, 1)
1565 GEN_VEXT_VX(vmulhu_vx_h, 2, 2)
1566 GEN_VEXT_VX(vmulhu_vx_w, 4, 4)
1567 GEN_VEXT_VX(vmulhu_vx_d, 8, 8)
1568 GEN_VEXT_VX(vmulhsu_vx_b, 1, 1)
1569 GEN_VEXT_VX(vmulhsu_vx_h, 2, 2)
1570 GEN_VEXT_VX(vmulhsu_vx_w, 4, 4)
1571 GEN_VEXT_VX(vmulhsu_vx_d, 8, 8)
1572 
1573 /* Vector Integer Divide Instructions */
1574 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1575 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1576 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1577         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1578 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1579         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
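/*
 * These macros encode the fixed results for the corner cases, matching the
 * scalar M-extension rules:
 *     DO_DIVU(x, 0)   -> all ones         DO_REMU(x, 0)   -> x
 *     DO_DIV(x, 0)    -> -1               DO_REM(x, 0)    -> x
 *     DO_DIV(MIN, -1) -> MIN              DO_REM(MIN, -1) -> 0
 * (N == -N) is the overflow test: it is true for the most negative value
 * of N's type (and for 0, where the selected branch gives the same result).
 */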
1580 
1581 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1582 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1583 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1584 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1585 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1586 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1587 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1588 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1589 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1590 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1591 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1592 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1593 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1594 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1595 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1596 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1597 GEN_VEXT_VV(vdivu_vv_b, 1, 1)
1598 GEN_VEXT_VV(vdivu_vv_h, 2, 2)
1599 GEN_VEXT_VV(vdivu_vv_w, 4, 4)
1600 GEN_VEXT_VV(vdivu_vv_d, 8, 8)
1601 GEN_VEXT_VV(vdiv_vv_b, 1, 1)
1602 GEN_VEXT_VV(vdiv_vv_h, 2, 2)
1603 GEN_VEXT_VV(vdiv_vv_w, 4, 4)
1604 GEN_VEXT_VV(vdiv_vv_d, 8, 8)
1605 GEN_VEXT_VV(vremu_vv_b, 1, 1)
1606 GEN_VEXT_VV(vremu_vv_h, 2, 2)
1607 GEN_VEXT_VV(vremu_vv_w, 4, 4)
1608 GEN_VEXT_VV(vremu_vv_d, 8, 8)
1609 GEN_VEXT_VV(vrem_vv_b, 1, 1)
1610 GEN_VEXT_VV(vrem_vv_h, 2, 2)
1611 GEN_VEXT_VV(vrem_vv_w, 4, 4)
1612 GEN_VEXT_VV(vrem_vv_d, 8, 8)
1613 
1614 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1615 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1616 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1617 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1618 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1619 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1620 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1621 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1622 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1623 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1624 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1625 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1626 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1627 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1628 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1629 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1630 GEN_VEXT_VX(vdivu_vx_b, 1, 1)
1631 GEN_VEXT_VX(vdivu_vx_h, 2, 2)
1632 GEN_VEXT_VX(vdivu_vx_w, 4, 4)
1633 GEN_VEXT_VX(vdivu_vx_d, 8, 8)
1634 GEN_VEXT_VX(vdiv_vx_b, 1, 1)
1635 GEN_VEXT_VX(vdiv_vx_h, 2, 2)
1636 GEN_VEXT_VX(vdiv_vx_w, 4, 4)
1637 GEN_VEXT_VX(vdiv_vx_d, 8, 8)
1638 GEN_VEXT_VX(vremu_vx_b, 1, 1)
1639 GEN_VEXT_VX(vremu_vx_h, 2, 2)
1640 GEN_VEXT_VX(vremu_vx_w, 4, 4)
1641 GEN_VEXT_VX(vremu_vx_d, 8, 8)
1642 GEN_VEXT_VX(vrem_vx_b, 1, 1)
1643 GEN_VEXT_VX(vrem_vx_h, 2, 2)
1644 GEN_VEXT_VX(vrem_vx_w, 4, 4)
1645 GEN_VEXT_VX(vrem_vx_d, 8, 8)
1646 
1647 /* Vector Widening Integer Multiply Instructions */
1648 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1649 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1650 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1651 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1652 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1653 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1654 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1655 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1656 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1657 GEN_VEXT_VV(vwmul_vv_b, 1, 2)
1658 GEN_VEXT_VV(vwmul_vv_h, 2, 4)
1659 GEN_VEXT_VV(vwmul_vv_w, 4, 8)
1660 GEN_VEXT_VV(vwmulu_vv_b, 1, 2)
1661 GEN_VEXT_VV(vwmulu_vv_h, 2, 4)
1662 GEN_VEXT_VV(vwmulu_vv_w, 4, 8)
1663 GEN_VEXT_VV(vwmulsu_vv_b, 1, 2)
1664 GEN_VEXT_VV(vwmulsu_vv_h, 2, 4)
1665 GEN_VEXT_VV(vwmulsu_vv_w, 4, 8)
1666 
1667 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1668 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1669 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1670 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1671 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1672 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1673 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1674 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1675 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1676 GEN_VEXT_VX(vwmul_vx_b, 1, 2)
1677 GEN_VEXT_VX(vwmul_vx_h, 2, 4)
1678 GEN_VEXT_VX(vwmul_vx_w, 4, 8)
1679 GEN_VEXT_VX(vwmulu_vx_b, 1, 2)
1680 GEN_VEXT_VX(vwmulu_vx_h, 2, 4)
1681 GEN_VEXT_VX(vwmulu_vx_w, 4, 8)
1682 GEN_VEXT_VX(vwmulsu_vx_b, 1, 2)
1683 GEN_VEXT_VX(vwmulsu_vx_h, 2, 4)
1684 GEN_VEXT_VX(vwmulsu_vx_w, 4, 8)
1685 
1686 /* Vector Single-Width Integer Multiply-Add Instructions */
1687 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1688 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1689 {                                                                  \
1690     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1691     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1692     TD d = *((TD *)vd + HD(i));                                    \
1693     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1694 }
1695 
1696 #define DO_MACC(N, M, D) (M * N + D)
1697 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1698 #define DO_MADD(N, M, D) (M * D + N)
1699 #define DO_NMSUB(N, M, D) (-(M * D) + N)
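/*
 * Operand roles in OP(N, M, D): N is the vs2 element, M is the vs1 element
 * (or the x register for the _vx forms) and D is the current destination
 * element, so:
 *     vmacc:  vd[i] =  (vs1[i] * vs2[i]) + vd[i]
 *     vnmsac: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
 *     vmadd:  vd[i] =  (vs1[i] * vd[i])  + vs2[i]
 *     vnmsub: vd[i] = -(vs1[i] * vd[i])  + vs2[i]
 */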
1700 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1701 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1702 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1703 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1704 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1705 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1706 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1707 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1708 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1709 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1710 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1711 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1712 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1713 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1714 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1715 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1716 GEN_VEXT_VV(vmacc_vv_b, 1, 1)
1717 GEN_VEXT_VV(vmacc_vv_h, 2, 2)
1718 GEN_VEXT_VV(vmacc_vv_w, 4, 4)
1719 GEN_VEXT_VV(vmacc_vv_d, 8, 8)
1720 GEN_VEXT_VV(vnmsac_vv_b, 1, 1)
1721 GEN_VEXT_VV(vnmsac_vv_h, 2, 2)
1722 GEN_VEXT_VV(vnmsac_vv_w, 4, 4)
1723 GEN_VEXT_VV(vnmsac_vv_d, 8, 8)
1724 GEN_VEXT_VV(vmadd_vv_b, 1, 1)
1725 GEN_VEXT_VV(vmadd_vv_h, 2, 2)
1726 GEN_VEXT_VV(vmadd_vv_w, 4, 4)
1727 GEN_VEXT_VV(vmadd_vv_d, 8, 8)
1728 GEN_VEXT_VV(vnmsub_vv_b, 1, 1)
1729 GEN_VEXT_VV(vnmsub_vv_h, 2, 2)
1730 GEN_VEXT_VV(vnmsub_vv_w, 4, 4)
1731 GEN_VEXT_VV(vnmsub_vv_d, 8, 8)
1732 
1733 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1734 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1735 {                                                                   \
1736     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1737     TD d = *((TD *)vd + HD(i));                                     \
1738     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1739 }
1740 
1741 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1742 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1743 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1744 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1745 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1746 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1747 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1748 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1749 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1750 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1751 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1752 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1753 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1754 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1755 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1756 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1757 GEN_VEXT_VX(vmacc_vx_b, 1, 1)
1758 GEN_VEXT_VX(vmacc_vx_h, 2, 2)
1759 GEN_VEXT_VX(vmacc_vx_w, 4, 4)
1760 GEN_VEXT_VX(vmacc_vx_d, 8, 8)
1761 GEN_VEXT_VX(vnmsac_vx_b, 1, 1)
1762 GEN_VEXT_VX(vnmsac_vx_h, 2, 2)
1763 GEN_VEXT_VX(vnmsac_vx_w, 4, 4)
1764 GEN_VEXT_VX(vnmsac_vx_d, 8, 8)
1765 GEN_VEXT_VX(vmadd_vx_b, 1, 1)
1766 GEN_VEXT_VX(vmadd_vx_h, 2, 2)
1767 GEN_VEXT_VX(vmadd_vx_w, 4, 4)
1768 GEN_VEXT_VX(vmadd_vx_d, 8, 8)
1769 GEN_VEXT_VX(vnmsub_vx_b, 1, 1)
1770 GEN_VEXT_VX(vnmsub_vx_h, 2, 2)
1771 GEN_VEXT_VX(vnmsub_vx_w, 4, 4)
1772 GEN_VEXT_VX(vnmsub_vx_d, 8, 8)
1773 
1774 /* Vector Widening Integer Multiply-Add Instructions */
1775 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1776 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1777 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1778 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1779 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1780 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1781 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1782 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1783 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1784 GEN_VEXT_VV(vwmaccu_vv_b, 1, 2)
1785 GEN_VEXT_VV(vwmaccu_vv_h, 2, 4)
1786 GEN_VEXT_VV(vwmaccu_vv_w, 4, 8)
1787 GEN_VEXT_VV(vwmacc_vv_b, 1, 2)
1788 GEN_VEXT_VV(vwmacc_vv_h, 2, 4)
1789 GEN_VEXT_VV(vwmacc_vv_w, 4, 8)
1790 GEN_VEXT_VV(vwmaccsu_vv_b, 1, 2)
1791 GEN_VEXT_VV(vwmaccsu_vv_h, 2, 4)
1792 GEN_VEXT_VV(vwmaccsu_vv_w, 4, 8)
1793 
1794 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1795 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1796 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1797 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1798 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1799 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1800 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1801 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1802 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1803 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1804 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1805 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1806 GEN_VEXT_VX(vwmaccu_vx_b, 1, 2)
1807 GEN_VEXT_VX(vwmaccu_vx_h, 2, 4)
1808 GEN_VEXT_VX(vwmaccu_vx_w, 4, 8)
1809 GEN_VEXT_VX(vwmacc_vx_b, 1, 2)
1810 GEN_VEXT_VX(vwmacc_vx_h, 2, 4)
1811 GEN_VEXT_VX(vwmacc_vx_w, 4, 8)
1812 GEN_VEXT_VX(vwmaccsu_vx_b, 1, 2)
1813 GEN_VEXT_VX(vwmaccsu_vx_h, 2, 4)
1814 GEN_VEXT_VX(vwmaccsu_vx_w, 4, 8)
1815 GEN_VEXT_VX(vwmaccus_vx_b, 1, 2)
1816 GEN_VEXT_VX(vwmaccus_vx_h, 2, 4)
1817 GEN_VEXT_VX(vwmaccus_vx_w, 4, 8)
1818 
1819 /* Vector Integer Merge and Move Instructions */
1820 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1821 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1822                   uint32_t desc)                                     \
1823 {                                                                    \
1824     uint32_t vl = env->vl;                                           \
1825     uint32_t i;                                                      \
1826                                                                      \
1827     for (i = 0; i < vl; i++) {                                       \
1828         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1829         *((ETYPE *)vd + H(i)) = s1;                                  \
1830     }                                                                \
1831 }
1832 
1833 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1834 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1835 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1836 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1837 
1838 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1839 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1840                   uint32_t desc)                                     \
1841 {                                                                    \
1842     uint32_t vl = env->vl;                                           \
1843     uint32_t i;                                                      \
1844                                                                      \
1845     for (i = 0; i < vl; i++) {                                       \
1846         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1847     }                                                                \
1848 }
1849 
1850 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1851 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1852 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1853 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1854 
1855 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1856 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1857                   CPURISCVState *env, uint32_t desc)                 \
1858 {                                                                    \
1859     uint32_t vl = env->vl;                                           \
1860     uint32_t i;                                                      \
1861                                                                      \
1862     for (i = 0; i < vl; i++) {                                       \
1863         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1864         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1865     }                                                                \
1866 }
1867 
1868 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1869 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1870 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1871 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1872 
1873 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1874 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1875                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1876 {                                                                    \
1877     uint32_t vl = env->vl;                                           \
1878     uint32_t i;                                                      \
1879                                                                      \
1880     for (i = 0; i < vl; i++) {                                       \
1881         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
1882         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
1883                    (ETYPE)(target_long)s1);                          \
1884         *((ETYPE *)vd + H(i)) = d;                                   \
1885     }                                                                \
1886 }
1887 
1888 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
1889 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
1890 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
1891 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1892 
1893 /*
1894  *** Vector Fixed-Point Arithmetic Instructions
1895  */
1896 
1897 /* Vector Single-Width Saturating Add and Subtract */
1898 
1899 /*
1900  * Fixed-point instructions generally need a rounding mode and saturation,
1901  * so define the common fixed-point macros here.
1902  */
1903 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
1904                           CPURISCVState *env, int vxrm);
1905 
1906 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
1907 static inline void                                                  \
1908 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
1909           CPURISCVState *env, int vxrm)                             \
1910 {                                                                   \
1911     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
1912     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1913     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
1914 }
1915 
1916 static inline void
1917 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1918              CPURISCVState *env,
1919              uint32_t vl, uint32_t vm, int vxrm,
1920              opivv2_rm_fn *fn)
1921 {
1922     for (uint32_t i = 0; i < vl; i++) {
1923         if (!vm && !vext_elem_mask(v0, i)) {
1924             continue;
1925         }
1926         fn(vd, vs1, vs2, i, env, vxrm);
1927     }
1928 }
1929 
1930 static inline void
1931 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
1932              CPURISCVState *env,
1933              uint32_t desc, uint32_t esz, uint32_t dsz,
1934              opivv2_rm_fn *fn)
1935 {
1936     uint32_t vm = vext_vm(desc);
1937     uint32_t vl = env->vl;
1938 
1939     switch (env->vxrm) {
1940     case 0: /* rnu */
1941         vext_vv_rm_1(vd, v0, vs1, vs2,
1942                      env, vl, vm, 0, fn);
1943         break;
1944     case 1: /* rne */
1945         vext_vv_rm_1(vd, v0, vs1, vs2,
1946                      env, vl, vm, 1, fn);
1947         break;
1948     case 2: /* rdn */
1949         vext_vv_rm_1(vd, v0, vs1, vs2,
1950                      env, vl, vm, 2, fn);
1951         break;
1952     default: /* rod */
1953         vext_vv_rm_1(vd, v0, vs1, vs2,
1954                      env, vl, vm, 3, fn);
1955         break;
1956     }
1957 }
1958 
1959 /* generate helpers for fixed point instructions with OPIVV format */
1960 #define GEN_VEXT_VV_RM(NAME, ESZ, DSZ)                          \
1961 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
1962                   CPURISCVState *env, uint32_t desc)            \
1963 {                                                               \
1964     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, ESZ, DSZ,         \
1965                  do_##NAME);                                    \
1966 }
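/*
 * Putting the pieces together, e.g. for vsaddu_vv_b:
 *     RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
 * defines do_vsaddu_vv_b(), which applies saddu8() to a single element, and
 *     GEN_VEXT_VV_RM(vsaddu_vv_b, 1, 1)
 * defines HELPER(vsaddu_vv_b)(), which dispatches on env->vxrm and loops
 * over the active elements via vext_vv_rm_2()/vext_vv_rm_1().
 */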
1967 
1968 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
1969 {
1970     uint8_t res = a + b;
1971     if (res < a) {
1972         res = UINT8_MAX;
1973         env->vxsat = 0x1;
1974     }
1975     return res;
1976 }
1977 
1978 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
1979                                uint16_t b)
1980 {
1981     uint16_t res = a + b;
1982     if (res < a) {
1983         res = UINT16_MAX;
1984         env->vxsat = 0x1;
1985     }
1986     return res;
1987 }
1988 
1989 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
1990                                uint32_t b)
1991 {
1992     uint32_t res = a + b;
1993     if (res < a) {
1994         res = UINT32_MAX;
1995         env->vxsat = 0x1;
1996     }
1997     return res;
1998 }
1999 
2000 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2001                                uint64_t b)
2002 {
2003     uint64_t res = a + b;
2004     if (res < a) {
2005         res = UINT64_MAX;
2006         env->vxsat = 0x1;
2007     }
2008     return res;
2009 }
2010 
2011 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2012 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2013 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2014 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2015 GEN_VEXT_VV_RM(vsaddu_vv_b, 1, 1)
2016 GEN_VEXT_VV_RM(vsaddu_vv_h, 2, 2)
2017 GEN_VEXT_VV_RM(vsaddu_vv_w, 4, 4)
2018 GEN_VEXT_VV_RM(vsaddu_vv_d, 8, 8)
2019 
2020 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2021                           CPURISCVState *env, int vxrm);
2022 
2023 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2024 static inline void                                                  \
2025 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2026           CPURISCVState *env, int vxrm)                             \
2027 {                                                                   \
2028     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2029     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2030 }
2031 
2032 static inline void
2033 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2034              CPURISCVState *env,
2035              uint32_t vl, uint32_t vm, int vxrm,
2036              opivx2_rm_fn *fn)
2037 {
2038     for (uint32_t i = 0; i < vl; i++) {
2039         if (!vm && !vext_elem_mask(v0, i)) {
2040             continue;
2041         }
2042         fn(vd, s1, vs2, i, env, vxrm);
2043     }
2044 }
2045 
2046 static inline void
2047 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2048              CPURISCVState *env,
2049              uint32_t desc, uint32_t esz, uint32_t dsz,
2050              opivx2_rm_fn *fn)
2051 {
2052     uint32_t vm = vext_vm(desc);
2053     uint32_t vl = env->vl;
2054 
2055     switch (env->vxrm) {
2056     case 0: /* rnu */
2057         vext_vx_rm_1(vd, v0, s1, vs2,
2058                      env, vl, vm, 0, fn);
2059         break;
2060     case 1: /* rne */
2061         vext_vx_rm_1(vd, v0, s1, vs2,
2062                      env, vl, vm, 1, fn);
2063         break;
2064     case 2: /* rdn */
2065         vext_vx_rm_1(vd, v0, s1, vs2,
2066                      env, vl, vm, 2, fn);
2067         break;
2068     default: /* rod */
2069         vext_vx_rm_1(vd, v0, s1, vs2,
2070                      env, vl, vm, 3, fn);
2071         break;
2072     }
2073 }
2074 
2075 /* generate helpers for fixed point instructions with OPIVX format */
2076 #define GEN_VEXT_VX_RM(NAME, ESZ, DSZ)                    \
2077 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2078         void *vs2, CPURISCVState *env, uint32_t desc)     \
2079 {                                                         \
2080     vext_vx_rm_2(vd, v0, s1, vs2, env, desc, ESZ, DSZ,    \
2081                  do_##NAME);                              \
2082 }
2083 
2084 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2085 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2086 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2087 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2088 GEN_VEXT_VX_RM(vsaddu_vx_b, 1, 1)
2089 GEN_VEXT_VX_RM(vsaddu_vx_h, 2, 2)
2090 GEN_VEXT_VX_RM(vsaddu_vx_w, 4, 4)
2091 GEN_VEXT_VX_RM(vsaddu_vx_d, 8, 8)
2092 
2093 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2094 {
2095     int8_t res = a + b;
2096     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2097         res = a > 0 ? INT8_MAX : INT8_MIN;
2098         env->vxsat = 0x1;
2099     }
2100     return res;
2101 }
2102 
2103 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2104 {
2105     int16_t res = a + b;
2106     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2107         res = a > 0 ? INT16_MAX : INT16_MIN;
2108         env->vxsat = 0x1;
2109     }
2110     return res;
2111 }
2112 
2113 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2114 {
2115     int32_t res = a + b;
2116     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2117         res = a > 0 ? INT32_MAX : INT32_MIN;
2118         env->vxsat = 0x1;
2119     }
2120     return res;
2121 }
2122 
2123 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2124 {
2125     int64_t res = a + b;
2126     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2127         res = a > 0 ? INT64_MAX : INT64_MIN;
2128         env->vxsat = 0x1;
2129     }
2130     return res;
2131 }
2132 
2133 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2134 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2135 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2136 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2137 GEN_VEXT_VV_RM(vsadd_vv_b, 1, 1)
2138 GEN_VEXT_VV_RM(vsadd_vv_h, 2, 2)
2139 GEN_VEXT_VV_RM(vsadd_vv_w, 4, 4)
2140 GEN_VEXT_VV_RM(vsadd_vv_d, 8, 8)
2141 
2142 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2143 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2144 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2145 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2146 GEN_VEXT_VX_RM(vsadd_vx_b, 1, 1)
2147 GEN_VEXT_VX_RM(vsadd_vx_h, 2, 2)
2148 GEN_VEXT_VX_RM(vsadd_vx_w, 4, 4)
2149 GEN_VEXT_VX_RM(vsadd_vx_d, 8, 8)
2150 
2151 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2152 {
2153     uint8_t res = a - b;
2154     if (res > a) {
2155         res = 0;
2156         env->vxsat = 0x1;
2157     }
2158     return res;
2159 }
2160 
2161 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2162                                uint16_t b)
2163 {
2164     uint16_t res = a - b;
2165     if (res > a) {
2166         res = 0;
2167         env->vxsat = 0x1;
2168     }
2169     return res;
2170 }
2171 
2172 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2173                                uint32_t b)
2174 {
2175     uint32_t res = a - b;
2176     if (res > a) {
2177         res = 0;
2178         env->vxsat = 0x1;
2179     }
2180     return res;
2181 }
2182 
2183 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2184                                uint64_t b)
2185 {
2186     uint64_t res = a - b;
2187     if (res > a) {
2188         res = 0;
2189         env->vxsat = 0x1;
2190     }
2191     return res;
2192 }
2193 
2194 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2195 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2196 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2197 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2198 GEN_VEXT_VV_RM(vssubu_vv_b, 1, 1)
2199 GEN_VEXT_VV_RM(vssubu_vv_h, 2, 2)
2200 GEN_VEXT_VV_RM(vssubu_vv_w, 4, 4)
2201 GEN_VEXT_VV_RM(vssubu_vv_d, 8, 8)
2202 
2203 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2204 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2205 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2206 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2207 GEN_VEXT_VX_RM(vssubu_vx_b, 1, 1)
2208 GEN_VEXT_VX_RM(vssubu_vx_h, 2, 2)
2209 GEN_VEXT_VX_RM(vssubu_vx_w, 4, 4)
2210 GEN_VEXT_VX_RM(vssubu_vx_d, 8, 8)
2211 
2212 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2213 {
2214     int8_t res = a - b;
2215     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2216         res = a >= 0 ? INT8_MAX : INT8_MIN;
2217         env->vxsat = 0x1;
2218     }
2219     return res;
2220 }
2221 
2222 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2223 {
2224     int16_t res = a - b;
2225     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2226         res = a >= 0 ? INT16_MAX : INT16_MIN;
2227         env->vxsat = 0x1;
2228     }
2229     return res;
2230 }
2231 
2232 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2233 {
2234     int32_t res = a - b;
2235     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2236         res = a >= 0 ? INT32_MAX : INT32_MIN;
2237         env->vxsat = 0x1;
2238     }
2239     return res;
2240 }
2241 
2242 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2243 {
2244     int64_t res = a - b;
2245     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2246         res = a >= 0 ? INT64_MAX : INT64_MIN;
2247         env->vxsat = 0x1;
2248     }
2249     return res;
2250 }
2251 
2252 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2253 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2254 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2255 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2256 GEN_VEXT_VV_RM(vssub_vv_b, 1, 1)
2257 GEN_VEXT_VV_RM(vssub_vv_h, 2, 2)
2258 GEN_VEXT_VV_RM(vssub_vv_w, 4, 4)
2259 GEN_VEXT_VV_RM(vssub_vv_d, 8, 8)
2260 
2261 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2262 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2263 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2264 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2265 GEN_VEXT_VX_RM(vssub_vx_b, 1, 1)
2266 GEN_VEXT_VX_RM(vssub_vx_h, 2, 2)
2267 GEN_VEXT_VX_RM(vssub_vx_w, 4, 4)
2268 GEN_VEXT_VX_RM(vssub_vx_d, 8, 8)
2269 
2270 /* Vector Single-Width Averaging Add and Subtract */
2271 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2272 {
2273     uint8_t d = extract64(v, shift, 1);
2274     uint8_t d1;
2275     uint64_t D1, D2;
2276 
2277     if (shift == 0 || shift > 64) {
2278         return 0;
2279     }
2280 
2281     d1 = extract64(v, shift - 1, 1);
2282     D1 = extract64(v, 0, shift);
2283     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2284         return d1;
2285     } else if (vxrm == 1) { /* round-to-nearest-even */
2286         if (shift > 1) {
2287             D2 = extract64(v, 0, shift - 1);
2288             return d1 & ((D2 != 0) | d);
2289         } else {
2290             return d1 & d;
2291         }
2292     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2293         return !d & (D1 != 0);
2294     }
2295     return 0; /* round-down (truncate) */
2296 }
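/*
 * Example: v = 0b1011 (11), shift = 2.  The truncated result is 11 >> 2 = 2
 * and get_round() returns the increment the caller adds afterwards:
 *     rnu (0): d1 = 1                   -> 2 + 1 = 3  (11/4 = 2.75 rounds up)
 *     rne (1): d1 & ((D2 != 0) | d) = 1 -> 3          (2.75 is nearer to 3)
 *     rdn (2): 0                        -> 2          (truncate)
 *     rod (3): !d & (D1 != 0) = 1       -> 3          (jam to an odd result)
 */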
2297 
2298 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2299 {
2300     int64_t res = (int64_t)a + b;
2301     uint8_t round = get_round(vxrm, res, 1);
2302 
2303     return (res >> 1) + round;
2304 }
2305 
2306 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2307 {
2308     int64_t res = a + b;
2309     uint8_t round = get_round(vxrm, res, 1);
2310     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2311 
2312     /* With signed overflow, bit 64 is inverse of bit 63. */
2313     return ((res >> 1) ^ over) + round;
2314 }
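/*
 * Example: a = b = INT64_MAX.  The 64-bit sum wraps to res = -2 and
 * over = INT64_MIN records the overflow; (res >> 1) = -1, and XOR-ing
 * with over restores the true bit 63, giving INT64_MAX.  The rounding
 * increment is 0 in every mode because the discarded bit 0 of res is 0.
 */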
2315 
2316 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2317 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2318 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2319 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2320 GEN_VEXT_VV_RM(vaadd_vv_b, 1, 1)
2321 GEN_VEXT_VV_RM(vaadd_vv_h, 2, 2)
2322 GEN_VEXT_VV_RM(vaadd_vv_w, 4, 4)
2323 GEN_VEXT_VV_RM(vaadd_vv_d, 8, 8)
2324 
2325 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2326 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2327 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2328 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2329 GEN_VEXT_VX_RM(vaadd_vx_b, 1, 1)
2330 GEN_VEXT_VX_RM(vaadd_vx_h, 2, 2)
2331 GEN_VEXT_VX_RM(vaadd_vx_w, 4, 4)
2332 GEN_VEXT_VX_RM(vaadd_vx_d, 8, 8)
2333 
2334 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2335 {
2336     int64_t res = (int64_t)a - b;
2337     uint8_t round = get_round(vxrm, res, 1);
2338 
2339     return (res >> 1) + round;
2340 }
2341 
2342 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2343 {
2344     int64_t res = (int64_t)a - b;
2345     uint8_t round = get_round(vxrm, res, 1);
2346     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2347 
2348     /* With signed overflow, bit 64 is inverse of bit 63. */
2349     return ((res >> 1) ^ over) + round;
2350 }
2351 
2352 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2353 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2354 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2355 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2356 GEN_VEXT_VV_RM(vasub_vv_b, 1, 1)
2357 GEN_VEXT_VV_RM(vasub_vv_h, 2, 2)
2358 GEN_VEXT_VV_RM(vasub_vv_w, 4, 4)
2359 GEN_VEXT_VV_RM(vasub_vv_d, 8, 8)
2360 
2361 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2362 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2363 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2364 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2365 GEN_VEXT_VX_RM(vasub_vx_b, 1, 1)
2366 GEN_VEXT_VX_RM(vasub_vx_h, 2, 2)
2367 GEN_VEXT_VX_RM(vasub_vx_w, 4, 4)
2368 GEN_VEXT_VX_RM(vasub_vx_d, 8, 8)
2369 
2370 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2371 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2372 {
2373     uint8_t round;
2374     int16_t res;
2375 
2376     res = (int16_t)a * (int16_t)b;
2377     round = get_round(vxrm, res, 7);
2378     res   = (res >> 7) + round;
2379 
2380     if (res > INT8_MAX) {
2381         env->vxsat = 0x1;
2382         return INT8_MAX;
2383     } else if (res < INT8_MIN) {
2384         env->vxsat = 0x1;
2385         return INT8_MIN;
2386     } else {
2387         return res;
2388     }
2389 }
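/*
 * Example (Q7 fixed point): a = b = INT8_MIN, i.e. -1.0 * -1.0.  The raw
 * product is 16384 and (16384 >> 7) = 128 exceeds INT8_MAX, so the result
 * saturates to 127 (just below +1.0) and vxsat is set.
 */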
2390 
2391 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2392 {
2393     uint8_t round;
2394     int32_t res;
2395 
2396     res = (int32_t)a * (int32_t)b;
2397     round = get_round(vxrm, res, 15);
2398     res   = (res >> 15) + round;
2399 
2400     if (res > INT16_MAX) {
2401         env->vxsat = 0x1;
2402         return INT16_MAX;
2403     } else if (res < INT16_MIN) {
2404         env->vxsat = 0x1;
2405         return INT16_MIN;
2406     } else {
2407         return res;
2408     }
2409 }
2410 
2411 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2412 {
2413     uint8_t round;
2414     int64_t res;
2415 
2416     res = (int64_t)a * (int64_t)b;
2417     round = get_round(vxrm, res, 31);
2418     res   = (res >> 31) + round;
2419 
2420     if (res > INT32_MAX) {
2421         env->vxsat = 0x1;
2422         return INT32_MAX;
2423     } else if (res < INT32_MIN) {
2424         env->vxsat = 0x1;
2425         return INT32_MIN;
2426     } else {
2427         return res;
2428     }
2429 }
2430 
2431 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2432 {
2433     uint8_t round;
2434     uint64_t hi_64, lo_64;
2435     int64_t res;
2436 
2437     if (a == INT64_MIN && b == INT64_MIN) {
2438         env->vxsat = 1;
2439         return INT64_MAX;
2440     }
2441 
2442     muls64(&lo_64, &hi_64, a, b);
2443     round = get_round(vxrm, lo_64, 63);
2444     /*
2445      * Cannot overflow, as there are always
2446      * 2 sign bits after multiply.
2447      */
2448     res = (hi_64 << 1) | (lo_64 >> 63);
2449     if (round) {
2450         if (res == INT64_MAX) {
2451             env->vxsat = 1;
2452         } else {
2453             res += 1;
2454         }
2455     }
2456     return res;
2457 }
2458 
2459 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2460 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2461 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2462 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2463 GEN_VEXT_VV_RM(vsmul_vv_b, 1, 1)
2464 GEN_VEXT_VV_RM(vsmul_vv_h, 2, 2)
2465 GEN_VEXT_VV_RM(vsmul_vv_w, 4, 4)
2466 GEN_VEXT_VV_RM(vsmul_vv_d, 8, 8)
2467 
2468 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2469 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2470 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2471 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2472 GEN_VEXT_VX_RM(vsmul_vx_b, 1, 1)
2473 GEN_VEXT_VX_RM(vsmul_vx_h, 2, 2)
2474 GEN_VEXT_VX_RM(vsmul_vx_w, 4, 4)
2475 GEN_VEXT_VX_RM(vsmul_vx_d, 8, 8)
2476 
2477 /* Vector Widening Saturating Scaled Multiply-Add */
2478 static inline uint16_t
2479 vwsmaccu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b,
2480           uint16_t c)
2481 {
2482     uint8_t round;
2483     uint16_t res = (uint16_t)a * b;
2484 
2485     round = get_round(vxrm, res, 4);
2486     res   = (res >> 4) + round;
2487     return saddu16(env, vxrm, c, res);
2488 }
2489 
2490 static inline uint32_t
2491 vwsmaccu16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b,
2492            uint32_t c)
2493 {
2494     uint8_t round;
2495     uint32_t res = (uint32_t)a * b;
2496 
2497     round = get_round(vxrm, res, 8);
2498     res   = (res >> 8) + round;
2499     return saddu32(env, vxrm, c, res);
2500 }
2501 
2502 static inline uint64_t
2503 vwsmaccu32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b,
2504            uint64_t c)
2505 {
2506     uint8_t round;
2507     uint64_t res = (uint64_t)a * b;
2508 
2509     round = get_round(vxrm, res, 16);
2510     res   = (res >> 16) + round;
2511     return saddu64(env, vxrm, c, res);
2512 }
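/*
 * In the helpers above the full widened product is first scaled down by
 * SEW/2 bits (4, 8 or 16) using the current rounding mode and then folded
 * into the accumulator with a saturating add of the widened type.
 */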
2513 
2514 #define OPIVV3_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
2515 static inline void                                                 \
2516 do_##NAME(void *vd, void *vs1, void *vs2, int i,                   \
2517           CPURISCVState *env, int vxrm)                            \
2518 {                                                                  \
2519     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
2520     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
2521     TD d = *((TD *)vd + HD(i));                                    \
2522     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1, d);                \
2523 }
2524 
2525 RVVCALL(OPIVV3_RM, vwsmaccu_vv_b, WOP_UUU_B, H2, H1, H1, vwsmaccu8)
2526 RVVCALL(OPIVV3_RM, vwsmaccu_vv_h, WOP_UUU_H, H4, H2, H2, vwsmaccu16)
2527 RVVCALL(OPIVV3_RM, vwsmaccu_vv_w, WOP_UUU_W, H8, H4, H4, vwsmaccu32)
2528 GEN_VEXT_VV_RM(vwsmaccu_vv_b, 1, 2)
2529 GEN_VEXT_VV_RM(vwsmaccu_vv_h, 2, 4)
2530 GEN_VEXT_VV_RM(vwsmaccu_vv_w, 4, 8)
2531 
2532 #define OPIVX3_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)         \
2533 static inline void                                                 \
2534 do_##NAME(void *vd, target_long s1, void *vs2, int i,              \
2535           CPURISCVState *env, int vxrm)                            \
2536 {                                                                  \
2537     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
2538     TD d = *((TD *)vd + HD(i));                                    \
2539     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1, d);       \
2540 }
2541 
2542 RVVCALL(OPIVX3_RM, vwsmaccu_vx_b, WOP_UUU_B, H2, H1, vwsmaccu8)
2543 RVVCALL(OPIVX3_RM, vwsmaccu_vx_h, WOP_UUU_H, H4, H2, vwsmaccu16)
2544 RVVCALL(OPIVX3_RM, vwsmaccu_vx_w, WOP_UUU_W, H8, H4, vwsmaccu32)
2545 GEN_VEXT_VX_RM(vwsmaccu_vx_b, 1, 2)
2546 GEN_VEXT_VX_RM(vwsmaccu_vx_h, 2, 4)
2547 GEN_VEXT_VX_RM(vwsmaccu_vx_w, 4, 8)
2548 
2549 static inline int16_t
2550 vwsmacc8(CPURISCVState *env, int vxrm, int8_t a, int8_t b, int16_t c)
2551 {
2552     uint8_t round;
2553     int16_t res = (int16_t)a * b;
2554 
2555     round = get_round(vxrm, res, 4);
2556     res   = (res >> 4) + round;
2557     return sadd16(env, vxrm, c, res);
2558 }
2559 
2560 static inline int32_t
2561 vwsmacc16(CPURISCVState *env, int vxrm, int16_t a, int16_t b, int32_t c)
2562 {
2563     uint8_t round;
2564     int32_t res = (int32_t)a * b;
2565 
2566     round = get_round(vxrm, res, 8);
2567     res   = (res >> 8) + round;
2568     return sadd32(env, vxrm, c, res);
2569 
2570 }
2571 
2572 static inline int64_t
2573 vwsmacc32(CPURISCVState *env, int vxrm, int32_t a, int32_t b, int64_t c)
2574 {
2575     uint8_t round;
2576     int64_t res = (int64_t)a * b;
2577 
2578     round = get_round(vxrm, res, 16);
2579     res   = (res >> 16) + round;
2580     return sadd64(env, vxrm, c, res);
2581 }
2582 
2583 RVVCALL(OPIVV3_RM, vwsmacc_vv_b, WOP_SSS_B, H2, H1, H1, vwsmacc8)
2584 RVVCALL(OPIVV3_RM, vwsmacc_vv_h, WOP_SSS_H, H4, H2, H2, vwsmacc16)
2585 RVVCALL(OPIVV3_RM, vwsmacc_vv_w, WOP_SSS_W, H8, H4, H4, vwsmacc32)
2586 GEN_VEXT_VV_RM(vwsmacc_vv_b, 1, 2)
2587 GEN_VEXT_VV_RM(vwsmacc_vv_h, 2, 4)
2588 GEN_VEXT_VV_RM(vwsmacc_vv_w, 4, 8)
2589 RVVCALL(OPIVX3_RM, vwsmacc_vx_b, WOP_SSS_B, H2, H1, vwsmacc8)
2590 RVVCALL(OPIVX3_RM, vwsmacc_vx_h, WOP_SSS_H, H4, H2, vwsmacc16)
2591 RVVCALL(OPIVX3_RM, vwsmacc_vx_w, WOP_SSS_W, H8, H4, vwsmacc32)
2592 GEN_VEXT_VX_RM(vwsmacc_vx_b, 1, 2)
2593 GEN_VEXT_VX_RM(vwsmacc_vx_h, 2, 4)
2594 GEN_VEXT_VX_RM(vwsmacc_vx_w, 4, 8)
2595 
2596 static inline int16_t
2597 vwsmaccsu8(CPURISCVState *env, int vxrm, uint8_t a, int8_t b, int16_t c)
2598 {
2599     uint8_t round;
2600     int16_t res = a * (int16_t)b;
2601 
2602     round = get_round(vxrm, res, 4);
2603     res   = (res >> 4) + round;
2604     return ssub16(env, vxrm, c, res);
2605 }
2606 
2607 static inline int32_t
2608 vwsmaccsu16(CPURISCVState *env, int vxrm, uint16_t a, int16_t b, uint32_t c)
2609 {
2610     uint8_t round;
2611     int32_t res = a * (int32_t)b;
2612 
2613     round = get_round(vxrm, res, 8);
2614     res   = (res >> 8) + round;
2615     return ssub32(env, vxrm, c, res);
2616 }
2617 
2618 static inline int64_t
2619 vwsmaccsu32(CPURISCVState *env, int vxrm, uint32_t a, int32_t b, int64_t c)
2620 {
2621     uint8_t round;
2622     int64_t res = a * (int64_t)b;
2623 
2624     round = get_round(vxrm, res, 16);
2625     res   = (res >> 16) + round;
2626     return ssub64(env, vxrm, c, res);
2627 }
2628 
2629 RVVCALL(OPIVV3_RM, vwsmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, vwsmaccsu8)
2630 RVVCALL(OPIVV3_RM, vwsmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, vwsmaccsu16)
2631 RVVCALL(OPIVV3_RM, vwsmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, vwsmaccsu32)
2632 GEN_VEXT_VV_RM(vwsmaccsu_vv_b, 1, 2)
2633 GEN_VEXT_VV_RM(vwsmaccsu_vv_h, 2, 4)
2634 GEN_VEXT_VV_RM(vwsmaccsu_vv_w, 4, 8)
2635 RVVCALL(OPIVX3_RM, vwsmaccsu_vx_b, WOP_SSU_B, H2, H1, vwsmaccsu8)
2636 RVVCALL(OPIVX3_RM, vwsmaccsu_vx_h, WOP_SSU_H, H4, H2, vwsmaccsu16)
2637 RVVCALL(OPIVX3_RM, vwsmaccsu_vx_w, WOP_SSU_W, H8, H4, vwsmaccsu32)
2638 GEN_VEXT_VX_RM(vwsmaccsu_vx_b, 1, 2)
2639 GEN_VEXT_VX_RM(vwsmaccsu_vx_h, 2, 4)
2640 GEN_VEXT_VX_RM(vwsmaccsu_vx_w, 4, 8)
2641 
2642 static inline int16_t
2643 vwsmaccus8(CPURISCVState *env, int vxrm, int8_t a, uint8_t b, int16_t c)
2644 {
2645     uint8_t round;
2646     int16_t res = (int16_t)a * b;
2647 
2648     round = get_round(vxrm, res, 4);
2649     res   = (res >> 4) + round;
2650     return ssub16(env, vxrm, c, res);
2651 }
2652 
2653 static inline int32_t
2654 vwsmaccus16(CPURISCVState *env, int vxrm, int16_t a, uint16_t b, int32_t c)
2655 {
2656     uint8_t round;
2657     int32_t res = (int32_t)a * b;
2658 
2659     round = get_round(vxrm, res, 8);
2660     res   = (res >> 8) + round;
2661     return ssub32(env, vxrm, c, res);
2662 }
2663 
2664 static inline int64_t
2665 vwsmaccus32(CPURISCVState *env, int vxrm, int32_t a, uint32_t b, int64_t c)
2666 {
2667     uint8_t round;
2668     int64_t res = (int64_t)a * b;
2669 
2670     round = get_round(vxrm, res, 16);
2671     res   = (res >> 16) + round;
2672     return ssub64(env, vxrm, c, res);
2673 }
2674 
2675 RVVCALL(OPIVX3_RM, vwsmaccus_vx_b, WOP_SUS_B, H2, H1, vwsmaccus8)
2676 RVVCALL(OPIVX3_RM, vwsmaccus_vx_h, WOP_SUS_H, H4, H2, vwsmaccus16)
2677 RVVCALL(OPIVX3_RM, vwsmaccus_vx_w, WOP_SUS_W, H8, H4, vwsmaccus32)
2678 GEN_VEXT_VX_RM(vwsmaccus_vx_b, 1, 2)
2679 GEN_VEXT_VX_RM(vwsmaccus_vx_h, 2, 4)
2680 GEN_VEXT_VX_RM(vwsmaccus_vx_w, 4, 8)
2681 
2682 /* Vector Single-Width Scaling Shift Instructions */
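/*
 * vssrl/vssra shift right by the low log2(SEW) bits of the second operand
 * and add a rounding increment derived from the discarded bits, as selected
 * by vxrm (see get_round()).  A minimal worked example for round-to-nearest-up
 * (vxrm == 0):
 *
 *     a = 0b0110 (6), shift = 2
 *     round = most significant discarded bit = 1
 *     result = (a >> 2) + round = 1 + 1 = 2      (6 / 4 = 1.5 rounds up)
 */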
2683 static inline uint8_t
2684 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2685 {
2686     uint8_t round, shift = b & 0x7;
2687     uint8_t res;
2688 
2689     round = get_round(vxrm, a, shift);
2690     res   = (a >> shift)  + round;
2691     return res;
2692 }
2693 static inline uint16_t
2694 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2695 {
2696     uint8_t round, shift = b & 0xf;
2697     uint16_t res;
2698 
2699     round = get_round(vxrm, a, shift);
2700     res   = (a >> shift)  + round;
2701     return res;
2702 }
2703 static inline uint32_t
2704 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2705 {
2706     uint8_t round, shift = b & 0x1f;
2707     uint32_t res;
2708 
2709     round = get_round(vxrm, a, shift);
2710     res   = (a >> shift)  + round;
2711     return res;
2712 }
2713 static inline uint64_t
2714 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2715 {
2716     uint8_t round, shift = b & 0x3f;
2717     uint64_t res;
2718 
2719     round = get_round(vxrm, a, shift);
2720     res   = (a >> shift)  + round;
2721     return res;
2722 }
2723 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2724 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2725 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2726 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2727 GEN_VEXT_VV_RM(vssrl_vv_b, 1, 1)
2728 GEN_VEXT_VV_RM(vssrl_vv_h, 2, 2)
2729 GEN_VEXT_VV_RM(vssrl_vv_w, 4, 4)
2730 GEN_VEXT_VV_RM(vssrl_vv_d, 8, 8)
2731 
2732 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2733 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2734 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2735 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2736 GEN_VEXT_VX_RM(vssrl_vx_b, 1, 1)
2737 GEN_VEXT_VX_RM(vssrl_vx_h, 2, 2)
2738 GEN_VEXT_VX_RM(vssrl_vx_w, 4, 4)
2739 GEN_VEXT_VX_RM(vssrl_vx_d, 8, 8)
2740 
2741 static inline int8_t
2742 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2743 {
2744     uint8_t round, shift = b & 0x7;
2745     int8_t res;
2746 
2747     round = get_round(vxrm, a, shift);
2748     res   = (a >> shift)  + round;
2749     return res;
2750 }
2751 static inline int16_t
2752 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2753 {
2754     uint8_t round, shift = b & 0xf;
2755     int16_t res;
2756 
2757     round = get_round(vxrm, a, shift);
2758     res   = (a >> shift)  + round;
2759     return res;
2760 }
2761 static inline int32_t
2762 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2763 {
2764     uint8_t round, shift = b & 0x1f;
2765     int32_t res;
2766 
2767     round = get_round(vxrm, a, shift);
2768     res   = (a >> shift)  + round;
2769     return res;
2770 }
2771 static inline int64_t
2772 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2773 {
2774     uint8_t round, shift = b & 0x3f;
2775     int64_t res;
2776 
2777     round = get_round(vxrm, a, shift);
2778     res   = (a >> shift)  + round;
2779     return res;
2780 }
2781 
2782 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2783 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2784 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2785 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2786 GEN_VEXT_VV_RM(vssra_vv_b, 1, 1)
2787 GEN_VEXT_VV_RM(vssra_vv_h, 2, 2)
2788 GEN_VEXT_VV_RM(vssra_vv_w, 4, 4)
2789 GEN_VEXT_VV_RM(vssra_vv_d, 8, 8)
2790 
2791 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2792 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2793 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2794 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2795 GEN_VEXT_VX_RM(vssra_vx_b, 1, 1)
2796 GEN_VEXT_VX_RM(vssra_vx_h, 2, 2)
2797 GEN_VEXT_VX_RM(vssra_vx_w, 4, 4)
2798 GEN_VEXT_VX_RM(vssra_vx_d, 8, 8)
2799 
2800 /* Vector Narrowing Fixed-Point Clip Instructions */
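/*
 * vnclip/vnclipu round and shift the 2*SEW source in the same way as
 * vssra/vssrl, then clamp the result to the destination width and record
 * any saturation in vxsat.  For example, vnclip8 with a = 0x1234 and
 * shift = 0 produces a value above INT8_MAX, so it returns INT8_MAX (0x7f)
 * and sets vxsat.
 */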
2801 static inline int8_t
2802 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2803 {
2804     uint8_t round, shift = b & 0xf;
2805     int16_t res;
2806 
2807     round = get_round(vxrm, a, shift);
2808     res   = (a >> shift)  + round;
2809     if (res > INT8_MAX) {
2810         env->vxsat = 0x1;
2811         return INT8_MAX;
2812     } else if (res < INT8_MIN) {
2813         env->vxsat = 0x1;
2814         return INT8_MIN;
2815     } else {
2816         return res;
2817     }
2818 }
2819 
2820 static inline int16_t
2821 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2822 {
2823     uint8_t round, shift = b & 0x1f;
2824     int32_t res;
2825 
2826     round = get_round(vxrm, a, shift);
2827     res   = (a >> shift)  + round;
2828     if (res > INT16_MAX) {
2829         env->vxsat = 0x1;
2830         return INT16_MAX;
2831     } else if (res < INT16_MIN) {
2832         env->vxsat = 0x1;
2833         return INT16_MIN;
2834     } else {
2835         return res;
2836     }
2837 }
2838 
2839 static inline int32_t
2840 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2841 {
2842     uint8_t round, shift = b & 0x3f;
2843     int64_t res;
2844 
2845     round = get_round(vxrm, a, shift);
2846     res   = (a >> shift)  + round;
2847     if (res > INT32_MAX) {
2848         env->vxsat = 0x1;
2849         return INT32_MAX;
2850     } else if (res < INT32_MIN) {
2851         env->vxsat = 0x1;
2852         return INT32_MIN;
2853     } else {
2854         return res;
2855     }
2856 }
2857 
2858 RVVCALL(OPIVV2_RM, vnclip_vv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2859 RVVCALL(OPIVV2_RM, vnclip_vv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2860 RVVCALL(OPIVV2_RM, vnclip_vv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2861 GEN_VEXT_VV_RM(vnclip_vv_b, 1, 1)
2862 GEN_VEXT_VV_RM(vnclip_vv_h, 2, 2)
2863 GEN_VEXT_VV_RM(vnclip_vv_w, 4, 4)
2864 
2865 RVVCALL(OPIVX2_RM, vnclip_vx_b, NOP_SSS_B, H1, H2, vnclip8)
2866 RVVCALL(OPIVX2_RM, vnclip_vx_h, NOP_SSS_H, H2, H4, vnclip16)
2867 RVVCALL(OPIVX2_RM, vnclip_vx_w, NOP_SSS_W, H4, H8, vnclip32)
2868 GEN_VEXT_VX_RM(vnclip_vx_b, 1, 1)
2869 GEN_VEXT_VX_RM(vnclip_vx_h, 2, 2)
2870 GEN_VEXT_VX_RM(vnclip_vx_w, 4, 4)
2871 
2872 static inline uint8_t
2873 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2874 {
2875     uint8_t round, shift = b & 0xf;
2876     uint16_t res;
2877 
2878     round = get_round(vxrm, a, shift);
2879     res   = (a >> shift)  + round;
2880     if (res > UINT8_MAX) {
2881         env->vxsat = 0x1;
2882         return UINT8_MAX;
2883     } else {
2884         return res;
2885     }
2886 }
2887 
2888 static inline uint16_t
2889 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2890 {
2891     uint8_t round, shift = b & 0x1f;
2892     uint32_t res;
2893 
2894     round = get_round(vxrm, a, shift);
2895     res   = (a >> shift)  + round;
2896     if (res > UINT16_MAX) {
2897         env->vxsat = 0x1;
2898         return UINT16_MAX;
2899     } else {
2900         return res;
2901     }
2902 }
2903 
2904 static inline uint32_t
2905 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2906 {
2907     uint8_t round, shift = b & 0x3f;
2908     uint64_t res;
2909 
2910     round = get_round(vxrm, a, shift);
2911     res   = (a >> shift)  + round;
2912     if (res > UINT32_MAX) {
2913         env->vxsat = 0x1;
2914         return UINT32_MAX;
2915     } else {
2916         return res;
2917     }
2918 }
2919 
2920 RVVCALL(OPIVV2_RM, vnclipu_vv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2921 RVVCALL(OPIVV2_RM, vnclipu_vv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2922 RVVCALL(OPIVV2_RM, vnclipu_vv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2923 GEN_VEXT_VV_RM(vnclipu_vv_b, 1, 1)
2924 GEN_VEXT_VV_RM(vnclipu_vv_h, 2, 2)
2925 GEN_VEXT_VV_RM(vnclipu_vv_w, 4, 4)
2926 
2927 RVVCALL(OPIVX2_RM, vnclipu_vx_b, NOP_UUU_B, H1, H2, vnclipu8)
2928 RVVCALL(OPIVX2_RM, vnclipu_vx_h, NOP_UUU_H, H2, H4, vnclipu16)
2929 RVVCALL(OPIVX2_RM, vnclipu_vx_w, NOP_UUU_W, H4, H8, vnclipu32)
2930 GEN_VEXT_VX_RM(vnclipu_vx_b, 1, 1)
2931 GEN_VEXT_VX_RM(vnclipu_vx_h, 2, 2)
2932 GEN_VEXT_VX_RM(vnclipu_vx_w, 4, 4)
2933 
2934 /*
2935  *** Vector Floating-Point Arithmetic Instructions
2936  */
2937 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2938 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2939 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2940                       CPURISCVState *env)                      \
2941 {                                                              \
2942     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2943     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2944     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2945 }
2946 
2947 #define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ)                   \
2948 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2949                   void *vs2, CPURISCVState *env,          \
2950                   uint32_t desc)                          \
2951 {                                                         \
2952     uint32_t vm = vext_vm(desc);                          \
2953     uint32_t vl = env->vl;                                \
2954     uint32_t i;                                           \
2955                                                           \
2956     for (i = 0; i < vl; i++) {                            \
2957         if (!vm && !vext_elem_mask(v0, i)) {              \
2958             continue;                                     \
2959         }                                                 \
2960         do_##NAME(vd, vs1, vs2, i, env);                  \
2961     }                                                     \
2962 }
2963 
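/*
 * As a reading aid only (this is not compiled): assuming OP_UUU_H expands
 * to five uint16_t types, RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2,
 * float16_add) roughly produces
 *
 *     static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                               CPURISCVState *env)
 *     {
 *         uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *         *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *     }
 *
 * and GEN_VEXT_VV_ENV then wraps it in the masked element loop above.
 */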
2964 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2965 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2966 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2967 GEN_VEXT_VV_ENV(vfadd_vv_h, 2, 2)
2968 GEN_VEXT_VV_ENV(vfadd_vv_w, 4, 4)
2969 GEN_VEXT_VV_ENV(vfadd_vv_d, 8, 8)
2970 
2971 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2972 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2973                       CPURISCVState *env)                      \
2974 {                                                              \
2975     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2976     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2977 }
2978 
2979 #define GEN_VEXT_VF(NAME, ESZ, DSZ)                       \
2980 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2981                   void *vs2, CPURISCVState *env,          \
2982                   uint32_t desc)                          \
2983 {                                                         \
2984     uint32_t vm = vext_vm(desc);                          \
2985     uint32_t vl = env->vl;                                \
2986     uint32_t i;                                           \
2987                                                           \
2988     for (i = 0; i < vl; i++) {                            \
2989         if (!vm && !vext_elem_mask(v0, i)) {              \
2990             continue;                                     \
2991         }                                                 \
2992         do_##NAME(vd, s1, vs2, i, env);                   \
2993     }                                                     \
2994 }
2995 
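/*
 * For the .vf forms the scalar f[rs1] is passed in as a uint64_t and is
 * narrowed to the element type by the (TX1)(T1)s1 cast in OPFVF2 above.
 */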
2996 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
2997 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
2998 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
2999 GEN_VEXT_VF(vfadd_vf_h, 2, 2)
3000 GEN_VEXT_VF(vfadd_vf_w, 4, 4)
3001 GEN_VEXT_VF(vfadd_vf_d, 8, 8)
3002 
3003 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3004 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3005 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3006 GEN_VEXT_VV_ENV(vfsub_vv_h, 2, 2)
3007 GEN_VEXT_VV_ENV(vfsub_vv_w, 4, 4)
3008 GEN_VEXT_VV_ENV(vfsub_vv_d, 8, 8)
3009 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3010 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3011 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3012 GEN_VEXT_VF(vfsub_vf_h, 2, 2)
3013 GEN_VEXT_VF(vfsub_vf_w, 4, 4)
3014 GEN_VEXT_VF(vfsub_vf_d, 8, 8)
3015 
3016 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3017 {
3018     return float16_sub(b, a, s);
3019 }
3020 
3021 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3022 {
3023     return float32_sub(b, a, s);
3024 }
3025 
3026 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3027 {
3028     return float64_sub(b, a, s);
3029 }
3030 
3031 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3032 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3033 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3034 GEN_VEXT_VF(vfrsub_vf_h, 2, 2)
3035 GEN_VEXT_VF(vfrsub_vf_w, 4, 4)
3036 GEN_VEXT_VF(vfrsub_vf_d, 8, 8)
3037 
3038 /* Vector Widening Floating-Point Add/Subtract Instructions */
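/*
 * The f16 operands are first widened with float16_to_float32(x, true, s);
 * the boolean argument requests IEEE half-precision handling (rather than
 * the ARM alternative format) from QEMU's softfloat.
 */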
3039 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3040 {
3041     return float32_add(float16_to_float32(a, true, s),
3042             float16_to_float32(b, true, s), s);
3043 }
3044 
3045 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3046 {
3047     return float64_add(float32_to_float64(a, s),
3048             float32_to_float64(b, s), s);
3049 
3050 }
3051 
3052 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3053 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3054 GEN_VEXT_VV_ENV(vfwadd_vv_h, 2, 4)
3055 GEN_VEXT_VV_ENV(vfwadd_vv_w, 4, 8)
3056 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3057 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3058 GEN_VEXT_VF(vfwadd_vf_h, 2, 4)
3059 GEN_VEXT_VF(vfwadd_vf_w, 4, 8)
3060 
3061 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3062 {
3063     return float32_sub(float16_to_float32(a, true, s),
3064             float16_to_float32(b, true, s), s);
3065 }
3066 
3067 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3068 {
3069     return float64_sub(float32_to_float64(a, s),
3070             float32_to_float64(b, s), s);
3071 
3072 }
3073 
3074 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3075 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3076 GEN_VEXT_VV_ENV(vfwsub_vv_h, 2, 4)
3077 GEN_VEXT_VV_ENV(vfwsub_vv_w, 4, 8)
3078 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3079 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3080 GEN_VEXT_VF(vfwsub_vf_h, 2, 4)
3081 GEN_VEXT_VF(vfwsub_vf_w, 4, 8)
3082 
3083 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3084 {
3085     return float32_add(a, float16_to_float32(b, true, s), s);
3086 }
3087 
3088 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3089 {
3090     return float64_add(a, float32_to_float64(b, s), s);
3091 }
3092 
3093 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3094 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3095 GEN_VEXT_VV_ENV(vfwadd_wv_h, 2, 4)
3096 GEN_VEXT_VV_ENV(vfwadd_wv_w, 4, 8)
3097 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3098 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3099 GEN_VEXT_VF(vfwadd_wf_h, 2, 4)
3100 GEN_VEXT_VF(vfwadd_wf_w, 4, 8)
3101 
3102 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3103 {
3104     return float32_sub(a, float16_to_float32(b, true, s), s);
3105 }
3106 
3107 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3108 {
3109     return float64_sub(a, float32_to_float64(b, s), s);
3110 }
3111 
3112 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3113 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3114 GEN_VEXT_VV_ENV(vfwsub_wv_h, 2, 4)
3115 GEN_VEXT_VV_ENV(vfwsub_wv_w, 4, 8)
3116 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3117 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3118 GEN_VEXT_VF(vfwsub_wf_h, 2, 4)
3119 GEN_VEXT_VF(vfwsub_wf_w, 4, 8)
3120 
3121 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3122 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3123 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3124 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3125 GEN_VEXT_VV_ENV(vfmul_vv_h, 2, 2)
3126 GEN_VEXT_VV_ENV(vfmul_vv_w, 4, 4)
3127 GEN_VEXT_VV_ENV(vfmul_vv_d, 8, 8)
3128 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3129 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3130 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3131 GEN_VEXT_VF(vfmul_vf_h, 2, 2)
3132 GEN_VEXT_VF(vfmul_vf_w, 4, 4)
3133 GEN_VEXT_VF(vfmul_vf_d, 8, 8)
3134 
3135 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3136 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3137 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3138 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2, 2)
3139 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4, 4)
3140 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8, 8)
3141 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3142 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3143 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3144 GEN_VEXT_VF(vfdiv_vf_h, 2, 2)
3145 GEN_VEXT_VF(vfdiv_vf_w, 4, 4)
3146 GEN_VEXT_VF(vfdiv_vf_d, 8, 8)
3147 
3148 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3149 {
3150     return float16_div(b, a, s);
3151 }
3152 
3153 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3154 {
3155     return float32_div(b, a, s);
3156 }
3157 
3158 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3159 {
3160     return float64_div(b, a, s);
3161 }
3162 
3163 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3164 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3165 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3166 GEN_VEXT_VF(vfrdiv_vf_h, 2, 2)
3167 GEN_VEXT_VF(vfrdiv_vf_w, 4, 4)
3168 GEN_VEXT_VF(vfrdiv_vf_d, 8, 8)
3169 
3170 /* Vector Widening Floating-Point Multiply */
3171 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3172 {
3173     return float32_mul(float16_to_float32(a, true, s),
3174             float16_to_float32(b, true, s), s);
3175 }
3176 
3177 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3178 {
3179     return float64_mul(float32_to_float64(a, s),
3180             float32_to_float64(b, s), s);
3181 }
3182 
3183 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3184 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3185 GEN_VEXT_VV_ENV(vfwmul_vv_h, 2, 4)
3186 GEN_VEXT_VV_ENV(vfwmul_vv_w, 4, 8)
3187 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3188 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3189 GEN_VEXT_VF(vfwmul_vf_h, 2, 4)
3190 GEN_VEXT_VF(vfwmul_vf_w, 4, 8)
3191 
3192 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3193 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3194 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3195         CPURISCVState *env)                                        \
3196 {                                                                  \
3197     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3198     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3199     TD d = *((TD *)vd + HD(i));                                    \
3200     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3201 }
3202 
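/*
 * OPFVV3/OPFVF3 invoke OP(s2, s1, d): in the vf[n]macc/vf[n]msac helpers
 * the old destination value d is the addend, while the vf[n]madd/vf[n]msub
 * helpers further below swap the arguments so that d becomes a multiplicand
 * and vs2 the addend.
 */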
3203 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3204 {
3205     return float16_muladd(a, b, d, 0, s);
3206 }
3207 
3208 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3209 {
3210     return float32_muladd(a, b, d, 0, s);
3211 }
3212 
3213 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3214 {
3215     return float64_muladd(a, b, d, 0, s);
3216 }
3217 
3218 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3219 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3220 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3221 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2, 2)
3222 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4, 4)
3223 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8, 8)
3224 
3225 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3226 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3227         CPURISCVState *env)                                       \
3228 {                                                                 \
3229     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3230     TD d = *((TD *)vd + HD(i));                                   \
3231     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3232 }
3233 
3234 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3235 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3236 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3237 GEN_VEXT_VF(vfmacc_vf_h, 2, 2)
3238 GEN_VEXT_VF(vfmacc_vf_w, 4, 4)
3239 GEN_VEXT_VF(vfmacc_vf_d, 8, 8)
3240 
3241 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3242 {
3243     return float16_muladd(a, b, d,
3244             float_muladd_negate_c | float_muladd_negate_product, s);
3245 }
3246 
3247 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3248 {
3249     return float32_muladd(a, b, d,
3250             float_muladd_negate_c | float_muladd_negate_product, s);
3251 }
3252 
3253 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3254 {
3255     return float64_muladd(a, b, d,
3256             float_muladd_negate_c | float_muladd_negate_product, s);
3257 }
3258 
3259 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3260 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3261 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3262 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2, 2)
3263 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4, 4)
3264 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8, 8)
3265 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3266 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3267 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3268 GEN_VEXT_VF(vfnmacc_vf_h, 2, 2)
3269 GEN_VEXT_VF(vfnmacc_vf_w, 4, 4)
3270 GEN_VEXT_VF(vfnmacc_vf_d, 8, 8)
3271 
3272 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3273 {
3274     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3275 }
3276 
3277 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3278 {
3279     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3280 }
3281 
3282 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3283 {
3284     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3285 }
3286 
3287 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3288 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3289 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3290 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2, 2)
3291 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4, 4)
3292 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8, 8)
3293 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3294 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3295 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3296 GEN_VEXT_VF(vfmsac_vf_h, 2, 2)
3297 GEN_VEXT_VF(vfmsac_vf_w, 4, 4)
3298 GEN_VEXT_VF(vfmsac_vf_d, 8, 8)
3299 
3300 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3301 {
3302     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3303 }
3304 
3305 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3306 {
3307     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3308 }
3309 
3310 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3311 {
3312     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3313 }
3314 
3315 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3316 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3317 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3318 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2, 2)
3319 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4, 4)
3320 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8, 8)
3321 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3322 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3323 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3324 GEN_VEXT_VF(vfnmsac_vf_h, 2, 2)
3325 GEN_VEXT_VF(vfnmsac_vf_w, 4, 4)
3326 GEN_VEXT_VF(vfnmsac_vf_d, 8, 8)
3327 
3328 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3329 {
3330     return float16_muladd(d, b, a, 0, s);
3331 }
3332 
3333 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3334 {
3335     return float32_muladd(d, b, a, 0, s);
3336 }
3337 
3338 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3339 {
3340     return float64_muladd(d, b, a, 0, s);
3341 }
3342 
3343 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3344 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3345 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3346 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2, 2)
3347 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4, 4)
3348 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8, 8)
3349 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3350 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3351 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3352 GEN_VEXT_VF(vfmadd_vf_h, 2, 2)
3353 GEN_VEXT_VF(vfmadd_vf_w, 4, 4)
3354 GEN_VEXT_VF(vfmadd_vf_d, 8, 8)
3355 
3356 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3357 {
3358     return float16_muladd(d, b, a,
3359             float_muladd_negate_c | float_muladd_negate_product, s);
3360 }
3361 
3362 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3363 {
3364     return float32_muladd(d, b, a,
3365             float_muladd_negate_c | float_muladd_negate_product, s);
3366 }
3367 
3368 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3369 {
3370     return float64_muladd(d, b, a,
3371             float_muladd_negate_c | float_muladd_negate_product, s);
3372 }
3373 
3374 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3375 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3376 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3377 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2, 2)
3378 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4, 4)
3379 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8, 8)
3380 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3381 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3382 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3383 GEN_VEXT_VF(vfnmadd_vf_h, 2, 2)
3384 GEN_VEXT_VF(vfnmadd_vf_w, 4, 4)
3385 GEN_VEXT_VF(vfnmadd_vf_d, 8, 8)
3386 
3387 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3388 {
3389     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3390 }
3391 
3392 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3393 {
3394     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3395 }
3396 
3397 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3398 {
3399     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3400 }
3401 
3402 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3403 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3404 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3405 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2, 2)
3406 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4, 4)
3407 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8, 8)
3408 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3409 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3410 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3411 GEN_VEXT_VF(vfmsub_vf_h, 2, 2)
3412 GEN_VEXT_VF(vfmsub_vf_w, 4, 4)
3413 GEN_VEXT_VF(vfmsub_vf_d, 8, 8)
3414 
3415 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3416 {
3417     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3418 }
3419 
3420 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3421 {
3422     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3423 }
3424 
3425 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3426 {
3427     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3428 }
3429 
3430 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3431 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3432 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3433 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2, 2)
3434 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4, 4)
3435 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8, 8)
3436 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3437 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3438 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3439 GEN_VEXT_VF(vfnmsub_vf_h, 2, 2)
3440 GEN_VEXT_VF(vfnmsub_vf_w, 4, 4)
3441 GEN_VEXT_VF(vfnmsub_vf_d, 8, 8)
3442 
3443 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3444 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3445 {
3446     return float32_muladd(float16_to_float32(a, true, s),
3447                         float16_to_float32(b, true, s), d, 0, s);
3448 }
3449 
3450 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3451 {
3452     return float64_muladd(float32_to_float64(a, s),
3453                         float32_to_float64(b, s), d, 0, s);
3454 }
3455 
3456 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3457 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3458 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 2, 4)
3459 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 4, 8)
3460 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3461 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3462 GEN_VEXT_VF(vfwmacc_vf_h, 2, 4)
3463 GEN_VEXT_VF(vfwmacc_vf_w, 4, 8)
3464 
3465 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3466 {
3467     return float32_muladd(float16_to_float32(a, true, s),
3468                         float16_to_float32(b, true, s), d,
3469                         float_muladd_negate_c | float_muladd_negate_product, s);
3470 }
3471 
3472 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3473 {
3474     return float64_muladd(float32_to_float64(a, s),
3475                         float32_to_float64(b, s), d,
3476                         float_muladd_negate_c | float_muladd_negate_product, s);
3477 }
3478 
3479 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3480 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3481 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 2, 4)
3482 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 4, 8)
3483 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3484 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3485 GEN_VEXT_VF(vfwnmacc_vf_h, 2, 4)
3486 GEN_VEXT_VF(vfwnmacc_vf_w, 4, 8)
3487 
3488 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3489 {
3490     return float32_muladd(float16_to_float32(a, true, s),
3491                         float16_to_float32(b, true, s), d,
3492                         float_muladd_negate_c, s);
3493 }
3494 
3495 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3496 {
3497     return float64_muladd(float32_to_float64(a, s),
3498                         float32_to_float64(b, s), d,
3499                         float_muladd_negate_c, s);
3500 }
3501 
3502 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3503 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3504 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 2, 4)
3505 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 4, 8)
3506 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3507 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3508 GEN_VEXT_VF(vfwmsac_vf_h, 2, 4)
3509 GEN_VEXT_VF(vfwmsac_vf_w, 4, 8)
3510 
3511 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3512 {
3513     return float32_muladd(float16_to_float32(a, true, s),
3514                         float16_to_float32(b, true, s), d,
3515                         float_muladd_negate_product, s);
3516 }
3517 
3518 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3519 {
3520     return float64_muladd(float32_to_float64(a, s),
3521                         float32_to_float64(b, s), d,
3522                         float_muladd_negate_product, s);
3523 }
3524 
3525 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3526 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3527 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 2, 4)
3528 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 4, 8)
3529 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3530 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3531 GEN_VEXT_VF(vfwnmsac_vf_h, 2, 4)
3532 GEN_VEXT_VF(vfwnmsac_vf_w, 4, 8)
3533 
3534 /* Vector Floating-Point Square-Root Instruction */
3535 /* (TD, T2, TX2) */
3536 #define OP_UU_H uint16_t, uint16_t, uint16_t
3537 #define OP_UU_W uint32_t, uint32_t, uint32_t
3538 #define OP_UU_D uint64_t, uint64_t, uint64_t
3539 
3540 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3541 static void do_##NAME(void *vd, void *vs2, int i,      \
3542         CPURISCVState *env)                            \
3543 {                                                      \
3544     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3545     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3546 }
3547 
3548 #define GEN_VEXT_V_ENV(NAME, ESZ, DSZ)                 \
3549 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3550         CPURISCVState *env, uint32_t desc)             \
3551 {                                                      \
3552     uint32_t vm = vext_vm(desc);                       \
3553     uint32_t vl = env->vl;                             \
3554     uint32_t i;                                        \
3555                                                        \
3556     if (vl == 0) {                                     \
3557         return;                                        \
3558     }                                                  \
3559     for (i = 0; i < vl; i++) {                         \
3560         if (!vm && !vext_elem_mask(v0, i)) {           \
3561             continue;                                  \
3562         }                                              \
3563         do_##NAME(vd, vs2, i, env);                    \
3564     }                                                  \
3565 }
3566 
3567 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3568 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3569 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3570 GEN_VEXT_V_ENV(vfsqrt_v_h, 2, 2)
3571 GEN_VEXT_V_ENV(vfsqrt_v_w, 4, 4)
3572 GEN_VEXT_V_ENV(vfsqrt_v_d, 8, 8)
3573 
3574 /* Vector Floating-Point MIN/MAX Instructions */
3575 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minnum)
3576 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minnum)
3577 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minnum)
3578 GEN_VEXT_VV_ENV(vfmin_vv_h, 2, 2)
3579 GEN_VEXT_VV_ENV(vfmin_vv_w, 4, 4)
3580 GEN_VEXT_VV_ENV(vfmin_vv_d, 8, 8)
3581 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minnum)
3582 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minnum)
3583 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minnum)
3584 GEN_VEXT_VF(vfmin_vf_h, 2, 2)
3585 GEN_VEXT_VF(vfmin_vf_w, 4, 4)
3586 GEN_VEXT_VF(vfmin_vf_d, 8, 8)
3587 
3588 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maxnum)
3589 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maxnum)
3590 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maxnum)
3591 GEN_VEXT_VV_ENV(vfmax_vv_h, 2, 2)
3592 GEN_VEXT_VV_ENV(vfmax_vv_w, 4, 4)
3593 GEN_VEXT_VV_ENV(vfmax_vv_d, 8, 8)
3594 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maxnum)
3595 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maxnum)
3596 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maxnum)
3597 GEN_VEXT_VF(vfmax_vf_h, 2, 2)
3598 GEN_VEXT_VF(vfmax_vf_w, 4, 4)
3599 GEN_VEXT_VF(vfmax_vf_d, 8, 8)
3600 
3601 /* Vector Floating-Point Sign-Injection Instructions */
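/*
 * The sign-injection helpers use deposit64(): the result keeps the low
 * SEW-1 bits (exponent and mantissa) of the first argument and takes its
 * sign bit from the second, e.g. fsgnj16(a, b) = { sign(b), magnitude(a) }.
 */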
3602 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3603 {
3604     return deposit64(b, 0, 15, a);
3605 }
3606 
3607 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3608 {
3609     return deposit64(b, 0, 31, a);
3610 }
3611 
3612 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3613 {
3614     return deposit64(b, 0, 63, a);
3615 }
3616 
3617 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3618 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3619 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3620 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2, 2)
3621 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4, 4)
3622 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8, 8)
3623 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3624 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3625 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3626 GEN_VEXT_VF(vfsgnj_vf_h, 2, 2)
3627 GEN_VEXT_VF(vfsgnj_vf_w, 4, 4)
3628 GEN_VEXT_VF(vfsgnj_vf_d, 8, 8)
3629 
3630 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3631 {
3632     return deposit64(~b, 0, 15, a);
3633 }
3634 
3635 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3636 {
3637     return deposit64(~b, 0, 31, a);
3638 }
3639 
3640 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3641 {
3642     return deposit64(~b, 0, 63, a);
3643 }
3644 
3645 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
3646 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
3647 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
3648 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2, 2)
3649 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4, 4)
3650 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8, 8)
3651 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
3652 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
3653 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
3654 GEN_VEXT_VF(vfsgnjn_vf_h, 2, 2)
3655 GEN_VEXT_VF(vfsgnjn_vf_w, 4, 4)
3656 GEN_VEXT_VF(vfsgnjn_vf_d, 8, 8)
3657 
3658 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
3659 {
3660     return deposit64(b ^ a, 0, 15, a);
3661 }
3662 
3663 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
3664 {
3665     return deposit64(b ^ a, 0, 31, a);
3666 }
3667 
3668 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
3669 {
3670     return deposit64(b ^ a, 0, 63, a);
3671 }
3672 
3673 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
3674 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
3675 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
3676 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2, 2)
3677 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4, 4)
3678 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8, 8)
3679 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
3680 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
3681 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
3682 GEN_VEXT_VF(vfsgnjx_vf_h, 2, 2)
3683 GEN_VEXT_VF(vfsgnjx_vf_w, 4, 4)
3684 GEN_VEXT_VF(vfsgnjx_vf_d, 8, 8)
3685 
3686 /* Vector Floating-Point Compare Instructions */
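/*
 * The compare generators write one mask bit per element: active elements
 * receive the comparison result, inactive elements are left untouched, and
 * the tail bits from vl up to vlmax are cleared.
 */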
3687 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
3688 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
3689                   CPURISCVState *env, uint32_t desc)          \
3690 {                                                             \
3691     uint32_t vm = vext_vm(desc);                              \
3692     uint32_t vl = env->vl;                                    \
3693     uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);        \
3694     uint32_t i;                                               \
3695                                                               \
3696     for (i = 0; i < vl; i++) {                                \
3697         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
3698         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
3699         if (!vm && !vext_elem_mask(v0, i)) {                  \
3700             continue;                                         \
3701         }                                                     \
3702         vext_set_elem_mask(vd, i,                             \
3703                            DO_OP(s2, s1, &env->fp_status));   \
3704     }                                                         \
3705     for (; i < vlmax; i++) {                                  \
3706         vext_set_elem_mask(vd, i, 0);                         \
3707     }                                                         \
3708 }
3709 
3710 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
3711 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
3712 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
3713 
3714 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
3715 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
3716                   CPURISCVState *env, uint32_t desc)                \
3717 {                                                                   \
3718     uint32_t vm = vext_vm(desc);                                    \
3719     uint32_t vl = env->vl;                                          \
3720     uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);              \
3721     uint32_t i;                                                     \
3722                                                                     \
3723     for (i = 0; i < vl; i++) {                                      \
3724         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
3725         if (!vm && !vext_elem_mask(v0, i)) {                        \
3726             continue;                                               \
3727         }                                                           \
3728         vext_set_elem_mask(vd, i,                                   \
3729                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
3730     }                                                               \
3731     for (; i < vlmax; i++) {                                        \
3732         vext_set_elem_mask(vd, i, 0);                               \
3733     }                                                               \
3734 }
3735 
3736 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
3737 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
3738 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
3739 
3740 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
3741 {
3742     FloatRelation compare = float16_compare_quiet(a, b, s);
3743     return compare != float_relation_equal;
3744 }
3745 
3746 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
3747 {
3748     FloatRelation compare = float32_compare_quiet(a, b, s);
3749     return compare != float_relation_equal;
3750 }
3751 
3752 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
3753 {
3754     FloatRelation compare = float64_compare_quiet(a, b, s);
3755     return compare != float_relation_equal;
3756 }
3757 
3758 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
3759 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
3760 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
3761 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
3762 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
3763 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
3764 
3765 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
3766 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
3767 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
3768 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
3769 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
3770 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
3771 
3772 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
3773 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
3774 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
3775 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
3776 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
3777 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
3778 
3779 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
3780 {
3781     FloatRelation compare = float16_compare(a, b, s);
3782     return compare == float_relation_greater;
3783 }
3784 
3785 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
3786 {
3787     FloatRelation compare = float32_compare(a, b, s);
3788     return compare == float_relation_greater;
3789 }
3790 
3791 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
3792 {
3793     FloatRelation compare = float64_compare(a, b, s);
3794     return compare == float_relation_greater;
3795 }
3796 
3797 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
3798 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
3799 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
3800 
3801 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
3802 {
3803     FloatRelation compare = float16_compare(a, b, s);
3804     return compare == float_relation_greater ||
3805            compare == float_relation_equal;
3806 }
3807 
3808 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
3809 {
3810     FloatRelation compare = float32_compare(a, b, s);
3811     return compare == float_relation_greater ||
3812            compare == float_relation_equal;
3813 }
3814 
3815 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
3816 {
3817     FloatRelation compare = float64_compare(a, b, s);
3818     return compare == float_relation_greater ||
3819            compare == float_relation_equal;
3820 }
3821 
3822 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
3823 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
3824 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
3825 
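/*
 * vmford reuses the compare generators by passing "!floatN_unordered_quiet"
 * as DO_OP; the macro pastes the argument list after it, so each element
 * evaluates !floatN_unordered_quiet(s2, s1, &env->fp_status), i.e. "ordered".
 */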
3826 GEN_VEXT_CMP_VV_ENV(vmford_vv_h, uint16_t, H2, !float16_unordered_quiet)
3827 GEN_VEXT_CMP_VV_ENV(vmford_vv_w, uint32_t, H4, !float32_unordered_quiet)
3828 GEN_VEXT_CMP_VV_ENV(vmford_vv_d, uint64_t, H8, !float64_unordered_quiet)
3829 GEN_VEXT_CMP_VF(vmford_vf_h, uint16_t, H2, !float16_unordered_quiet)
3830 GEN_VEXT_CMP_VF(vmford_vf_w, uint32_t, H4, !float32_unordered_quiet)
3831 GEN_VEXT_CMP_VF(vmford_vf_d, uint64_t, H8, !float64_unordered_quiet)
3832 
3833 /* Vector Floating-Point Classify Instruction */
3834 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3835 static void do_##NAME(void *vd, void *vs2, int i)      \
3836 {                                                      \
3837     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3838     *((TD *)vd + HD(i)) = OP(s2);                      \
3839 }
3840 
3841 #define GEN_VEXT_V(NAME, ESZ, DSZ)                     \
3842 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3843                   CPURISCVState *env, uint32_t desc)   \
3844 {                                                      \
3845     uint32_t vm = vext_vm(desc);                       \
3846     uint32_t vl = env->vl;                             \
3847     uint32_t i;                                        \
3848                                                        \
3849     for (i = 0; i < vl; i++) {                         \
3850         if (!vm && !vext_elem_mask(v0, i)) {           \
3851             continue;                                  \
3852         }                                              \
3853         do_##NAME(vd, vs2, i);                         \
3854     }                                                  \
3855 }
3856 
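/*
 * fclass_h/s/d return the standard RISC-V FCLASS bit mask:
 *   bit 0: -inf          bit 5: +subnormal
 *   bit 1: -normal       bit 6: +normal
 *   bit 2: -subnormal    bit 7: +inf
 *   bit 3: -0            bit 8: signaling NaN
 *   bit 4: +0            bit 9: quiet NaN
 */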
3857 target_ulong fclass_h(uint64_t frs1)
3858 {
3859     float16 f = frs1;
3860     bool sign = float16_is_neg(f);
3861 
3862     if (float16_is_infinity(f)) {
3863         return sign ? 1 << 0 : 1 << 7;
3864     } else if (float16_is_zero(f)) {
3865         return sign ? 1 << 3 : 1 << 4;
3866     } else if (float16_is_zero_or_denormal(f)) {
3867         return sign ? 1 << 2 : 1 << 5;
3868     } else if (float16_is_any_nan(f)) {
3869         float_status s = { }; /* for snan_bit_is_one */
3870         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
3871     } else {
3872         return sign ? 1 << 1 : 1 << 6;
3873     }
3874 }
3875 
3876 target_ulong fclass_s(uint64_t frs1)
3877 {
3878     float32 f = frs1;
3879     bool sign = float32_is_neg(f);
3880 
3881     if (float32_is_infinity(f)) {
3882         return sign ? 1 << 0 : 1 << 7;
3883     } else if (float32_is_zero(f)) {
3884         return sign ? 1 << 3 : 1 << 4;
3885     } else if (float32_is_zero_or_denormal(f)) {
3886         return sign ? 1 << 2 : 1 << 5;
3887     } else if (float32_is_any_nan(f)) {
3888         float_status s = { }; /* for snan_bit_is_one */
3889         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
3890     } else {
3891         return sign ? 1 << 1 : 1 << 6;
3892     }
3893 }
3894 
3895 target_ulong fclass_d(uint64_t frs1)
3896 {
3897     float64 f = frs1;
3898     bool sign = float64_is_neg(f);
3899 
3900     if (float64_is_infinity(f)) {
3901         return sign ? 1 << 0 : 1 << 7;
3902     } else if (float64_is_zero(f)) {
3903         return sign ? 1 << 3 : 1 << 4;
3904     } else if (float64_is_zero_or_denormal(f)) {
3905         return sign ? 1 << 2 : 1 << 5;
3906     } else if (float64_is_any_nan(f)) {
3907         float_status s = { }; /* for snan_bit_is_one */
3908         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
3909     } else {
3910         return sign ? 1 << 1 : 1 << 6;
3911     }
3912 }
3913 
3914 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
3915 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
3916 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
3917 GEN_VEXT_V(vfclass_v_h, 2, 2)
3918 GEN_VEXT_V(vfclass_v_w, 4, 4)
3919 GEN_VEXT_V(vfclass_v_d, 8, 8)
3920 
3921 /* Vector Floating-Point Merge Instruction */
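/*
 * vfmerge.vfm: elements whose execution is masked off are copied from vs2,
 * active elements take the scalar f[rs1].
 */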
3922 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
3923 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
3924                   CPURISCVState *env, uint32_t desc)          \
3925 {                                                             \
3926     uint32_t vm = vext_vm(desc);                              \
3927     uint32_t vl = env->vl;                                    \
3928     uint32_t i;                                               \
3929                                                               \
3930     for (i = 0; i < vl; i++) {                                \
3931         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
3932         *((ETYPE *)vd + H(i))                                 \
3933           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
3934     }                                                         \
3935 }
3936 
3937 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
3938 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
3939 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
3940 
3941 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
3942 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
3943 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
3944 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
3945 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
3946 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2, 2)
3947 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4, 4)
3948 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8, 8)
3949 
3950 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
3951 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
3952 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
3953 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
3954 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2, 2)
3955 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4, 4)
3956 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8, 8)
3957 
3958 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
3959 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
3960 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
3961 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
3962 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2, 2)
3963 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4, 4)
3964 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8, 8)
3965 
3966 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
3967 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
3968 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
3969 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
3970 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2, 2)
3971 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4, 4)
3972 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8, 8)
3973 
3974 /* Widening Floating-Point/Integer Type-Convert Instructions */
3975 /* (TD, T2, TX2) */
3976 #define WOP_UU_H uint32_t, uint16_t, uint16_t
3977 #define WOP_UU_W uint64_t, uint32_t, uint32_t
3978 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
3979 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
3980 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
3981 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 2, 4)
3982 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 4, 8)
3983 
3984 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
3985 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
3986 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
3987 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 2, 4)
3988 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 4, 8)
3989 
3990 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
3991 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
3992 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
3993 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 2, 4)
3994 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 4, 8)
3995 
3996 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
3997 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
3998 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
3999 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 2, 4)
4000 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 4, 8)
4001 
4002 /*
4003  * vfwcvt.f.f.v vd, vs2, vm #
4004  * Convert single-width float to double-width float.
4005  */
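/*
 * The 'true' argument below (and in vfncvtffv16 further down) selects the
 * IEEE half-precision encoding rather than the ARM alternative format.
 */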
4006 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4007 {
4008     return float16_to_float32(a, true, s);
4009 }
4010 
4011 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4012 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4013 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 2, 4)
4014 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 4, 8)
4015 
4016 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4017 /* (TD, T2, TX2) */
4018 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4019 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4020 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer */
4021 RVVCALL(OPFVV1, vfncvt_xu_f_v_h, NOP_UU_H, H2, H4, float32_to_uint16)
4022 RVVCALL(OPFVV1, vfncvt_xu_f_v_w, NOP_UU_W, H4, H8, float64_to_uint32)
4023 GEN_VEXT_V_ENV(vfncvt_xu_f_v_h, 2, 2)
4024 GEN_VEXT_V_ENV(vfncvt_xu_f_v_w, 4, 4)
4025 
4026 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4027 RVVCALL(OPFVV1, vfncvt_x_f_v_h, NOP_UU_H, H2, H4, float32_to_int16)
4028 RVVCALL(OPFVV1, vfncvt_x_f_v_w, NOP_UU_W, H4, H8, float64_to_int32)
4029 GEN_VEXT_V_ENV(vfncvt_x_f_v_h, 2, 2)
4030 GEN_VEXT_V_ENV(vfncvt_x_f_v_w, 4, 4)
4031 
4032 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4033 RVVCALL(OPFVV1, vfncvt_f_xu_v_h, NOP_UU_H, H2, H4, uint32_to_float16)
4034 RVVCALL(OPFVV1, vfncvt_f_xu_v_w, NOP_UU_W, H4, H8, uint64_to_float32)
4035 GEN_VEXT_V_ENV(vfncvt_f_xu_v_h, 2, 2)
4036 GEN_VEXT_V_ENV(vfncvt_f_xu_v_w, 4, 4)
4037 
4038 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4039 RVVCALL(OPFVV1, vfncvt_f_x_v_h, NOP_UU_H, H2, H4, int32_to_float16)
4040 RVVCALL(OPFVV1, vfncvt_f_x_v_w, NOP_UU_W, H4, H8, int64_to_float32)
4041 GEN_VEXT_V_ENV(vfncvt_f_x_v_h, 2, 2)
4042 GEN_VEXT_V_ENV(vfncvt_f_x_v_w, 4, 4)
4043 
4044 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4045 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4046 {
4047     return float32_to_float16(a, true, s);
4048 }
4049 
4050 RVVCALL(OPFVV1, vfncvt_f_f_v_h, NOP_UU_H, H2, H4, vfncvtffv16)
4051 RVVCALL(OPFVV1, vfncvt_f_f_v_w, NOP_UU_W, H4, H8, float64_to_float32)
4052 GEN_VEXT_V_ENV(vfncvt_f_f_v_h, 2, 2)
4053 GEN_VEXT_V_ENV(vfncvt_f_f_v_w, 4, 4)
4054 
4055 /*
4056  *** Vector Reduction Operations
4057  */
4058 /* Vector Single-Width Integer Reduction Instructions */
4059 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4060 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4061         void *vs2, CPURISCVState *env, uint32_t desc)     \
4062 {                                                         \
4063     uint32_t vm = vext_vm(desc);                          \
4064     uint32_t vl = env->vl;                                \
4065     uint32_t i;                                           \
4066     TD s1 =  *((TD *)vs1 + HD(0));                        \
4067                                                           \
4068     for (i = 0; i < vl; i++) {                            \
4069         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4070         if (!vm && !vext_elem_mask(v0, i)) {              \
4071             continue;                                     \
4072         }                                                 \
4073         s1 = OP(s1, (TD)s2);                              \
4074     }                                                     \
4075     *((TD *)vd + HD(0)) = s1;                             \
4076 }
4077 
4078 /* vd[0] = sum(vs1[0], vs2[*]) */
4079 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4080 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4081 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4082 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
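
/*
 * Worked example: with vl = 4, vs1[0] = 10 and vs2 = {1, 2, 3, 4} (all
 * elements active), the accumulator runs 10 -> 11 -> 13 -> 16 -> 20 and
 * vredsum.vs leaves vd[0] = 20.  Inactive elements are skipped and do not
 * contribute to s1.
 */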
4083 
4084 /* vd[0] = maxu(vs1[0], vs2[*]) */
4085 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4086 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4087 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4088 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4089 
4090 /* vd[0] = max(vs1[0], vs2[*]) */
4091 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4092 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4093 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4094 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4095 
4096 /* vd[0] = minu(vs1[0], vs2[*]) */
4097 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4098 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4099 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4100 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4101 
4102 /* vd[0] = min(vs1[0], vs2[*]) */
4103 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4104 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4105 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4106 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4107 
4108 /* vd[0] = and(vs1[0], vs2[*]) */
4109 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4110 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4111 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4112 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4113 
4114 /* vd[0] = or(vs1[0], vs2[*]) */
4115 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4116 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4117 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4118 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4119 
4120 /* vd[0] = xor(vs1[0], vs2[*]) */
4121 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4122 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4123 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4124 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4125 
4126 /* Vector Widening Integer Reduction Instructions */
4127 /* signed sum reduction into double-width accumulator */
4128 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4129 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4130 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4131 
4132 /* Unsigned sum reduction into double-width accumulator */
4133 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4134 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4135 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
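
/*
 * The widening forms reuse GEN_VEXT_RED with TD twice as wide as TS2; the
 * (TD)s2 cast in the macro performs the sign extension (int variants) or
 * zero extension (uint variants) of each source element before it is added
 * to the double-width accumulator.
 */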
4136 
4137 /* Vector Single-Width Floating-Point Reduction Instructions */
4138 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4139 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4140                   void *vs2, CPURISCVState *env,           \
4141                   uint32_t desc)                           \
4142 {                                                          \
4143     uint32_t vm = vext_vm(desc);                           \
4144     uint32_t vl = env->vl;                                 \
4145     uint32_t i;                                            \
4146     TD s1 =  *((TD *)vs1 + HD(0));                         \
4147                                                            \
4148     for (i = 0; i < vl; i++) {                             \
4149         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4150         if (!vm && !vext_elem_mask(v0, i)) {               \
4151             continue;                                      \
4152         }                                                  \
4153         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4154     }                                                      \
4155     *((TD *)vd + HD(0)) = s1;                              \
4156 }
4157 
4158 /* Unordered sum */
4159 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4160 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4161 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4162 
4163 /* Maximum value */
4164 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maxnum)
4165 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maxnum)
4166 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maxnum)
4167 
4168 /* Minimum value */
4169 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minnum)
4170 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minnum)
4171 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minnum)
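
/*
 * float16/32/64_maxnum and _minnum follow the IEEE 754-2008 maxNum/minNum
 * rules: when exactly one operand is a quiet NaN, the other (numeric)
 * operand is returned, so a stray NaN element does not necessarily poison
 * the whole reduction.
 */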
4172 
4173 /* Vector Widening Floating-Point Reduction Instructions */
4174 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4175 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4176                             void *vs2, CPURISCVState *env, uint32_t desc)
4177 {
4178     uint32_t vm = vext_vm(desc);
4179     uint32_t vl = env->vl;
4180     uint32_t i;
4181     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4182 
4183     for (i = 0; i < vl; i++) {
4184         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4185         if (!vm && !vext_elem_mask(v0, i)) {
4186             continue;
4187         }
4188         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4189                          &env->fp_status);
4190     }
4191     *((uint32_t *)vd + H4(0)) = s1;
4192 }
4193 
4194 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4195                             void *vs2, CPURISCVState *env, uint32_t desc)
4196 {
4197     uint32_t vm = vext_vm(desc);
4198     uint32_t vl = env->vl;
4199     uint32_t i;
4200     uint64_t s1 =  *((uint64_t *)vs1);
4201 
4202     for (i = 0; i < vl; i++) {
4203         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4204         if (!vm && !vext_elem_mask(v0, i)) {
4205             continue;
4206         }
4207         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4208                          &env->fp_status);
4209     }
4210     *((uint64_t *)vd) = s1;
4211 }
4212 
4213 /*
4214  *** Vector Mask Operations
4215  */
4216 /* Vector Mask-Register Logical Instructions */
4217 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4218 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4219                   void *vs2, CPURISCVState *env,          \
4220                   uint32_t desc)                          \
4221 {                                                         \
4222     uint32_t vlmax = env_archcpu(env)->cfg.vlen;          \
4223     uint32_t vl = env->vl;                                \
4224     uint32_t i;                                           \
4225     int a, b;                                             \
4226                                                           \
4227     for (i = 0; i < vl; i++) {                            \
4228         a = vext_elem_mask(vs1, i);                       \
4229         b = vext_elem_mask(vs2, i);                       \
4230         vext_set_elem_mask(vd, i, OP(b, a));              \
4231     }                                                     \
4232     for (; i < vlmax; i++) {                              \
4233         vext_set_elem_mask(vd, i, 0);                     \
4234     }                                                     \
4235 }
4236 
4237 #define DO_NAND(N, M)  (!((N) & (M)))
4238 #define DO_ANDNOT(N, M)  ((N) & !(M))
4239 #define DO_NOR(N, M)  (!((N) | (M)))
4240 #define DO_ORNOT(N, M)  ((N) | !(M))
4241 #define DO_XNOR(N, M)  (!((N) ^ (M)))
4242 
4243 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4244 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4245 GEN_VEXT_MASK_VV(vmandnot_mm, DO_ANDNOT)
4246 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4247 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4248 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4249 GEN_VEXT_MASK_VV(vmornot_mm, DO_ORNOT)
4250 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
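
/*
 * Note the OP(b, a) operand order in GEN_VEXT_MASK_VV above: b comes from
 * vs2 and a from vs1, so e.g. vmandnot.mm computes vs2[i] & !vs1[i] and
 * vmornot.mm computes vs2[i] | !vs1[i], matching the "vs2 op ~vs1"
 * definition of the and-not/or-not mask instructions.
 */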
4251 
4252 /* Vector mask population count vmpopc */
4253 target_ulong HELPER(vmpopc_m)(void *v0, void *vs2, CPURISCVState *env,
4254                               uint32_t desc)
4255 {
4256     target_ulong cnt = 0;
4257     uint32_t vm = vext_vm(desc);
4258     uint32_t vl = env->vl;
4259     int i;
4260 
4261     for (i = 0; i < vl; i++) {
4262         if (vm || vext_elem_mask(v0, i)) {
4263             if (vext_elem_mask(vs2, i)) {
4264                 cnt++;
4265             }
4266         }
4267     }
4268     return cnt;
4269 }
4270 
4271 /* vmfirst find-first-set mask bit */
4272 target_ulong HELPER(vmfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4273                                uint32_t desc)
4274 {
4275     uint32_t vm = vext_vm(desc);
4276     uint32_t vl = env->vl;
4277     int i;
4278 
4279     for (i = 0; i < vl; i++) {
4280         if (vm || vext_elem_mask(v0, i)) {
4281             if (vext_elem_mask(vs2, i)) {
4282                 return i;
4283             }
4284         }
4285     }
4286     return -1LL;
4287 }
4288 
4289 enum set_mask_type {
4290     ONLY_FIRST = 1,
4291     INCLUDE_FIRST,
4292     BEFORE_FIRST,
4293 };
4294 
4295 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4296                    uint32_t desc, enum set_mask_type type)
4297 {
4298     uint32_t vlmax = env_archcpu(env)->cfg.vlen;
4299     uint32_t vm = vext_vm(desc);
4300     uint32_t vl = env->vl;
4301     int i;
4302     bool first_mask_bit = false;
4303 
4304     for (i = 0; i < vl; i++) {
4305         if (!vm && !vext_elem_mask(v0, i)) {
4306             continue;
4307         }
4308         /* write a zero to all following active elements */
4309         if (first_mask_bit) {
4310             vext_set_elem_mask(vd, i, 0);
4311             continue;
4312         }
4313         if (vext_elem_mask(vs2, i)) {
4314             first_mask_bit = true;
4315             if (type == BEFORE_FIRST) {
4316                 vext_set_elem_mask(vd, i, 0);
4317             } else {
4318                 vext_set_elem_mask(vd, i, 1);
4319             }
4320         } else {
4321             if (type == ONLY_FIRST) {
4322                 vext_set_elem_mask(vd, i, 0);
4323             } else {
4324                 vext_set_elem_mask(vd, i, 1);
4325             }
4326         }
4327     }
4328     for (; i < vlmax; i++) {
4329         vext_set_elem_mask(vd, i, 0);
4330     }
4331 }
4332 
4333 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4334                      uint32_t desc)
4335 {
4336     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4337 }
4338 
4339 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4340                      uint32_t desc)
4341 {
4342     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4343 }
4344 
4345 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4346                      uint32_t desc)
4347 {
4348     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4349 }
4350 
4351 /* Vector Iota Instruction */
4352 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4353 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4354                   uint32_t desc)                                          \
4355 {                                                                         \
4356     uint32_t vm = vext_vm(desc);                                          \
4357     uint32_t vl = env->vl;                                                \
4358     uint32_t sum = 0;                                                     \
4359     int i;                                                                \
4360                                                                           \
4361     for (i = 0; i < vl; i++) {                                            \
4362         if (!vm && !vext_elem_mask(v0, i)) {                              \
4363             continue;                                                     \
4364         }                                                                 \
4365         *((ETYPE *)vd + H(i)) = sum;                                      \
4366         if (vext_elem_mask(vs2, i)) {                                     \
4367             sum++;                                                        \
4368         }                                                                 \
4369     }                                                                     \
4370 }
4371 
4372 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4373 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4374 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4375 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
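
/*
 * Worked example (all elements active): for a source mask
 *   vs2 = 1 0 1 1 0
 * viota.m writes the running count of set bits seen so far, i.e.
 *   vd  = 0 1 1 2 3
 */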
4376 
4377 /* Vector Element Index Instruction */
4378 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4379 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4380 {                                                                         \
4381     uint32_t vm = vext_vm(desc);                                          \
4382     uint32_t vl = env->vl;                                                \
4383     int i;                                                                \
4384                                                                           \
4385     for (i = 0; i < vl; i++) {                                            \
4386         if (!vm && !vext_elem_mask(v0, i)) {                              \
4387             continue;                                                     \
4388         }                                                                 \
4389         *((ETYPE *)vd + H(i)) = i;                                        \
4390     }                                                                     \
4391 }
4392 
4393 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4394 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4395 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4396 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4397 
4398 /*
4399  *** Vector Permutation Instructions
4400  */
4401 
4402 /* Vector Slide Instructions */
4403 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4404 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4405                   CPURISCVState *env, uint32_t desc)                      \
4406 {                                                                         \
4407     uint32_t vm = vext_vm(desc);                                          \
4408     uint32_t vl = env->vl;                                                \
4409     target_ulong offset = s1, i;                                          \
4410                                                                           \
4411     for (i = offset; i < vl; i++) {                                       \
4412         if (!vm && !vext_elem_mask(v0, i)) {                              \
4413             continue;                                                     \
4414         }                                                                 \
4415         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4416     }                                                                     \
4417 }
4418 
4419 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4420 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4421 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4422 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4423 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
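
/*
 * Because the loop in GEN_VEXT_VSLIDEUP_VX starts at i = offset, the
 * destination elements 0 .. offset-1 are left untouched, as vslideup
 * requires.  E.g. with vl = 8 and rs1 = 3, active elements get
 * vd[3..7] = vs2[0..4] while vd[0..2] keep their previous values.
 */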
4424 
4425 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4426 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4427                   CPURISCVState *env, uint32_t desc)                      \
4428 {                                                                         \
4429     uint32_t vlmax = env_archcpu(env)->cfg.vlen;                          \
4430     uint32_t vm = vext_vm(desc);                                          \
4431     uint32_t vl = env->vl;                                                \
4432     target_ulong offset = s1, i;                                          \
4433                                                                           \
4434     for (i = 0; i < vl; ++i) {                                            \
4435         target_ulong j = i + offset;                                      \
4436         if (!vm && !vext_elem_mask(v0, i)) {                              \
4437             continue;                                                     \
4438         }                                                                 \
4439         *((ETYPE *)vd + H(i)) = j >= vlmax ? 0 : *((ETYPE *)vs2 + H(j));  \
4440     }                                                                     \
4441 }
4442 
4443 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4444 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4445 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4446 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4447 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
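
/*
 * For vslidedown, source indices that run past vlmax read as zero.
 * E.g. with vlmax = vl = 8 and rs1 = 3, active elements get
 * vd[0..4] = vs2[3..7] and vd[5..7] = 0.
 */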
4448 
4449 #define GEN_VEXT_VSLIDE1UP_VX(NAME, ETYPE, H)                             \
4450 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4451                   CPURISCVState *env, uint32_t desc)                      \
4452 {                                                                         \
4453     uint32_t vm = vext_vm(desc);                                          \
4454     uint32_t vl = env->vl;                                                \
4455     uint32_t i;                                                           \
4456                                                                           \
4457     for (i = 0; i < vl; i++) {                                            \
4458         if (!vm && !vext_elem_mask(v0, i)) {                              \
4459             continue;                                                     \
4460         }                                                                 \
4461         if (i == 0) {                                                     \
4462             *((ETYPE *)vd + H(i)) = s1;                                   \
4463         } else {                                                          \
4464             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));           \
4465         }                                                                 \
4466     }                                                                     \
4467 }
4468 
4469 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4470 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, uint8_t,  H1)
4471 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, uint16_t, H2)
4472 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, uint32_t, H4)
4473 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, uint64_t, H8)
4474 
4475 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, ETYPE, H)                           \
4476 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4477                   CPURISCVState *env, uint32_t desc)                      \
4478 {                                                                         \
4479     uint32_t vm = vext_vm(desc);                                          \
4480     uint32_t vl = env->vl;                                                \
4481     uint32_t i;                                                           \
4482                                                                           \
4483     for (i = 0; i < vl; i++) {                                            \
4484         if (!vm && !vext_elem_mask(v0, i)) {                              \
4485             continue;                                                     \
4486         }                                                                 \
4487         if (i == vl - 1) {                                                \
4488             *((ETYPE *)vd + H(i)) = s1;                                   \
4489         } else {                                                          \
4490             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));           \
4491         }                                                                 \
4492     }                                                                     \
4493 }
4494 
4495 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4496 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, uint8_t,  H1)
4497 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, uint16_t, H2)
4498 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, uint32_t, H4)
4499 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, uint64_t, H8)
4500 
4501 /* Vector Register Gather Instruction */
4502 #define GEN_VEXT_VRGATHER_VV(NAME, ETYPE, H)                              \
4503 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4504                   CPURISCVState *env, uint32_t desc)                      \
4505 {                                                                         \
4506     uint32_t vlmax = env_archcpu(env)->cfg.vlen;                          \
4507     uint32_t vm = vext_vm(desc);                                          \
4508     uint32_t vl = env->vl;                                                \
4509     uint64_t index;                                                       \
4510     uint32_t i;                                                           \
4511                                                                           \
4512     for (i = 0; i < vl; i++) {                                            \
4513         if (!vm && !vext_elem_mask(v0, i)) {                              \
4514             continue;                                                     \
4515         }                                                                 \
4516         index = *((ETYPE *)vs1 + H(i));                                   \
4517         if (index >= vlmax) {                                             \
4518             *((ETYPE *)vd + H(i)) = 0;                                    \
4519         } else {                                                          \
4520             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
4521         }                                                                 \
4522     }                                                                     \
4523 }
4524 
4525 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
4526 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  H1)
4527 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, H2)
4528 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, H4)
4529 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, H8)
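
/*
 * Worked example (all elements active, vlmax = 4):
 *   vs1 = {3, 0, 9, 1}  ->  vd = {vs2[3], vs2[0], 0, vs2[1]}
 * Index 9 is out of range (>= vlmax), so that destination element is
 * written with zero.
 */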
4530 
4531 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
4532 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4533                   CPURISCVState *env, uint32_t desc)                      \
4534 {                                                                         \
4535     uint32_t vlmax = env_archcpu(env)->cfg.vlen;                          \
4536     uint32_t vm = vext_vm(desc);                                          \
4537     uint32_t vl = env->vl;                                                \
4538     uint64_t index = s1;                                                  \
4539     uint32_t i;                                                           \
4540                                                                           \
4541     for (i = 0; i < vl; i++) {                                            \
4542         if (!vm && !vext_elem_mask(v0, i)) {                              \
4543             continue;                                                     \
4544         }                                                                 \
4545         if (index >= vlmax) {                                             \
4546             *((ETYPE *)vd + H(i)) = 0;                                    \
4547         } else {                                                          \
4548             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
4549         }                                                                 \
4550     }                                                                     \
4551 }
4552 
4553 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
4554 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
4555 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
4556 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
4557 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
4558 
4559 /* Vector Compress Instruction */
4560 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
4561 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4562                   CPURISCVState *env, uint32_t desc)                      \
4563 {                                                                         \
4564     uint32_t vl = env->vl;                                                \
4565     uint32_t num = 0, i;                                                  \
4566                                                                           \
4567     for (i = 0; i < vl; i++) {                                            \
4568         if (!vext_elem_mask(vs1, i)) {                                    \
4569             continue;                                                     \
4570         }                                                                 \
4571         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
4572         num++;                                                            \
4573     }                                                                     \
4574 }
4575 
4576 /* Compress into vd the elements of vs2 whose vs1 mask bit is set */
4577 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
4578 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
4579 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
4580 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
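
/*
 * Worked example: with vl = 5 and a selector mask vs1 = 0 1 1 0 1, the
 * helper packs vd[0] = vs2[1], vd[1] = vs2[2], vd[2] = vs2[4]; destination
 * elements from index 3 (num) upwards are not written by this helper.
 */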
4581