xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 2e1cacfb)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg-gvec-desc.h"
30 #include "internals.h"
31 #include "vector_internals.h"
32 #include <math.h>
33 
34 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
35                             target_ulong s2)
36 {
37     int vlmax, vl;
38     RISCVCPU *cpu = env_archcpu(env);
39     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
40     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
41     uint16_t sew = 8 << vsew;
42     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
43     int xlen = riscv_cpu_xlen(env);
44     bool vill = (s2 >> (xlen - 1)) & 0x1;
45     target_ulong reserved = s2 &
46                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
47                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
48     uint16_t vlen = cpu->cfg.vlenb << 3;
49     int8_t lmul;
50 
51     if (vlmul & 4) {
52         /*
53          * Fractional LMUL, check:
54          *
55          * VLEN * LMUL >= SEW
56          * VLEN >> (8 - lmul) >= sew
57          * (vlenb << 3) >> (8 - lmul) >= sew
58          */
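        /*
         * Illustrative worked example (not in the original source): with
         * VLEN = 128 and vlmul = 6 (LMUL = 1/4), the check evaluates
         * 128 >> (8 - 6) = 32, so SEW <= 32 is legal while SEW = 64
         * makes the vtype setting illegal and sets vill.
         */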
59         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
60             vill = true;
61         }
62     }
63 
64     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65         /* Only set the vill bit (and clear the rest of the vector state). */
66         env->vill = 1;
67         env->vtype = 0;
68         env->vl = 0;
69         env->vstart = 0;
70         return 0;
71     }
72 
73     /* lmul encoded as in DisasContext::lmul */
74     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
76     if (s1 <= vlmax) {
77         vl = s1;
78     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
79         vl = (s1 + 1) >> 1;
80     } else {
81         vl = vlmax;
82     }
83     env->vl = vl;
84     env->vtype = s2;
85     env->vstart = 0;
86     env->vill = 0;
87     return vl;
88 }
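/*
 * Illustrative worked example (not in the original source): with VLEN = 128
 * (vlenb = 16), SEW = 32 and LMUL = 2, VLMAX = LMUL * VLEN / SEW = 8.  A
 * requested AVL of s1 = 10 then yields vl = 8, or vl = (10 + 1) >> 1 = 5 when
 * the rvv_vl_half_avl configuration is enabled, since vlmax < 10 < 2 * vlmax.
 */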
89 
90 /*
91  * Get the maximum number of elements that can be operated on.
92  *
93  * log2_esz: log2 of element size in bytes.
94  */
95 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
96 {
97     /*
98      * As simd_desc supports at most 2048 bytes, the maximum VLEN is 1024 bits,
99      * so the vector length in bytes (vlenb) is encoded as maxsz.
100      */
101     uint32_t vlenb = simd_maxsz(desc);
102 
103     /* Return VLMAX */
104     int scale = vext_lmul(desc) - log2_esz;
105     return scale < 0 ? vlenb >> -scale : vlenb << scale;
106 }
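/*
 * Illustrative worked example (not in the original source): with vlenb = 16
 * (VLEN = 128), LMUL = 1/2 (vext_lmul(desc) = -1) and 32-bit elements
 * (log2_esz = 2), scale = -1 - 2 = -3 and VLMAX = 16 >> 3 = 2, matching
 * LMUL * VLEN / SEW = 0.5 * 128 / 32 = 2.
 */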
107 
108 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
109 {
110     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
111 }
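/*
 * Editorial note (hedged): cur_pmmask/cur_pmbase appear to implement the
 * RISC-V pointer-masking address transformation, i.e. selected high address
 * bits are masked off and replaced with a configured base before the memory
 * access is issued.
 */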
112 
113 /*
114  * This function checks watchpoints before the real load operation.
115  *
116  * In system mode, the TLB API probe_access is enough for the watchpoint check.
117  * In user mode, there is no watchpoint support for now.
118  *
119  * It will trigger an exception if there is no mapping in the TLB
120  * and the page table walk can't fill the TLB entry. Then the guest
121  * software can return here after processing the exception, or never return.
122  */
123 static void probe_pages(CPURISCVState *env, target_ulong addr,
124                         target_ulong len, uintptr_t ra,
125                         MMUAccessType access_type)
126 {
127     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
128     target_ulong curlen = MIN(pagelen, len);
129     int mmu_index = riscv_env_mmu_index(env, false);
130 
131     probe_access(env, adjust_addr(env, addr), curlen, access_type,
132                  mmu_index, ra);
133     if (len > curlen) {
134         addr += curlen;
135         curlen = len - curlen;
136         probe_access(env, adjust_addr(env, addr), curlen, access_type,
137                      mmu_index, ra);
138     }
139 }
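/*
 * Illustrative worked example (not in the original source): with 4 KiB pages,
 * addr = 0x1ff8 and len = 16, pagelen = 8, so the first probe covers bytes
 * 0x1ff8..0x1fff and the second probe covers the remaining 8 bytes starting
 * at 0x2000 on the following page.
 */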
140 
141 static inline void vext_set_elem_mask(void *v0, int index,
142                                       uint8_t value)
143 {
144     int idx = index / 64;
145     int pos = index % 64;
146     uint64_t old = ((uint64_t *)v0)[idx];
147     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
148 }
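/*
 * Editorial note: the mask register is viewed as an array of uint64_t, one
 * bit per element, so this writes bit (index % 64) of word (index / 64)
 * without disturbing the neighbouring bits.
 */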
149 
150 /* elements operations for load and store */
151 typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
152                                uint32_t idx, void *vd, uintptr_t retaddr);
153 
154 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
155 static void NAME(CPURISCVState *env, abi_ptr addr,         \
156                  uint32_t idx, void *vd, uintptr_t retaddr)\
157 {                                                          \
158     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
159     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
160 }
161 
162 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
163 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
164 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
165 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
166 
167 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
168 static void NAME(CPURISCVState *env, abi_ptr addr,         \
169                  uint32_t idx, void *vd, uintptr_t retaddr)\
170 {                                                          \
171     ETYPE data = *((ETYPE *)vd + H(idx));                  \
172     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
173 }
174 
175 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
176 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
177 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
178 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
179 
180 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
181                                    uint32_t desc, uint32_t nf,
182                                    uint32_t esz, uint32_t max_elems)
183 {
184     uint32_t vta = vext_vta(desc);
185     int k;
186 
187     if (vta == 0) {
188         return;
189     }
190 
191     for (k = 0; k < nf; ++k) {
192         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
193                           (k * max_elems + max_elems) * esz);
194     }
195 }
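/*
 * Illustrative worked example (not in the original source): for a
 * tail-agnostic op with nf = 2, max_elems = 8, vl = 5 and esz = 4, the tail
 * bytes [20, 32) of the first field and [52, 64) of the second field of vd
 * are filled with 1s.
 */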
196 
197 /*
198  * stride: access vector elements from strided memory
199  */
200 static void
201 vext_ldst_stride(void *vd, void *v0, target_ulong base,
202                  target_ulong stride, CPURISCVState *env,
203                  uint32_t desc, uint32_t vm,
204                  vext_ldst_elem_fn *ldst_elem,
205                  uint32_t log2_esz, uintptr_t ra)
206 {
207     uint32_t i, k;
208     uint32_t nf = vext_nf(desc);
209     uint32_t max_elems = vext_max_elems(desc, log2_esz);
210     uint32_t esz = 1 << log2_esz;
211     uint32_t vma = vext_vma(desc);
212 
213     VSTART_CHECK_EARLY_EXIT(env);
214 
215     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
216         k = 0;
217         while (k < nf) {
218             if (!vm && !vext_elem_mask(v0, i)) {
219                 /* set masked-off elements to 1s */
220                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
221                                   (i + k * max_elems + 1) * esz);
222                 k++;
223                 continue;
224             }
225             target_ulong addr = base + stride * i + (k << log2_esz);
226             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
227             k++;
228         }
229     }
230     env->vstart = 0;
231 
232     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
233 }
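/*
 * Illustrative worked example (not in the original source): for a strided
 * segment access with base = 0x1000, stride = 0x20, 32-bit elements
 * (log2_esz = 2) and nf = 2, element i = 3 of field k = 1 is accessed at
 * 0x1000 + 0x20 * 3 + (1 << 2) = 0x1064.
 */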
234 
235 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
236 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
237                   target_ulong stride, CPURISCVState *env,              \
238                   uint32_t desc)                                        \
239 {                                                                       \
240     uint32_t vm = vext_vm(desc);                                        \
241     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
242                      ctzl(sizeof(ETYPE)), GETPC());                     \
243 }
244 
245 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
246 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
247 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
248 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
249 
250 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
251 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
252                   target_ulong stride, CPURISCVState *env,              \
253                   uint32_t desc)                                        \
254 {                                                                       \
255     uint32_t vm = vext_vm(desc);                                        \
256     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
257                      ctzl(sizeof(ETYPE)), GETPC());                     \
258 }
259 
260 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
261 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
262 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
263 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
264 
265 /*
266  * unit-stride: access elements stored contiguously in memory
267  */
268 
269 /* unmasked unit-stride load and store operation */
270 static void
271 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
272              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
273              uintptr_t ra)
274 {
275     uint32_t i, k;
276     uint32_t nf = vext_nf(desc);
277     uint32_t max_elems = vext_max_elems(desc, log2_esz);
278     uint32_t esz = 1 << log2_esz;
279 
280     VSTART_CHECK_EARLY_EXIT(env);
281 
282     /* load/store elements from/to guest memory */
283     for (i = env->vstart; i < evl; env->vstart = ++i) {
284         k = 0;
285         while (k < nf) {
286             target_ulong addr = base + ((i * nf + k) << log2_esz);
287             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
288             k++;
289         }
290     }
291     env->vstart = 0;
292 
293     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
294 }
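/*
 * Illustrative worked example (not in the original source): for a unit-stride
 * segment access with nf = 3 and 16-bit elements (log2_esz = 1), element
 * i = 4 of field k = 1 lives at byte offset ((4 * 3 + 1) << 1) = 26 from
 * base, i.e. the fields of one segment are interleaved in memory.
 */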
295 
296 /*
297  * A masked unit-stride load or store operation is handled as a special case
298  * of the strided form, with stride = NF * sizeof(ETYPE).
299  */
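/*
 * For instance (illustrative only, not in the original source), with NF = 2
 * and 32-bit elements the equivalent stride is 2 * sizeof(int32_t) = 8 bytes,
 * which is what vext_nf(desc) << ctzl(sizeof(ETYPE)) computes in the macros
 * below.
 */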
300 
301 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
302 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
303                          CPURISCVState *env, uint32_t desc)             \
304 {                                                                       \
305     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
306     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
307                      ctzl(sizeof(ETYPE)), GETPC());                     \
308 }                                                                       \
309                                                                         \
310 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
311                   CPURISCVState *env, uint32_t desc)                    \
312 {                                                                       \
313     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
314                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
315 }
316 
317 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
318 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
319 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
320 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
321 
322 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
323 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
324                          CPURISCVState *env, uint32_t desc)              \
325 {                                                                        \
326     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
327     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
328                      ctzl(sizeof(ETYPE)), GETPC());                      \
329 }                                                                        \
330                                                                          \
331 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
332                   CPURISCVState *env, uint32_t desc)                     \
333 {                                                                        \
334     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
335                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
336 }
337 
338 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
339 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
340 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
341 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
342 
343 /*
344  * unit-stride mask load and store, EEW = 1
345  */
346 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
347                     CPURISCVState *env, uint32_t desc)
348 {
349     /* evl = ceil(vl/8) */
350     uint8_t evl = (env->vl + 7) >> 3;
351     vext_ldst_us(vd, base, env, desc, lde_b,
352                  0, evl, GETPC());
353 }
354 
355 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
356                     CPURISCVState *env, uint32_t desc)
357 {
358     /* evl = ceil(vl/8) */
359     uint8_t evl = (env->vl + 7) >> 3;
360     vext_ldst_us(vd, base, env, desc, ste_b,
361                  0, evl, GETPC());
362 }
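/*
 * Illustrative worked example (not in the original source): for vl = 17 the
 * mask occupies ceil(17 / 8) = 3 bytes, and indeed evl = (17 + 7) >> 3 = 3,
 * so exactly three bytes are transferred regardless of SEW/LMUL.
 */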
363 
364 /*
365  * index: access vector elements from indexed memory
366  */
367 typedef target_ulong vext_get_index_addr(target_ulong base,
368         uint32_t idx, void *vs2);
369 
370 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
371 static target_ulong NAME(target_ulong base,            \
372                          uint32_t idx, void *vs2)      \
373 {                                                      \
374     return (base + *((ETYPE *)vs2 + H(idx)));          \
375 }
376 
377 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
378 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
379 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
380 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
381 
382 static inline void
383 vext_ldst_index(void *vd, void *v0, target_ulong base,
384                 void *vs2, CPURISCVState *env, uint32_t desc,
385                 vext_get_index_addr get_index_addr,
386                 vext_ldst_elem_fn *ldst_elem,
387                 uint32_t log2_esz, uintptr_t ra)
388 {
389     uint32_t i, k;
390     uint32_t nf = vext_nf(desc);
391     uint32_t vm = vext_vm(desc);
392     uint32_t max_elems = vext_max_elems(desc, log2_esz);
393     uint32_t esz = 1 << log2_esz;
394     uint32_t vma = vext_vma(desc);
395 
396     VSTART_CHECK_EARLY_EXIT(env);
397 
398     /* load/store elements from/to guest memory */
399     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
400         k = 0;
401         while (k < nf) {
402             if (!vm && !vext_elem_mask(v0, i)) {
403                 /* set masked-off elements to 1s */
404                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
405                                   (i + k * max_elems + 1) * esz);
406                 k++;
407                 continue;
408             }
409             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
410             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
411             k++;
412         }
413     }
414     env->vstart = 0;
415 
416     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
417 }
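/*
 * Illustrative worked example (not in the original source): for an indexed
 * access with base = 0x1000, a 16-bit index vector whose element i = 2 holds
 * 0x30, 32-bit data elements (log2_esz = 2) and field k = 1, the effective
 * address is 0x1000 + 0x30 + (1 << 2) = 0x1034.
 */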
418 
419 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
420 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
421                   void *vs2, CPURISCVState *env, uint32_t desc)            \
422 {                                                                          \
423     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
424                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
425 }
426 
427 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
428 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
429 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
430 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
431 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
432 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
433 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
434 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
435 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
436 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
437 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
438 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
439 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
440 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
441 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
442 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
443 
444 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
445 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
446                   void *vs2, CPURISCVState *env, uint32_t desc)  \
447 {                                                                \
448     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
449                     STORE_FN, ctzl(sizeof(ETYPE)),               \
450                     GETPC());                                    \
451 }
452 
453 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
454 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
455 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
456 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
457 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
458 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
459 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
460 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
461 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
462 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
463 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
464 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
465 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
466 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
467 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
468 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
469 
470 /*
471  * unit-stride fault-only-first load instructions
472  */
473 static inline void
474 vext_ldff(void *vd, void *v0, target_ulong base,
475           CPURISCVState *env, uint32_t desc,
476           vext_ldst_elem_fn *ldst_elem,
477           uint32_t log2_esz, uintptr_t ra)
478 {
479     uint32_t i, k, vl = 0;
480     uint32_t nf = vext_nf(desc);
481     uint32_t vm = vext_vm(desc);
482     uint32_t max_elems = vext_max_elems(desc, log2_esz);
483     uint32_t esz = 1 << log2_esz;
484     uint32_t vma = vext_vma(desc);
485     target_ulong addr, offset, remain;
486     int mmu_index = riscv_env_mmu_index(env, false);
487 
488     VSTART_CHECK_EARLY_EXIT(env);
489 
490     /* probe every access */
491     for (i = env->vstart; i < env->vl; i++) {
492         if (!vm && !vext_elem_mask(v0, i)) {
493             continue;
494         }
495         addr = adjust_addr(env, base + i * (nf << log2_esz));
496         if (i == 0) {
497             /* Allow fault on first element. */
498             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
499         } else {
500             remain = nf << log2_esz;
501             while (remain > 0) {
502                 void *host;
503                 int flags;
504 
505                 offset = -(addr | TARGET_PAGE_MASK);
506 
507                 /* Probe nonfault on subsequent elements. */
508                 flags = probe_access_flags(env, addr, offset, MMU_DATA_LOAD,
509                                            mmu_index, true, &host, 0);
510 
511                 /*
512                  * Stop if invalid (unmapped) or mmio (transaction may fail).
513                  * Do not stop if watchpoint, as the spec says that
514                  * first-fault should continue to access the same
515                  * elements regardless of any watchpoint.
516                  */
517                 if (flags & ~TLB_WATCHPOINT) {
518                     vl = i;
519                     goto ProbeSuccess;
520                 }
521                 if (remain <= offset) {
522                     break;
523                 }
524                 remain -= offset;
525                 addr = adjust_addr(env, addr + offset);
526             }
527         }
528     }
529 ProbeSuccess:
530     /* load bytes from guest memory */
531     if (vl != 0) {
532         env->vl = vl;
533     }
534     for (i = env->vstart; i < env->vl; i++) {
535         k = 0;
536         while (k < nf) {
537             if (!vm && !vext_elem_mask(v0, i)) {
538                 /* set masked-off elements to 1s */
539                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
540                                   (i + k * max_elems + 1) * esz);
541                 k++;
542                 continue;
543             }
544             addr = base + ((i * nf + k) << log2_esz);
545             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
546             k++;
547         }
548     }
549     env->vstart = 0;
550 
551     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
552 }
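/*
 * Editorial summary (hedged, not in the original source): only element 0 is
 * allowed to fault.  Later elements are probed non-faulting first; if, say,
 * element 5 would hit an unmapped page, vl is truncated to 5 and elements
 * 0..4 are still loaded, which is the fault-only-first behaviour required by
 * the spec.
 */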
553 
554 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
555 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
556                   CPURISCVState *env, uint32_t desc)      \
557 {                                                         \
558     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
559               ctzl(sizeof(ETYPE)), GETPC());              \
560 }
561 
562 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
563 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
564 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
565 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
566 
567 #define DO_SWAP(N, M) (M)
568 #define DO_AND(N, M)  (N & M)
569 #define DO_XOR(N, M)  (N ^ M)
570 #define DO_OR(N, M)   (N | M)
571 #define DO_ADD(N, M)  (N + M)
572 
573 /* Signed min/max */
574 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
575 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
576 
577 /*
578  * load and store whole register instructions
579  */
580 static void
581 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
582                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
583 {
584     uint32_t i, k, off, pos;
585     uint32_t nf = vext_nf(desc);
586     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
587     uint32_t max_elems = vlenb >> log2_esz;
588 
589     if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
590         env->vstart = 0;
591         return;
592     }
593 
594     k = env->vstart / max_elems;
595     off = env->vstart % max_elems;
596 
597     if (off) {
598         /* load/store the rest of the elements of the segment pointed to by vstart */
599         for (pos = off; pos < max_elems; pos++, env->vstart++) {
600             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
601             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
602                       ra);
603         }
604         k++;
605     }
606 
607     /* load/store elements for rest of segments */
608     for (; k < nf; k++) {
609         for (i = 0; i < max_elems; i++, env->vstart++) {
610             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
611             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
612         }
613     }
614 
615     env->vstart = 0;
616 }
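/*
 * Illustrative worked example (not in the original source): for vl2re32 with
 * vlenb = 16, max_elems = 16 >> 2 = 4 elements per register and nf = 2, so
 * 8 elements are transferred in total.  Resuming with vstart = 6 gives
 * k = 1, off = 2, i.e. the last two elements of the second register.
 */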
617 
618 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
619 void HELPER(NAME)(void *vd, target_ulong base,       \
620                   CPURISCVState *env, uint32_t desc) \
621 {                                                    \
622     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
623                     ctzl(sizeof(ETYPE)), GETPC());   \
624 }
625 
626 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
627 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
628 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
629 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
630 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
631 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
632 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
633 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
634 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
635 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
636 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
637 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
638 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
639 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
640 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
641 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
642 
643 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
644 void HELPER(NAME)(void *vd, target_ulong base,       \
645                   CPURISCVState *env, uint32_t desc) \
646 {                                                    \
647     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
648                     ctzl(sizeof(ETYPE)), GETPC());   \
649 }
650 
651 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
652 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
653 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
654 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
655 
656 /*
657  * Vector Integer Arithmetic Instructions
658  */
659 
660 /* (TD, T1, T2, TX1, TX2) */
661 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
662 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
663 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
664 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
665 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
666 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
667 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
668 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
669 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
670 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
671 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
672 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
673 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
674 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
675 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
676 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
677 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
678 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
679 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
680 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
681 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
682 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
683 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
684 
685 #define DO_SUB(N, M) (N - M)
686 #define DO_RSUB(N, M) (M - N)
687 
688 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
689 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
690 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
691 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
692 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
693 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
694 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
695 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
696 
697 GEN_VEXT_VV(vadd_vv_b, 1)
698 GEN_VEXT_VV(vadd_vv_h, 2)
699 GEN_VEXT_VV(vadd_vv_w, 4)
700 GEN_VEXT_VV(vadd_vv_d, 8)
701 GEN_VEXT_VV(vsub_vv_b, 1)
702 GEN_VEXT_VV(vsub_vv_h, 2)
703 GEN_VEXT_VV(vsub_vv_w, 4)
704 GEN_VEXT_VV(vsub_vv_d, 8)
705 
706 
707 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
708 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
709 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
710 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
711 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
712 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
713 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
714 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
715 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
716 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
717 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
718 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
719 
720 GEN_VEXT_VX(vadd_vx_b, 1)
721 GEN_VEXT_VX(vadd_vx_h, 2)
722 GEN_VEXT_VX(vadd_vx_w, 4)
723 GEN_VEXT_VX(vadd_vx_d, 8)
724 GEN_VEXT_VX(vsub_vx_b, 1)
725 GEN_VEXT_VX(vsub_vx_h, 2)
726 GEN_VEXT_VX(vsub_vx_w, 4)
727 GEN_VEXT_VX(vsub_vx_d, 8)
728 GEN_VEXT_VX(vrsub_vx_b, 1)
729 GEN_VEXT_VX(vrsub_vx_h, 2)
730 GEN_VEXT_VX(vrsub_vx_w, 4)
731 GEN_VEXT_VX(vrsub_vx_d, 8)
732 
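/*
 * Editorial note (hedged): the vec_rsubs* helpers below appear to be the
 * out-of-line gvec implementations of reverse-subtract with a scalar,
 * computing d[i] = b - a[i] across the whole operand size.
 */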
733 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
734 {
735     intptr_t oprsz = simd_oprsz(desc);
736     intptr_t i;
737 
738     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
739         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
740     }
741 }
742 
743 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
744 {
745     intptr_t oprsz = simd_oprsz(desc);
746     intptr_t i;
747 
748     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
749         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
750     }
751 }
752 
753 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
754 {
755     intptr_t oprsz = simd_oprsz(desc);
756     intptr_t i;
757 
758     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
759         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
760     }
761 }
762 
763 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
764 {
765     intptr_t oprsz = simd_oprsz(desc);
766     intptr_t i;
767 
768     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
769         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
770     }
771 }
772 
773 /* Vector Widening Integer Add/Subtract */
774 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
775 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
776 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
777 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
778 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
779 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
780 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
781 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
782 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
783 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
784 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
785 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
786 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
787 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
788 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
789 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
790 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
791 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
792 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
793 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
794 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
795 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
796 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
797 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
798 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
799 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
800 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
801 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
802 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
803 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
804 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
805 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
806 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
807 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
808 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
809 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
810 GEN_VEXT_VV(vwaddu_vv_b, 2)
811 GEN_VEXT_VV(vwaddu_vv_h, 4)
812 GEN_VEXT_VV(vwaddu_vv_w, 8)
813 GEN_VEXT_VV(vwsubu_vv_b, 2)
814 GEN_VEXT_VV(vwsubu_vv_h, 4)
815 GEN_VEXT_VV(vwsubu_vv_w, 8)
816 GEN_VEXT_VV(vwadd_vv_b, 2)
817 GEN_VEXT_VV(vwadd_vv_h, 4)
818 GEN_VEXT_VV(vwadd_vv_w, 8)
819 GEN_VEXT_VV(vwsub_vv_b, 2)
820 GEN_VEXT_VV(vwsub_vv_h, 4)
821 GEN_VEXT_VV(vwsub_vv_w, 8)
822 GEN_VEXT_VV(vwaddu_wv_b, 2)
823 GEN_VEXT_VV(vwaddu_wv_h, 4)
824 GEN_VEXT_VV(vwaddu_wv_w, 8)
825 GEN_VEXT_VV(vwsubu_wv_b, 2)
826 GEN_VEXT_VV(vwsubu_wv_h, 4)
827 GEN_VEXT_VV(vwsubu_wv_w, 8)
828 GEN_VEXT_VV(vwadd_wv_b, 2)
829 GEN_VEXT_VV(vwadd_wv_h, 4)
830 GEN_VEXT_VV(vwadd_wv_w, 8)
831 GEN_VEXT_VV(vwsub_wv_b, 2)
832 GEN_VEXT_VV(vwsub_wv_h, 4)
833 GEN_VEXT_VV(vwsub_wv_w, 8)
834 
835 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
836 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
837 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
838 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
839 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
840 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
841 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
842 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
843 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
844 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
845 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
846 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
847 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
848 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
849 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
850 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
851 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
852 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
853 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
854 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
855 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
856 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
857 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
858 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
859 GEN_VEXT_VX(vwaddu_vx_b, 2)
860 GEN_VEXT_VX(vwaddu_vx_h, 4)
861 GEN_VEXT_VX(vwaddu_vx_w, 8)
862 GEN_VEXT_VX(vwsubu_vx_b, 2)
863 GEN_VEXT_VX(vwsubu_vx_h, 4)
864 GEN_VEXT_VX(vwsubu_vx_w, 8)
865 GEN_VEXT_VX(vwadd_vx_b, 2)
866 GEN_VEXT_VX(vwadd_vx_h, 4)
867 GEN_VEXT_VX(vwadd_vx_w, 8)
868 GEN_VEXT_VX(vwsub_vx_b, 2)
869 GEN_VEXT_VX(vwsub_vx_h, 4)
870 GEN_VEXT_VX(vwsub_vx_w, 8)
871 GEN_VEXT_VX(vwaddu_wx_b, 2)
872 GEN_VEXT_VX(vwaddu_wx_h, 4)
873 GEN_VEXT_VX(vwaddu_wx_w, 8)
874 GEN_VEXT_VX(vwsubu_wx_b, 2)
875 GEN_VEXT_VX(vwsubu_wx_h, 4)
876 GEN_VEXT_VX(vwsubu_wx_w, 8)
877 GEN_VEXT_VX(vwadd_wx_b, 2)
878 GEN_VEXT_VX(vwadd_wx_h, 4)
879 GEN_VEXT_VX(vwadd_wx_w, 8)
880 GEN_VEXT_VX(vwsub_wx_b, 2)
881 GEN_VEXT_VX(vwsub_wx_h, 4)
882 GEN_VEXT_VX(vwsub_wx_w, 8)
883 
884 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
885 #define DO_VADC(N, M, C) (N + M + C)
886 #define DO_VSBC(N, M, C) (N - M - C)
887 
888 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
889 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
890                   CPURISCVState *env, uint32_t desc)          \
891 {                                                             \
892     uint32_t vl = env->vl;                                    \
893     uint32_t esz = sizeof(ETYPE);                             \
894     uint32_t total_elems =                                    \
895         vext_get_total_elems(env, desc, esz);                 \
896     uint32_t vta = vext_vta(desc);                            \
897     uint32_t i;                                               \
898                                                               \
899     VSTART_CHECK_EARLY_EXIT(env);                             \
900                                                               \
901     for (i = env->vstart; i < vl; i++) {                      \
902         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
903         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
904         ETYPE carry = vext_elem_mask(v0, i);                  \
905                                                               \
906         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
907     }                                                         \
908     env->vstart = 0;                                          \
909     /* set tail elements to 1s */                             \
910     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
911 }
912 
913 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
914 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
915 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
916 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
917 
918 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
919 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
920 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
921 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
922 
923 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
924 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
925                   CPURISCVState *env, uint32_t desc)                     \
926 {                                                                        \
927     uint32_t vl = env->vl;                                               \
928     uint32_t esz = sizeof(ETYPE);                                        \
929     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
930     uint32_t vta = vext_vta(desc);                                       \
931     uint32_t i;                                                          \
932                                                                          \
933     VSTART_CHECK_EARLY_EXIT(env);                                        \
934                                                                          \
935     for (i = env->vstart; i < vl; i++) {                                 \
936         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
937         ETYPE carry = vext_elem_mask(v0, i);                             \
938                                                                          \
939         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
940     }                                                                    \
941     env->vstart = 0;                                                     \
942     /* set tail elements to 1s */                                        \
943     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
944 }
945 
946 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
947 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
948 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
949 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
950 
951 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
952 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
953 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
954 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
955 
956 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
957                           (__typeof(N))(N + M) < N)
958 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
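/*
 * Illustrative worked example (not in the original source): for uint8_t
 * operands N = 200, M = 100 and C = 0, DO_MADC evaluates
 * (uint8_t)(200 + 100) = 44 < 200, i.e. the wrap-around is used to detect the
 * unsigned carry-out; DO_MSBC likewise reports a borrow when the minuend is
 * smaller than the subtrahend (plus the carry-in).
 */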
959 
960 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
961 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
962                   CPURISCVState *env, uint32_t desc)          \
963 {                                                             \
964     uint32_t vl = env->vl;                                    \
965     uint32_t vm = vext_vm(desc);                              \
966     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
967     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
968     uint32_t i;                                               \
969                                                               \
970     VSTART_CHECK_EARLY_EXIT(env);                             \
971                                                               \
972     for (i = env->vstart; i < vl; i++) {                      \
973         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
974         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
975         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
976         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
977     }                                                         \
978     env->vstart = 0;                                          \
979     /*
980      * the mask destination register is always tail-agnostic;
981      * set its tail elements to 1s
982      */                                                       \
983     if (vta_all_1s) {                                         \
984         for (; i < total_elems; i++) {                        \
985             vext_set_elem_mask(vd, i, 1);                     \
986         }                                                     \
987     }                                                         \
988 }
989 
990 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
991 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
992 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
993 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
994 
995 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
996 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
997 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
998 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
999 
1000 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1001 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1002                   void *vs2, CPURISCVState *env, uint32_t desc) \
1003 {                                                               \
1004     uint32_t vl = env->vl;                                      \
1005     uint32_t vm = vext_vm(desc);                                \
1006     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1007     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1008     uint32_t i;                                                 \
1009                                                                 \
1010     VSTART_CHECK_EARLY_EXIT(env);                               \
1011                                                                 \
1012     for (i = env->vstart; i < vl; i++) {                        \
1013         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1014         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1015         vext_set_elem_mask(vd, i,                               \
1016                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1017     }                                                           \
1018     env->vstart = 0;                                            \
1019     /*
1020      * the mask destination register is always tail-agnostic;
1021      * set its tail elements to 1s
1022      */                                                         \
1023     if (vta_all_1s) {                                           \
1024         for (; i < total_elems; i++) {                          \
1025             vext_set_elem_mask(vd, i, 1);                       \
1026         }                                                       \
1027     }                                                           \
1028 }
1029 
1030 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1031 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1032 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1033 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1034 
1035 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1036 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1037 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1038 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1039 
1040 /* Vector Bitwise Logical Instructions */
1041 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1042 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1043 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1044 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1045 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1046 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1047 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1048 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1049 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1050 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1051 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1052 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1053 GEN_VEXT_VV(vand_vv_b, 1)
1054 GEN_VEXT_VV(vand_vv_h, 2)
1055 GEN_VEXT_VV(vand_vv_w, 4)
1056 GEN_VEXT_VV(vand_vv_d, 8)
1057 GEN_VEXT_VV(vor_vv_b, 1)
1058 GEN_VEXT_VV(vor_vv_h, 2)
1059 GEN_VEXT_VV(vor_vv_w, 4)
1060 GEN_VEXT_VV(vor_vv_d, 8)
1061 GEN_VEXT_VV(vxor_vv_b, 1)
1062 GEN_VEXT_VV(vxor_vv_h, 2)
1063 GEN_VEXT_VV(vxor_vv_w, 4)
1064 GEN_VEXT_VV(vxor_vv_d, 8)
1065 
1066 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1067 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1068 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1069 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1070 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1071 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1072 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1073 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1074 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1075 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1076 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1077 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1078 GEN_VEXT_VX(vand_vx_b, 1)
1079 GEN_VEXT_VX(vand_vx_h, 2)
1080 GEN_VEXT_VX(vand_vx_w, 4)
1081 GEN_VEXT_VX(vand_vx_d, 8)
1082 GEN_VEXT_VX(vor_vx_b, 1)
1083 GEN_VEXT_VX(vor_vx_h, 2)
1084 GEN_VEXT_VX(vor_vx_w, 4)
1085 GEN_VEXT_VX(vor_vx_d, 8)
1086 GEN_VEXT_VX(vxor_vx_b, 1)
1087 GEN_VEXT_VX(vxor_vx_h, 2)
1088 GEN_VEXT_VX(vxor_vx_w, 4)
1089 GEN_VEXT_VX(vxor_vx_d, 8)
1090 
1091 /* Vector Single-Width Bit Shift Instructions */
1092 #define DO_SLL(N, M)  (N << (M))
1093 #define DO_SRL(N, M)  (N >> (M))
1094 
1095 /* generate the helpers for shift instructions with two vector operands */
1096 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1097 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1098                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1099 {                                                                         \
1100     uint32_t vm = vext_vm(desc);                                          \
1101     uint32_t vl = env->vl;                                                \
1102     uint32_t esz = sizeof(TS1);                                           \
1103     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1104     uint32_t vta = vext_vta(desc);                                        \
1105     uint32_t vma = vext_vma(desc);                                        \
1106     uint32_t i;                                                           \
1107                                                                           \
1108     VSTART_CHECK_EARLY_EXIT(env);                                         \
1109                                                                           \
1110     for (i = env->vstart; i < vl; i++) {                                  \
1111         if (!vm && !vext_elem_mask(v0, i)) {                              \
1112             /* set masked-off elements to 1s */                           \
1113             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1114             continue;                                                     \
1115         }                                                                 \
1116         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1117         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1118         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1119     }                                                                     \
1120     env->vstart = 0;                                                      \
1121     /* set tail elements to 1s */                                         \
1122     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1123 }
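/*
 * Editorial note (hedged, not in the original source): the shift amount is
 * masked to log2(SEW) bits (e.g. MASK = 0x1f for 32-bit elements), and the
 * vsra variants reuse DO_SRL with a signed TS2 so that the compiler's right
 * shift of a signed value provides the arithmetic shift.
 */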
1124 
1125 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1126 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1127 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1128 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1129 
1130 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1131 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1132 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1133 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1134 
1135 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1136 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1137 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1138 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1139 
1140 /*
1141  * generate the helpers for shift instructions with one vector and one scalar operand
1142  */
1143 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1144 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1145                   void *vs2, CPURISCVState *env,            \
1146                   uint32_t desc)                            \
1147 {                                                           \
1148     uint32_t vm = vext_vm(desc);                            \
1149     uint32_t vl = env->vl;                                  \
1150     uint32_t esz = sizeof(TD);                              \
1151     uint32_t total_elems =                                  \
1152         vext_get_total_elems(env, desc, esz);               \
1153     uint32_t vta = vext_vta(desc);                          \
1154     uint32_t vma = vext_vma(desc);                          \
1155     uint32_t i;                                             \
1156                                                             \
1157     VSTART_CHECK_EARLY_EXIT(env);                           \
1158                                                             \
1159     for (i = env->vstart; i < vl; i++) {                    \
1160         if (!vm && !vext_elem_mask(v0, i)) {                \
1161             /* set masked-off elements to 1s */             \
1162             vext_set_elems_1s(vd, vma, i * esz,             \
1163                               (i + 1) * esz);               \
1164             continue;                                       \
1165         }                                                   \
1166         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1167         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1168     }                                                       \
1169     env->vstart = 0;                                        \
1170     /* set tail elements to 1s */                           \
1171     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1172 }
1173 
1174 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1175 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1176 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1177 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1178 
1179 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1180 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1181 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1182 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1183 
1184 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1185 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1186 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1187 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1188 
1189 /* Vector Narrowing Integer Right Shift Instructions */
1190 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1191 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1192 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1193 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1194 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1195 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1196 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1197 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1198 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1199 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1200 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1201 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1202 
1203 /* Vector Integer Comparison Instructions */
1204 #define DO_MSEQ(N, M) (N == M)
1205 #define DO_MSNE(N, M) (N != M)
1206 #define DO_MSLT(N, M) (N < M)
1207 #define DO_MSLE(N, M) (N <= M)
1208 #define DO_MSGT(N, M) (N > M)
1209 
1210 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1211 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1212                   CPURISCVState *env, uint32_t desc)          \
1213 {                                                             \
1214     uint32_t vm = vext_vm(desc);                              \
1215     uint32_t vl = env->vl;                                    \
1216     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1217     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1218     uint32_t vma = vext_vma(desc);                            \
1219     uint32_t i;                                               \
1220                                                               \
1221     VSTART_CHECK_EARLY_EXIT(env);                             \
1222                                                               \
1223     for (i = env->vstart; i < vl; i++) {                      \
1224         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1225         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1226         if (!vm && !vext_elem_mask(v0, i)) {                  \
1227             /* set masked-off elements to 1s */               \
1228             if (vma) {                                        \
1229                 vext_set_elem_mask(vd, i, 1);                 \
1230             }                                                 \
1231             continue;                                         \
1232         }                                                     \
1233         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1234     }                                                         \
1235     env->vstart = 0;                                          \
1236     /*
1237      * mask destination registers are always tail-agnostic,
1238      * so set tail elements to 1s
1239      */                                                       \
1240     if (vta_all_1s) {                                         \
1241         for (; i < total_elems; i++) {                        \
1242             vext_set_elem_mask(vd, i, 1);                     \
1243         }                                                     \
1244     }                                                         \
1245 }
1246 
1247 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1248 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1249 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1250 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1251 
1252 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1253 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1254 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1255 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1256 
1257 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1258 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1259 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1260 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1261 
1262 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1263 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1264 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1265 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1266 
1267 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1268 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1269 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1270 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1271 
1272 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1273 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1274 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1275 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1276 
1277 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1278 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1279                   CPURISCVState *env, uint32_t desc)                \
1280 {                                                                   \
1281     uint32_t vm = vext_vm(desc);                                    \
1282     uint32_t vl = env->vl;                                          \
1283     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1284     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1285     uint32_t vma = vext_vma(desc);                                  \
1286     uint32_t i;                                                     \
1287                                                                     \
1288     VSTART_CHECK_EARLY_EXIT(env);                                   \
1289                                                                     \
1290     for (i = env->vstart; i < vl; i++) {                            \
1291         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1292         if (!vm && !vext_elem_mask(v0, i)) {                        \
1293             /* set masked-off elements to 1s */                     \
1294             if (vma) {                                              \
1295                 vext_set_elem_mask(vd, i, 1);                       \
1296             }                                                       \
1297             continue;                                               \
1298         }                                                           \
1299         vext_set_elem_mask(vd, i,                                   \
1300                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1301     }                                                               \
1302     env->vstart = 0;                                                \
1303     /*
1304      * mask destination registers are always tail-agnostic,
1305      * so set tail elements to 1s
1306      */                                                             \
1307     if (vta_all_1s) {                                               \
1308         for (; i < total_elems; i++) {                              \
1309             vext_set_elem_mask(vd, i, 1);                           \
1310         }                                                           \
1311     }                                                               \
1312 }
1313 
1314 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1315 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1316 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1317 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1318 
1319 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1320 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1321 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1322 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1323 
1324 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1325 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1326 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1327 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1328 
1329 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1330 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1331 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1332 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1333 
1334 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1335 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1336 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1337 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1338 
1339 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1340 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1341 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1342 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1343 
1344 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1345 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1346 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1347 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1348 
1349 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1350 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1351 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1352 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1353 
1354 /* Vector Integer Min/Max Instructions */
1355 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1356 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1357 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1358 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1359 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1360 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1361 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1362 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1363 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1364 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1365 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1366 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1367 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1368 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1369 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1370 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1371 GEN_VEXT_VV(vminu_vv_b, 1)
1372 GEN_VEXT_VV(vminu_vv_h, 2)
1373 GEN_VEXT_VV(vminu_vv_w, 4)
1374 GEN_VEXT_VV(vminu_vv_d, 8)
1375 GEN_VEXT_VV(vmin_vv_b, 1)
1376 GEN_VEXT_VV(vmin_vv_h, 2)
1377 GEN_VEXT_VV(vmin_vv_w, 4)
1378 GEN_VEXT_VV(vmin_vv_d, 8)
1379 GEN_VEXT_VV(vmaxu_vv_b, 1)
1380 GEN_VEXT_VV(vmaxu_vv_h, 2)
1381 GEN_VEXT_VV(vmaxu_vv_w, 4)
1382 GEN_VEXT_VV(vmaxu_vv_d, 8)
1383 GEN_VEXT_VV(vmax_vv_b, 1)
1384 GEN_VEXT_VV(vmax_vv_h, 2)
1385 GEN_VEXT_VV(vmax_vv_w, 4)
1386 GEN_VEXT_VV(vmax_vv_d, 8)
1387 
1388 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1389 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1390 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1391 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1392 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1393 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1394 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1395 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1396 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1397 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1398 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1399 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1400 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1401 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1402 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1403 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1404 GEN_VEXT_VX(vminu_vx_b, 1)
1405 GEN_VEXT_VX(vminu_vx_h, 2)
1406 GEN_VEXT_VX(vminu_vx_w, 4)
1407 GEN_VEXT_VX(vminu_vx_d, 8)
1408 GEN_VEXT_VX(vmin_vx_b, 1)
1409 GEN_VEXT_VX(vmin_vx_h, 2)
1410 GEN_VEXT_VX(vmin_vx_w, 4)
1411 GEN_VEXT_VX(vmin_vx_d, 8)
1412 GEN_VEXT_VX(vmaxu_vx_b, 1)
1413 GEN_VEXT_VX(vmaxu_vx_h, 2)
1414 GEN_VEXT_VX(vmaxu_vx_w, 4)
1415 GEN_VEXT_VX(vmaxu_vx_d, 8)
1416 GEN_VEXT_VX(vmax_vx_b, 1)
1417 GEN_VEXT_VX(vmax_vx_h, 2)
1418 GEN_VEXT_VX(vmax_vx_w, 4)
1419 GEN_VEXT_VX(vmax_vx_d, 8)
1420 
1421 /* Vector Single-Width Integer Multiply Instructions */
1422 #define DO_MUL(N, M) (N * M)
1423 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1424 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1425 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1426 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1427 GEN_VEXT_VV(vmul_vv_b, 1)
1428 GEN_VEXT_VV(vmul_vv_h, 2)
1429 GEN_VEXT_VV(vmul_vv_w, 4)
1430 GEN_VEXT_VV(vmul_vv_d, 8)
1431 
1432 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1433 {
1434     return (int16_t)s2 * (int16_t)s1 >> 8;
1435 }
1436 
1437 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1438 {
1439     return (int32_t)s2 * (int32_t)s1 >> 16;
1440 }
1441 
1442 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1443 {
1444     return (int64_t)s2 * (int64_t)s1 >> 32;
1445 }
1446 
1447 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1448 {
1449     uint64_t hi_64, lo_64;
1450 
1451     muls64(&lo_64, &hi_64, s1, s2);
1452     return hi_64;
1453 }
1454 
1455 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1456 {
1457     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1458 }
1459 
1460 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1461 {
1462     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1463 }
1464 
1465 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1466 {
1467     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1468 }
1469 
1470 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1471 {
1472     uint64_t hi_64, lo_64;
1473 
1474     mulu64(&lo_64, &hi_64, s2, s1);
1475     return hi_64;
1476 }
1477 
1478 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1479 {
1480     return (int16_t)s2 * (uint16_t)s1 >> 8;
1481 }
1482 
1483 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1484 {
1485     return (int32_t)s2 * (uint32_t)s1 >> 16;
1486 }
1487 
1488 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1489 {
1490     return (int64_t)s2 * (uint64_t)s1 >> 32;
1491 }
1492 
1493 /*
1494  * Let  A = signed operand,
1495  *      B = unsigned operand,
1496  *      P = mulu64(A, B), the unsigned 128-bit product,
1497  *      SP = A * B, the signed 128-bit product.
1498  *
1499  * mulu64() reads a negative A as A + 2 ** 64, so
1500  *
1501  *      IF A < 0
1502  *          P  = (A + 2 ** 64) * B
1503  *             = A * B + 2 ** 64 * B
1504  *          SP = A * B
1505  *             = P - 2 ** 64 * B
1506  *      ELSE
1507  *          SP = P
1508  *
1509  * i.e. HI_P -= (A < 0 ? B : 0), with the low 64 bits unchanged.
1510  */
1511 
1512 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1513 {
1514     uint64_t hi_64, lo_64;
1515 
1516     mulu64(&lo_64, &hi_64, s2, s1);
1517 
1518     hi_64 -= s2 < 0 ? s1 : 0;
1519     return hi_64;
1520 }
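
/*
 * Illustrative sketch (not part of the generated helpers): the correction
 * above on two concrete inputs, assuming assert() is available.  With
 * s2 = -1, mulu64() sees 2^64 - 1, so the raw hi_64 is too large by exactly
 * s1 (here 3); a non-negative s2 needs no correction.
 */
#if 0   /* illustration only */
static void do_mulhsu_d_example(void)
{
    assert(do_mulhsu_d(-1, 3) == -1);          /* high half of -3            */
    assert(do_mulhsu_d(3, UINT64_MAX) == 2);   /* high half of 3 * (2^64-1)  */
}
#endif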
1521 
1522 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1523 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1524 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1525 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1526 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1527 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1528 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1529 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1530 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1531 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1532 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1533 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1534 GEN_VEXT_VV(vmulh_vv_b, 1)
1535 GEN_VEXT_VV(vmulh_vv_h, 2)
1536 GEN_VEXT_VV(vmulh_vv_w, 4)
1537 GEN_VEXT_VV(vmulh_vv_d, 8)
1538 GEN_VEXT_VV(vmulhu_vv_b, 1)
1539 GEN_VEXT_VV(vmulhu_vv_h, 2)
1540 GEN_VEXT_VV(vmulhu_vv_w, 4)
1541 GEN_VEXT_VV(vmulhu_vv_d, 8)
1542 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1543 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1544 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1545 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1546 
1547 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1548 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1549 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1550 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1551 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1552 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1553 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1554 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1555 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1556 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1557 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1558 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1559 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1560 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1561 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1562 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1563 GEN_VEXT_VX(vmul_vx_b, 1)
1564 GEN_VEXT_VX(vmul_vx_h, 2)
1565 GEN_VEXT_VX(vmul_vx_w, 4)
1566 GEN_VEXT_VX(vmul_vx_d, 8)
1567 GEN_VEXT_VX(vmulh_vx_b, 1)
1568 GEN_VEXT_VX(vmulh_vx_h, 2)
1569 GEN_VEXT_VX(vmulh_vx_w, 4)
1570 GEN_VEXT_VX(vmulh_vx_d, 8)
1571 GEN_VEXT_VX(vmulhu_vx_b, 1)
1572 GEN_VEXT_VX(vmulhu_vx_h, 2)
1573 GEN_VEXT_VX(vmulhu_vx_w, 4)
1574 GEN_VEXT_VX(vmulhu_vx_d, 8)
1575 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1576 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1577 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1578 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1579 
1580 /* Vector Integer Divide Instructions */
1581 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1582 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1583 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1584         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1585 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1586         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
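
/*
 * Illustrative sketch (not part of the generated helpers): the corner-case
 * results the macros above produce, matching the RVV-mandated behaviour.
 * Assumes assert() is available and 32-bit int operands.
 */
#if 0   /* illustration only */
static void do_div_corner_cases(void)
{
    int32_t min = INT32_MIN;

    assert(DO_DIVU(5u, 0u) == UINT32_MAX);  /* divide by zero -> all ones    */
    assert(DO_REMU(5u, 0u) == 5u);          /* remainder by zero -> dividend */
    assert(DO_DIV(7, 0) == -1);             /* signed divide by zero         */
    assert(DO_DIV(min, -1) == INT32_MIN);   /* overflow -> INT_MIN           */
    assert(DO_REM(min, -1) == 0);           /* overflow remainder -> 0       */
}
#endif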
1587 
1588 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1589 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1590 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1591 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1592 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1593 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1594 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1595 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1596 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1597 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1598 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1599 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1600 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1601 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1602 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1603 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1604 GEN_VEXT_VV(vdivu_vv_b, 1)
1605 GEN_VEXT_VV(vdivu_vv_h, 2)
1606 GEN_VEXT_VV(vdivu_vv_w, 4)
1607 GEN_VEXT_VV(vdivu_vv_d, 8)
1608 GEN_VEXT_VV(vdiv_vv_b, 1)
1609 GEN_VEXT_VV(vdiv_vv_h, 2)
1610 GEN_VEXT_VV(vdiv_vv_w, 4)
1611 GEN_VEXT_VV(vdiv_vv_d, 8)
1612 GEN_VEXT_VV(vremu_vv_b, 1)
1613 GEN_VEXT_VV(vremu_vv_h, 2)
1614 GEN_VEXT_VV(vremu_vv_w, 4)
1615 GEN_VEXT_VV(vremu_vv_d, 8)
1616 GEN_VEXT_VV(vrem_vv_b, 1)
1617 GEN_VEXT_VV(vrem_vv_h, 2)
1618 GEN_VEXT_VV(vrem_vv_w, 4)
1619 GEN_VEXT_VV(vrem_vv_d, 8)
1620 
1621 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1622 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1623 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1624 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1625 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1626 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1627 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1628 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1629 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1630 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1631 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1632 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1633 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1634 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1635 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1636 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1637 GEN_VEXT_VX(vdivu_vx_b, 1)
1638 GEN_VEXT_VX(vdivu_vx_h, 2)
1639 GEN_VEXT_VX(vdivu_vx_w, 4)
1640 GEN_VEXT_VX(vdivu_vx_d, 8)
1641 GEN_VEXT_VX(vdiv_vx_b, 1)
1642 GEN_VEXT_VX(vdiv_vx_h, 2)
1643 GEN_VEXT_VX(vdiv_vx_w, 4)
1644 GEN_VEXT_VX(vdiv_vx_d, 8)
1645 GEN_VEXT_VX(vremu_vx_b, 1)
1646 GEN_VEXT_VX(vremu_vx_h, 2)
1647 GEN_VEXT_VX(vremu_vx_w, 4)
1648 GEN_VEXT_VX(vremu_vx_d, 8)
1649 GEN_VEXT_VX(vrem_vx_b, 1)
1650 GEN_VEXT_VX(vrem_vx_h, 2)
1651 GEN_VEXT_VX(vrem_vx_w, 4)
1652 GEN_VEXT_VX(vrem_vx_d, 8)
1653 
1654 /* Vector Widening Integer Multiply Instructions */
1655 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1656 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1657 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1658 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1659 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1660 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1661 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1662 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1663 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1664 GEN_VEXT_VV(vwmul_vv_b, 2)
1665 GEN_VEXT_VV(vwmul_vv_h, 4)
1666 GEN_VEXT_VV(vwmul_vv_w, 8)
1667 GEN_VEXT_VV(vwmulu_vv_b, 2)
1668 GEN_VEXT_VV(vwmulu_vv_h, 4)
1669 GEN_VEXT_VV(vwmulu_vv_w, 8)
1670 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1671 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1672 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1673 
1674 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1675 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1676 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1677 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1678 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1679 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1680 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1681 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1682 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1683 GEN_VEXT_VX(vwmul_vx_b, 2)
1684 GEN_VEXT_VX(vwmul_vx_h, 4)
1685 GEN_VEXT_VX(vwmul_vx_w, 8)
1686 GEN_VEXT_VX(vwmulu_vx_b, 2)
1687 GEN_VEXT_VX(vwmulu_vx_h, 4)
1688 GEN_VEXT_VX(vwmulu_vx_w, 8)
1689 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1690 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1691 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1692 
1693 /* Vector Single-Width Integer Multiply-Add Instructions */
1694 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1695 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1696 {                                                                  \
1697     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1698     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1699     TD d = *((TD *)vd + HD(i));                                    \
1700     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1701 }
1702 
1703 #define DO_MACC(N, M, D) (M * N + D)
1704 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1705 #define DO_MADD(N, M, D) (M * D + N)
1706 #define DO_NMSUB(N, M, D) (-(M * D) + N)
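/*
 * Note: OPIVV3 above and OPIVX3 below invoke OP(s2, s1, d), so N is the vs2
 * element, M is the vs1/rs1 operand and D is the vd (accumulator) element.
 * Hence DO_MACC computes vd = vs1 * vs2 + vd, while DO_MADD computes
 * vd = vs1 * vd + vs2, matching the vmacc/vmadd definitions.
 */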
1707 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1708 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1709 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1710 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1711 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1712 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1713 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1714 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1715 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1716 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1717 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1718 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1719 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1720 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1721 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1722 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1723 GEN_VEXT_VV(vmacc_vv_b, 1)
1724 GEN_VEXT_VV(vmacc_vv_h, 2)
1725 GEN_VEXT_VV(vmacc_vv_w, 4)
1726 GEN_VEXT_VV(vmacc_vv_d, 8)
1727 GEN_VEXT_VV(vnmsac_vv_b, 1)
1728 GEN_VEXT_VV(vnmsac_vv_h, 2)
1729 GEN_VEXT_VV(vnmsac_vv_w, 4)
1730 GEN_VEXT_VV(vnmsac_vv_d, 8)
1731 GEN_VEXT_VV(vmadd_vv_b, 1)
1732 GEN_VEXT_VV(vmadd_vv_h, 2)
1733 GEN_VEXT_VV(vmadd_vv_w, 4)
1734 GEN_VEXT_VV(vmadd_vv_d, 8)
1735 GEN_VEXT_VV(vnmsub_vv_b, 1)
1736 GEN_VEXT_VV(vnmsub_vv_h, 2)
1737 GEN_VEXT_VV(vnmsub_vv_w, 4)
1738 GEN_VEXT_VV(vnmsub_vv_d, 8)
1739 
1740 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1741 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1742 {                                                                   \
1743     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1744     TD d = *((TD *)vd + HD(i));                                     \
1745     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1746 }
1747 
1748 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1749 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1750 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1751 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1752 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1753 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1754 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1755 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1756 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1757 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1758 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1759 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1760 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1761 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1762 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1763 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1764 GEN_VEXT_VX(vmacc_vx_b, 1)
1765 GEN_VEXT_VX(vmacc_vx_h, 2)
1766 GEN_VEXT_VX(vmacc_vx_w, 4)
1767 GEN_VEXT_VX(vmacc_vx_d, 8)
1768 GEN_VEXT_VX(vnmsac_vx_b, 1)
1769 GEN_VEXT_VX(vnmsac_vx_h, 2)
1770 GEN_VEXT_VX(vnmsac_vx_w, 4)
1771 GEN_VEXT_VX(vnmsac_vx_d, 8)
1772 GEN_VEXT_VX(vmadd_vx_b, 1)
1773 GEN_VEXT_VX(vmadd_vx_h, 2)
1774 GEN_VEXT_VX(vmadd_vx_w, 4)
1775 GEN_VEXT_VX(vmadd_vx_d, 8)
1776 GEN_VEXT_VX(vnmsub_vx_b, 1)
1777 GEN_VEXT_VX(vnmsub_vx_h, 2)
1778 GEN_VEXT_VX(vnmsub_vx_w, 4)
1779 GEN_VEXT_VX(vnmsub_vx_d, 8)
1780 
1781 /* Vector Widening Integer Multiply-Add Instructions */
1782 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1783 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1784 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1785 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1786 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1787 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1788 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1789 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1790 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1791 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1792 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1793 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1794 GEN_VEXT_VV(vwmacc_vv_b, 2)
1795 GEN_VEXT_VV(vwmacc_vv_h, 4)
1796 GEN_VEXT_VV(vwmacc_vv_w, 8)
1797 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1798 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1799 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1800 
1801 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1802 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1803 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1804 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1805 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1806 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1807 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1808 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1809 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1810 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1811 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1812 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1813 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1814 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1815 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1816 GEN_VEXT_VX(vwmacc_vx_b, 2)
1817 GEN_VEXT_VX(vwmacc_vx_h, 4)
1818 GEN_VEXT_VX(vwmacc_vx_w, 8)
1819 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1820 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1821 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1822 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1823 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1824 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1825 
1826 /* Vector Integer Merge and Move Instructions */
1827 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1828 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1829                   uint32_t desc)                                     \
1830 {                                                                    \
1831     uint32_t vl = env->vl;                                           \
1832     uint32_t esz = sizeof(ETYPE);                                    \
1833     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1834     uint32_t vta = vext_vta(desc);                                   \
1835     uint32_t i;                                                      \
1836                                                                      \
1837     VSTART_CHECK_EARLY_EXIT(env);                                    \
1838                                                                      \
1839     for (i = env->vstart; i < vl; i++) {                             \
1840         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1841         *((ETYPE *)vd + H(i)) = s1;                                  \
1842     }                                                                \
1843     env->vstart = 0;                                                 \
1844     /* set tail elements to 1s */                                    \
1845     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1846 }
1847 
1848 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1849 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1850 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1851 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1852 
1853 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1854 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1855                   uint32_t desc)                                     \
1856 {                                                                    \
1857     uint32_t vl = env->vl;                                           \
1858     uint32_t esz = sizeof(ETYPE);                                    \
1859     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1860     uint32_t vta = vext_vta(desc);                                   \
1861     uint32_t i;                                                      \
1862                                                                      \
1863     VSTART_CHECK_EARLY_EXIT(env);                                    \
1864                                                                      \
1865     for (i = env->vstart; i < vl; i++) {                             \
1866         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1867     }                                                                \
1868     env->vstart = 0;                                                 \
1869     /* set tail elements to 1s */                                    \
1870     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1871 }
1872 
1873 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1874 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1875 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1876 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1877 
1878 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1879 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1880                   CPURISCVState *env, uint32_t desc)                 \
1881 {                                                                    \
1882     uint32_t vl = env->vl;                                           \
1883     uint32_t esz = sizeof(ETYPE);                                    \
1884     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1885     uint32_t vta = vext_vta(desc);                                   \
1886     uint32_t i;                                                      \
1887                                                                      \
1888     VSTART_CHECK_EARLY_EXIT(env);                                    \
1889                                                                      \
1890     for (i = env->vstart; i < vl; i++) {                             \
1891         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1892         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1893     }                                                                \
1894     env->vstart = 0;                                                 \
1895     /* set tail elements to 1s */                                    \
1896     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1897 }
1898 
1899 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1900 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1901 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1902 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1903 
1904 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1905 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1906                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1907 {                                                                    \
1908     uint32_t vl = env->vl;                                           \
1909     uint32_t esz = sizeof(ETYPE);                                    \
1910     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1911     uint32_t vta = vext_vta(desc);                                   \
1912     uint32_t i;                                                      \
1913                                                                      \
1914     VSTART_CHECK_EARLY_EXIT(env);                                    \
1915                                                                      \
1916     for (i = env->vstart; i < vl; i++) {                             \
1917         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
1918         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
1919                    (ETYPE)(target_long)s1);                          \
1920         *((ETYPE *)vd + H(i)) = d;                                   \
1921     }                                                                \
1922     env->vstart = 0;                                                 \
1923     /* set tail elements to 1s */                                    \
1924     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1925 }
1926 
1927 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
1928 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
1929 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
1930 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1931 
1932 /*
1933  * Vector Fixed-Point Arithmetic Instructions
1934  */
1935 
1936 /* Vector Single-Width Saturating Add and Subtract */
1937 
1938 /*
1939  * Fixed-point instructions need both a rounding mode and saturation,
1940  * so define common macros for fixed point here.
1941  */
1942 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
1943                           CPURISCVState *env, int vxrm);
1944 
1945 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
1946 static inline void                                                  \
1947 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
1948           CPURISCVState *env, int vxrm)                             \
1949 {                                                                   \
1950     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
1951     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1952     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
1953 }
1954 
1955 static inline void
1956 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1957              CPURISCVState *env,
1958              uint32_t vl, uint32_t vm, int vxrm,
1959              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
1960 {
1961     VSTART_CHECK_EARLY_EXIT(env);
1962 
1963     for (uint32_t i = env->vstart; i < vl; i++) {
1964         if (!vm && !vext_elem_mask(v0, i)) {
1965             /* set masked-off elements to 1s */
1966             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
1967             continue;
1968         }
1969         fn(vd, vs1, vs2, i, env, vxrm);
1970     }
1971     env->vstart = 0;
1972 }
1973 
1974 static inline void
1975 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
1976              CPURISCVState *env,
1977              uint32_t desc,
1978              opivv2_rm_fn *fn, uint32_t esz)
1979 {
1980     uint32_t vm = vext_vm(desc);
1981     uint32_t vl = env->vl;
1982     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
1983     uint32_t vta = vext_vta(desc);
1984     uint32_t vma = vext_vma(desc);
1985 
1986     switch (env->vxrm) {
1987     case 0: /* rnu */
1988         vext_vv_rm_1(vd, v0, vs1, vs2,
1989                      env, vl, vm, 0, fn, vma, esz);
1990         break;
1991     case 1: /* rne */
1992         vext_vv_rm_1(vd, v0, vs1, vs2,
1993                      env, vl, vm, 1, fn, vma, esz);
1994         break;
1995     case 2: /* rdn */
1996         vext_vv_rm_1(vd, v0, vs1, vs2,
1997                      env, vl, vm, 2, fn, vma, esz);
1998         break;
1999     default: /* rod */
2000         vext_vv_rm_1(vd, v0, vs1, vs2,
2001                      env, vl, vm, 3, fn, vma, esz);
2002         break;
2003     }
2004     /* set tail elements to 1s */
2005     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2006 }
2007 
2008 /* generate helpers for fixed point instructions with OPIVV format */
2009 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2010 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2011                   CPURISCVState *env, uint32_t desc)            \
2012 {                                                               \
2013     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2014                  do_##NAME, ESZ);                               \
2015 }
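
/*
 * Taken together: RVVCALL(OPIVV2_RM, NAME, ...) instantiates OPIVV2_RM to
 * define the per-element do_NAME() worker, vext_vv_rm_2() selects the
 * rounding mode from env->vxrm and walks the active elements, and
 * GEN_VEXT_VV_RM(NAME, ESZ) emits the HELPER(NAME) entry point that ties
 * the two together.
 */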
2016 
2017 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2018                              uint8_t b)
2019 {
2020     uint8_t res = a + b;
2021     if (res < a) {
2022         res = UINT8_MAX;
2023         env->vxsat = 0x1;
2024     }
2025     return res;
2026 }
2027 
2028 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2029                                uint16_t b)
2030 {
2031     uint16_t res = a + b;
2032     if (res < a) {
2033         res = UINT16_MAX;
2034         env->vxsat = 0x1;
2035     }
2036     return res;
2037 }
2038 
2039 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2040                                uint32_t b)
2041 {
2042     uint32_t res = a + b;
2043     if (res < a) {
2044         res = UINT32_MAX;
2045         env->vxsat = 0x1;
2046     }
2047     return res;
2048 }
2049 
2050 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2051                                uint64_t b)
2052 {
2053     uint64_t res = a + b;
2054     if (res < a) {
2055         res = UINT64_MAX;
2056         env->vxsat = 0x1;
2057     }
2058     return res;
2059 }
2060 
2061 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2062 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2063 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2064 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2065 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2066 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2067 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2068 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2069 
2070 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2071                           CPURISCVState *env, int vxrm);
2072 
2073 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2074 static inline void                                                  \
2075 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2076           CPURISCVState *env, int vxrm)                             \
2077 {                                                                   \
2078     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2079     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2080 }
2081 
2082 static inline void
2083 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2084              CPURISCVState *env,
2085              uint32_t vl, uint32_t vm, int vxrm,
2086              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2087 {
2088     VSTART_CHECK_EARLY_EXIT(env);
2089 
2090     for (uint32_t i = env->vstart; i < vl; i++) {
2091         if (!vm && !vext_elem_mask(v0, i)) {
2092             /* set masked-off elements to 1s */
2093             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2094             continue;
2095         }
2096         fn(vd, s1, vs2, i, env, vxrm);
2097     }
2098     env->vstart = 0;
2099 }
2100 
2101 static inline void
2102 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2103              CPURISCVState *env,
2104              uint32_t desc,
2105              opivx2_rm_fn *fn, uint32_t esz)
2106 {
2107     uint32_t vm = vext_vm(desc);
2108     uint32_t vl = env->vl;
2109     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2110     uint32_t vta = vext_vta(desc);
2111     uint32_t vma = vext_vma(desc);
2112 
2113     switch (env->vxrm) {
2114     case 0: /* rnu */
2115         vext_vx_rm_1(vd, v0, s1, vs2,
2116                      env, vl, vm, 0, fn, vma, esz);
2117         break;
2118     case 1: /* rne */
2119         vext_vx_rm_1(vd, v0, s1, vs2,
2120                      env, vl, vm, 1, fn, vma, esz);
2121         break;
2122     case 2: /* rdn */
2123         vext_vx_rm_1(vd, v0, s1, vs2,
2124                      env, vl, vm, 2, fn, vma, esz);
2125         break;
2126     default: /* rod */
2127         vext_vx_rm_1(vd, v0, s1, vs2,
2128                      env, vl, vm, 3, fn, vma, esz);
2129         break;
2130     }
2131     /* set tail elements to 1s */
2132     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2133 }
2134 
2135 /* generate helpers for fixed point instructions with OPIVX format */
2136 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2137 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2138                   void *vs2, CPURISCVState *env,          \
2139                   uint32_t desc)                          \
2140 {                                                         \
2141     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2142                  do_##NAME, ESZ);                         \
2143 }
2144 
2145 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2146 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2147 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2148 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2149 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2150 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2151 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2152 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2153 
2154 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2155 {
2156     int8_t res = a + b;
2157     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2158         res = a > 0 ? INT8_MAX : INT8_MIN;
2159         env->vxsat = 0x1;
2160     }
2161     return res;
2162 }
2163 
2164 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2165                              int16_t b)
2166 {
2167     int16_t res = a + b;
2168     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2169         res = a > 0 ? INT16_MAX : INT16_MIN;
2170         env->vxsat = 0x1;
2171     }
2172     return res;
2173 }
2174 
2175 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2176                              int32_t b)
2177 {
2178     int32_t res = a + b;
2179     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2180         res = a > 0 ? INT32_MAX : INT32_MIN;
2181         env->vxsat = 0x1;
2182     }
2183     return res;
2184 }
2185 
2186 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2187                              int64_t b)
2188 {
2189     int64_t res = a + b;
2190     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2191         res = a > 0 ? INT64_MAX : INT64_MIN;
2192         env->vxsat = 0x1;
2193     }
2194     return res;
2195 }
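
/*
 * Illustrative sketch (not part of the generated helpers): the idiom
 * (res ^ a) & (res ^ b) has its sign bit set exactly when a and b share a
 * sign that the wrapped sum res does not, i.e. on signed overflow.
 * Assumes assert() and a valid env (vxsat is written on saturation).
 */
#if 0   /* illustration only */
static void sadd8_example(CPURISCVState *env)
{
    assert(sadd8(env, 0, 100, 100) == INT8_MAX);    /* positive overflow */
    assert(sadd8(env, 0, -100, -100) == INT8_MIN);  /* negative overflow */
    assert(sadd8(env, 0, 100, -100) == 0);          /* no overflow       */
}
#endif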
2196 
2197 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2198 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2199 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2200 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2201 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2202 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2203 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2204 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2205 
2206 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2207 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2208 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2209 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2210 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2211 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2212 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2213 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2214 
2215 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2216                              uint8_t b)
2217 {
2218     uint8_t res = a - b;
2219     if (res > a) {
2220         res = 0;
2221         env->vxsat = 0x1;
2222     }
2223     return res;
2224 }
2225 
2226 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2227                                uint16_t b)
2228 {
2229     uint16_t res = a - b;
2230     if (res > a) {
2231         res = 0;
2232         env->vxsat = 0x1;
2233     }
2234     return res;
2235 }
2236 
2237 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2238                                uint32_t b)
2239 {
2240     uint32_t res = a - b;
2241     if (res > a) {
2242         res = 0;
2243         env->vxsat = 0x1;
2244     }
2245     return res;
2246 }
2247 
2248 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2249                                uint64_t b)
2250 {
2251     uint64_t res = a - b;
2252     if (res > a) {
2253         res = 0;
2254         env->vxsat = 0x1;
2255     }
2256     return res;
2257 }
2258 
2259 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2260 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2261 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2262 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2263 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2264 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2265 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2266 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2267 
2268 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2269 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2270 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2271 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2272 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2273 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2274 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2275 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2276 
2277 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2278 {
2279     int8_t res = a - b;
2280     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2281         res = a >= 0 ? INT8_MAX : INT8_MIN;
2282         env->vxsat = 0x1;
2283     }
2284     return res;
2285 }
2286 
2287 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2288                              int16_t b)
2289 {
2290     int16_t res = a - b;
2291     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2292         res = a >= 0 ? INT16_MAX : INT16_MIN;
2293         env->vxsat = 0x1;
2294     }
2295     return res;
2296 }
2297 
2298 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2299                              int32_t b)
2300 {
2301     int32_t res = a - b;
2302     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2303         res = a >= 0 ? INT32_MAX : INT32_MIN;
2304         env->vxsat = 0x1;
2305     }
2306     return res;
2307 }
2308 
2309 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2310                              int64_t b)
2311 {
2312     int64_t res = a - b;
2313     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2314         res = a >= 0 ? INT64_MAX : INT64_MIN;
2315         env->vxsat = 0x1;
2316     }
2317     return res;
2318 }
2319 
2320 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2321 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2322 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2323 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2324 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2325 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2326 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2327 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2328 
2329 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2330 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2331 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2332 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2333 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2334 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2335 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2336 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2337 
2338 /* Vector Single-Width Averaging Add and Subtract */
2339 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2340 {
2341     uint8_t d = extract64(v, shift, 1);
2342     uint8_t d1;
2343     uint64_t D1, D2;
2344 
2345     if (shift == 0 || shift > 64) {
2346         return 0;
2347     }
2348 
2349     d1 = extract64(v, shift - 1, 1);
2350     D1 = extract64(v, 0, shift);
2351     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2352         return d1;
2353     } else if (vxrm == 1) { /* round-to-nearest-even */
2354         if (shift > 1) {
2355             D2 = extract64(v, 0, shift - 1);
2356             return d1 & ((D2 != 0) | d);
2357         } else {
2358             return d1 & d;
2359         }
2360     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2361         return !d & (D1 != 0);
2362     }
2363     return 0; /* round-down (truncate) */
2364 }
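
/*
 * Illustrative sketch (not part of the generated helpers): the four vxrm
 * modes applied to v = 11 (0b1011) with shift = 2, i.e. 2.75 in fixed
 * point.  The truncated value is 2; get_round() supplies the carry-in.
 * Assumes assert() is available.
 */
#if 0   /* illustration only */
static void get_round_example(void)
{
    assert(get_round(0, 11, 2) == 1);   /* rnu: 2.75 rounds up to 3   */
    assert(get_round(1, 11, 2) == 1);   /* rne: not a tie, rounds up  */
    assert(get_round(2, 11, 2) == 0);   /* rdn: truncate down to 2    */
    assert(get_round(3, 11, 2) == 1);   /* rod: jam a 1 into the LSB  */
}
#endif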
2365 
2366 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2367                              int32_t b)
2368 {
2369     int64_t res = (int64_t)a + b;
2370     uint8_t round = get_round(vxrm, res, 1);
2371 
2372     return (res >> 1) + round;
2373 }
2374 
2375 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2376                              int64_t b)
2377 {
2378     int64_t res = a + b;
2379     uint8_t round = get_round(vxrm, res, 1);
2380     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2381 
2382     /* With signed overflow, bit 64 is inverse of bit 63. */
2383     return ((res >> 1) ^ over) + round;
2384 }
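
/*
 * Illustrative sketch (not part of the generated helpers): INT64_MAX +
 * INT64_MAX wraps to -2, but XOR-ing with 'over' restores the true bit 64
 * before halving, so the average needs no 128-bit arithmetic.  Assumes
 * assert() is available; env is unused by aadd64().
 */
#if 0   /* illustration only */
static void aadd64_example(void)
{
    assert(aadd64(NULL, 0, INT64_MAX, INT64_MAX) == INT64_MAX);
    assert(aadd64(NULL, 0, INT64_MIN, INT64_MIN) == INT64_MIN);
}
#endif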
2385 
2386 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2387 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2388 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2389 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2390 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2391 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2392 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2393 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2394 
2395 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2396 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2397 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2398 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2399 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2400 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2401 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2402 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2403 
2404 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2405                                uint32_t a, uint32_t b)
2406 {
2407     uint64_t res = (uint64_t)a + b;
2408     uint8_t round = get_round(vxrm, res, 1);
2409 
2410     return (res >> 1) + round;
2411 }
2412 
2413 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2414                                uint64_t a, uint64_t b)
2415 {
2416     uint64_t res = a + b;
2417     uint8_t round = get_round(vxrm, res, 1);
2418     uint64_t over = (uint64_t)(res < a) << 63;
2419 
2420     return ((res >> 1) | over) + round;
2421 }
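
/*
 * Illustrative sketch (not part of the generated helpers): for unsigned
 * inputs the lost carry is re-inserted as bit 63 of the halved result.
 * Assumes assert() is available; env is unused by aaddu64().
 */
#if 0   /* illustration only */
static void aaddu64_example(void)
{
    assert(aaddu64(NULL, 0, UINT64_MAX, UINT64_MAX) == UINT64_MAX);
    assert(aaddu64(NULL, 0, UINT64_MAX, 1) == (UINT64_MAX / 2) + 1);
}
#endif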
2422 
2423 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2424 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2425 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2426 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2427 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2428 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2429 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2430 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2431 
2432 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2433 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2434 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2435 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2436 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2437 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2438 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2439 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2440 
2441 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2442                              int32_t b)
2443 {
2444     int64_t res = (int64_t)a - b;
2445     uint8_t round = get_round(vxrm, res, 1);
2446 
2447     return (res >> 1) + round;
2448 }
2449 
2450 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2451                              int64_t b)
2452 {
2453     int64_t res = (int64_t)a - b;
2454     uint8_t round = get_round(vxrm, res, 1);
2455     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2456 
2457     /* With signed overflow, bit 64 is inverse of bit 63. */
2458     return ((res >> 1) ^ over) + round;
2459 }
2460 
2461 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2462 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2463 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2464 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2465 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2466 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2467 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2468 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2469 
2470 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2471 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2472 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2473 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2474 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2475 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2476 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2477 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2478 
2479 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2480                                uint32_t a, uint32_t b)
2481 {
2482     int64_t res = (int64_t)a - b;
2483     uint8_t round = get_round(vxrm, res, 1);
2484 
2485     return (res >> 1) + round;
2486 }
2487 
2488 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2489                                uint64_t a, uint64_t b)
2490 {
2491     uint64_t res = (uint64_t)a - b;
2492     uint8_t round = get_round(vxrm, res, 1);
2493     uint64_t over = (uint64_t)(res > a) << 63;
2494 
2495     return ((res >> 1) | over) + round;
2496 }
2497 
2498 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2499 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2500 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2501 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2502 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2503 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2504 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2505 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2506 
2507 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2508 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2509 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2510 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2511 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2512 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2513 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2514 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2515 
2516 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2517 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2518 {
2519     uint8_t round;
2520     int16_t res;
2521 
2522     res = (int16_t)a * (int16_t)b;
2523     round = get_round(vxrm, res, 7);
2524     res = (res >> 7) + round;
2525 
2526     if (res > INT8_MAX) {
2527         env->vxsat = 0x1;
2528         return INT8_MAX;
2529     } else if (res < INT8_MIN) {
2530         env->vxsat = 0x1;
2531         return INT8_MIN;
2532     } else {
2533         return res;
2534     }
2535 }
2536 
2537 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2538 {
2539     uint8_t round;
2540     int32_t res;
2541 
2542     res = (int32_t)a * (int32_t)b;
2543     round = get_round(vxrm, res, 15);
2544     res = (res >> 15) + round;
2545 
2546     if (res > INT16_MAX) {
2547         env->vxsat = 0x1;
2548         return INT16_MAX;
2549     } else if (res < INT16_MIN) {
2550         env->vxsat = 0x1;
2551         return INT16_MIN;
2552     } else {
2553         return res;
2554     }
2555 }
2556 
2557 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2558 {
2559     uint8_t round;
2560     int64_t res;
2561 
2562     res = (int64_t)a * (int64_t)b;
2563     round = get_round(vxrm, res, 31);
2564     res = (res >> 31) + round;
2565 
2566     if (res > INT32_MAX) {
2567         env->vxsat = 0x1;
2568         return INT32_MAX;
2569     } else if (res < INT32_MIN) {
2570         env->vxsat = 0x1;
2571         return INT32_MIN;
2572     } else {
2573         return res;
2574     }
2575 }
2576 
2577 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2578 {
2579     uint8_t round;
2580     uint64_t hi_64, lo_64;
2581     int64_t res;
2582 
2583     if (a == INT64_MIN && b == INT64_MIN) {
2584         env->vxsat = 1;
2585         return INT64_MAX;
2586     }
2587 
2588     muls64(&lo_64, &hi_64, a, b);
2589     round = get_round(vxrm, lo_64, 63);
2590     /*
2591      * Cannot overflow, as there are always
2592      * 2 sign bits after multiply.
2593      */
2594     res = (hi_64 << 1) | (lo_64 >> 63);
2595     if (round) {
2596         if (res == INT64_MAX) {
2597             env->vxsat = 1;
2598         } else {
2599             res += 1;
2600         }
2601     }
2602     return res;
2603 }
2604 
2605 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2606 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2607 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2608 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2609 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2610 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2611 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2612 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2613 
2614 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2615 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2616 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2617 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2618 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2619 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2620 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2621 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2622 
2623 /* Vector Single-Width Scaling Shift Instructions */
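/*
 * vssrl/vssra shift right by the low log2(SEW) bits of the second operand
 * and add the rounding increment that get_round() derives from the
 * discarded bits and vxrm.  E.g. with vxrm=0 (round-to-nearest-up),
 * shifting a=5 right by 1 yields (5 >> 1) + 1 = 3.
 */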
2624 static inline uint8_t
2625 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2626 {
2627     uint8_t round, shift = b & 0x7;
2628     uint8_t res;
2629 
2630     round = get_round(vxrm, a, shift);
2631     res = (a >> shift) + round;
2632     return res;
2633 }
2634 static inline uint16_t
2635 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2636 {
2637     uint8_t round, shift = b & 0xf;
2638 
2639     round = get_round(vxrm, a, shift);
2640     return (a >> shift) + round;
2641 }
2642 static inline uint32_t
2643 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2644 {
2645     uint8_t round, shift = b & 0x1f;
2646 
2647     round = get_round(vxrm, a, shift);
2648     return (a >> shift) + round;
2649 }
2650 static inline uint64_t
2651 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2652 {
2653     uint8_t round, shift = b & 0x3f;
2654 
2655     round = get_round(vxrm, a, shift);
2656     return (a >> shift) + round;
2657 }
2658 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2659 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2660 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2661 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2662 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2663 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2664 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2665 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2666 
2667 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2668 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2669 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2670 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2671 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2672 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2673 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2674 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2675 
2676 static inline int8_t
2677 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2678 {
2679     uint8_t round, shift = b & 0x7;
2680 
2681     round = get_round(vxrm, a, shift);
2682     return (a >> shift) + round;
2683 }
2684 static inline int16_t
2685 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2686 {
2687     uint8_t round, shift = b & 0xf;
2688 
2689     round = get_round(vxrm, a, shift);
2690     return (a >> shift) + round;
2691 }
2692 static inline int32_t
2693 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2694 {
2695     uint8_t round, shift = b & 0x1f;
2696 
2697     round = get_round(vxrm, a, shift);
2698     return (a >> shift) + round;
2699 }
2700 static inline int64_t
2701 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2702 {
2703     uint8_t round, shift = b & 0x3f;
2704 
2705     round = get_round(vxrm, a, shift);
2706     return (a >> shift) + round;
2707 }
2708 
2709 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2710 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2711 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2712 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2713 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2714 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2715 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2716 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2717 
2718 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2719 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2720 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2721 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2722 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2723 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2724 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2725 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2726 
2727 /* Vector Narrowing Fixed-Point Clip Instructions */
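/*
 * vnclip/vnclipu shift a 2*SEW source right by the low log2(2*SEW) bits
 * of the second operand, round per vxrm, and saturate the result into the
 * SEW-bit destination range, setting vxsat when saturation occurs.
 */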
2728 static inline int8_t
2729 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2730 {
2731     uint8_t round, shift = b & 0xf;
2732     int16_t res;
2733 
2734     round = get_round(vxrm, a, shift);
2735     res = (a >> shift) + round;
2736     if (res > INT8_MAX) {
2737         env->vxsat = 0x1;
2738         return INT8_MAX;
2739     } else if (res < INT8_MIN) {
2740         env->vxsat = 0x1;
2741         return INT8_MIN;
2742     } else {
2743         return res;
2744     }
2745 }
2746 
2747 static inline int16_t
2748 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2749 {
2750     uint8_t round, shift = b & 0x1f;
2751     int32_t res;
2752 
2753     round = get_round(vxrm, a, shift);
2754     res = (a >> shift) + round;
2755     if (res > INT16_MAX) {
2756         env->vxsat = 0x1;
2757         return INT16_MAX;
2758     } else if (res < INT16_MIN) {
2759         env->vxsat = 0x1;
2760         return INT16_MIN;
2761     } else {
2762         return res;
2763     }
2764 }
2765 
2766 static inline int32_t
2767 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2768 {
2769     uint8_t round, shift = b & 0x3f;
2770     int64_t res;
2771 
2772     round = get_round(vxrm, a, shift);
2773     res = (a >> shift) + round;
2774     if (res > INT32_MAX) {
2775         env->vxsat = 0x1;
2776         return INT32_MAX;
2777     } else if (res < INT32_MIN) {
2778         env->vxsat = 0x1;
2779         return INT32_MIN;
2780     } else {
2781         return res;
2782     }
2783 }
2784 
2785 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2786 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2787 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2788 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2789 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2790 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2791 
2792 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2793 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2794 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2795 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2796 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2797 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2798 
2799 static inline uint8_t
2800 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2801 {
2802     uint8_t round, shift = b & 0xf;
2803     uint16_t res;
2804 
2805     round = get_round(vxrm, a, shift);
2806     res = (a >> shift) + round;
2807     if (res > UINT8_MAX) {
2808         env->vxsat = 0x1;
2809         return UINT8_MAX;
2810     } else {
2811         return res;
2812     }
2813 }
2814 
2815 static inline uint16_t
2816 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2817 {
2818     uint8_t round, shift = b & 0x1f;
2819     uint32_t res;
2820 
2821     round = get_round(vxrm, a, shift);
2822     res = (a >> shift) + round;
2823     if (res > UINT16_MAX) {
2824         env->vxsat = 0x1;
2825         return UINT16_MAX;
2826     } else {
2827         return res;
2828     }
2829 }
2830 
2831 static inline uint32_t
2832 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2833 {
2834     uint8_t round, shift = b & 0x3f;
2835     uint64_t res;
2836 
2837     round = get_round(vxrm, a, shift);
2838     res = (a >> shift) + round;
2839     if (res > UINT32_MAX) {
2840         env->vxsat = 0x1;
2841         return UINT32_MAX;
2842     } else {
2843         return res;
2844     }
2845 }
2846 
2847 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2848 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2849 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2850 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2851 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2852 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2853 
2854 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2855 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2856 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2857 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2858 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2859 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2860 
2861 /*
2862  * Vector Floating-Point Arithmetic Instructions
2863  */
2864 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
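/*
 * OPFVV2 expands to a per-element worker that applies OP to the vs2 and
 * vs1 elements (in that order, so vfsub computes vs2 - vs1) with the
 * guest fp_status; GEN_VEXT_VV_ENV wraps it in the usual vector loop,
 * filling masked-off elements with 1s when the mask policy is agnostic
 * and the tail past vl with 1s per vta.  The OPFVF2/GEN_VEXT_VF pair
 * below does the same with a scalar rs1 second operand, which is why the
 * reverse forms (vfrsub, vfrdiv) need dedicated *_rsub/*_rdiv wrappers.
 */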
2865 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2866 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2867                       CPURISCVState *env)                      \
2868 {                                                              \
2869     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2870     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2871     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2872 }
2873 
2874 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
2875 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2876                   void *vs2, CPURISCVState *env,          \
2877                   uint32_t desc)                          \
2878 {                                                         \
2879     uint32_t vm = vext_vm(desc);                          \
2880     uint32_t vl = env->vl;                                \
2881     uint32_t total_elems =                                \
2882         vext_get_total_elems(env, desc, ESZ);             \
2883     uint32_t vta = vext_vta(desc);                        \
2884     uint32_t vma = vext_vma(desc);                        \
2885     uint32_t i;                                           \
2886                                                           \
2887     VSTART_CHECK_EARLY_EXIT(env);                         \
2888                                                           \
2889     for (i = env->vstart; i < vl; i++) {                  \
2890         if (!vm && !vext_elem_mask(v0, i)) {              \
2891             /* set masked-off elements to 1s */           \
2892             vext_set_elems_1s(vd, vma, i * ESZ,           \
2893                               (i + 1) * ESZ);             \
2894             continue;                                     \
2895         }                                                 \
2896         do_##NAME(vd, vs1, vs2, i, env);                  \
2897     }                                                     \
2898     env->vstart = 0;                                      \
2899     /* set tail elements to 1s */                         \
2900     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2901                       total_elems * ESZ);                 \
2902 }
2903 
2904 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2905 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2906 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2907 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
2908 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
2909 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
2910 
2911 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2912 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2913                       CPURISCVState *env)                      \
2914 {                                                              \
2915     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2916     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2917 }
2918 
2919 #define GEN_VEXT_VF(NAME, ESZ)                            \
2920 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2921                   void *vs2, CPURISCVState *env,          \
2922                   uint32_t desc)                          \
2923 {                                                         \
2924     uint32_t vm = vext_vm(desc);                          \
2925     uint32_t vl = env->vl;                                \
2926     uint32_t total_elems =                                \
2927         vext_get_total_elems(env, desc, ESZ);             \
2928     uint32_t vta = vext_vta(desc);                        \
2929     uint32_t vma = vext_vma(desc);                        \
2930     uint32_t i;                                           \
2931                                                           \
2932     VSTART_CHECK_EARLY_EXIT(env);                         \
2933                                                           \
2934     for (i = env->vstart; i < vl; i++) {                  \
2935         if (!vm && !vext_elem_mask(v0, i)) {              \
2936             /* set masked-off elements to 1s */           \
2937             vext_set_elems_1s(vd, vma, i * ESZ,           \
2938                               (i + 1) * ESZ);             \
2939             continue;                                     \
2940         }                                                 \
2941         do_##NAME(vd, s1, vs2, i, env);                   \
2942     }                                                     \
2943     env->vstart = 0;                                      \
2944     /* set tail elements to 1s */                         \
2945     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2946                       total_elems * ESZ);                 \
2947 }
2948 
2949 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
2950 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
2951 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
2952 GEN_VEXT_VF(vfadd_vf_h, 2)
2953 GEN_VEXT_VF(vfadd_vf_w, 4)
2954 GEN_VEXT_VF(vfadd_vf_d, 8)
2955 
2956 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
2957 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
2958 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
2959 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
2960 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
2961 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
2962 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
2963 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
2964 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
2965 GEN_VEXT_VF(vfsub_vf_h, 2)
2966 GEN_VEXT_VF(vfsub_vf_w, 4)
2967 GEN_VEXT_VF(vfsub_vf_d, 8)
2968 
2969 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
2970 {
2971     return float16_sub(b, a, s);
2972 }
2973 
2974 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
2975 {
2976     return float32_sub(b, a, s);
2977 }
2978 
2979 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
2980 {
2981     return float64_sub(b, a, s);
2982 }
2983 
2984 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
2985 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
2986 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
2987 GEN_VEXT_VF(vfrsub_vf_h, 2)
2988 GEN_VEXT_VF(vfrsub_vf_w, 4)
2989 GEN_VEXT_VF(vfrsub_vf_d, 8)
2990 
2991 /* Vector Widening Floating-Point Add/Subtract Instructions */
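/*
 * The .vv/.vf widening forms convert both SEW operands up to 2*SEW before
 * operating (float16 -> float32, float32 -> float64); the .wv/.wf forms
 * take an already-widened first operand and only convert the second one.
 */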
2992 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
2993 {
2994     return float32_add(float16_to_float32(a, true, s),
2995                        float16_to_float32(b, true, s), s);
2996 }
2997 
2998 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
2999 {
3000     return float64_add(float32_to_float64(a, s),
3001                        float32_to_float64(b, s), s);
3003 }
3004 
3005 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3006 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3007 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3008 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3009 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3010 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3011 GEN_VEXT_VF(vfwadd_vf_h, 4)
3012 GEN_VEXT_VF(vfwadd_vf_w, 8)
3013 
3014 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3015 {
3016     return float32_sub(float16_to_float32(a, true, s),
3017                        float16_to_float32(b, true, s), s);
3018 }
3019 
3020 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3021 {
3022     return float64_sub(float32_to_float64(a, s),
3023                        float32_to_float64(b, s), s);
3025 }
3026 
3027 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3028 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3029 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3030 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3031 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3032 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3033 GEN_VEXT_VF(vfwsub_vf_h, 4)
3034 GEN_VEXT_VF(vfwsub_vf_w, 8)
3035 
3036 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3037 {
3038     return float32_add(a, float16_to_float32(b, true, s), s);
3039 }
3040 
3041 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3042 {
3043     return float64_add(a, float32_to_float64(b, s), s);
3044 }
3045 
3046 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3047 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3048 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3049 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3050 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3051 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3052 GEN_VEXT_VF(vfwadd_wf_h, 4)
3053 GEN_VEXT_VF(vfwadd_wf_w, 8)
3054 
3055 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3056 {
3057     return float32_sub(a, float16_to_float32(b, true, s), s);
3058 }
3059 
3060 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3061 {
3062     return float64_sub(a, float32_to_float64(b, s), s);
3063 }
3064 
3065 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3066 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3067 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3068 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3069 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3070 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3071 GEN_VEXT_VF(vfwsub_wf_h, 4)
3072 GEN_VEXT_VF(vfwsub_wf_w, 8)
3073 
3074 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3075 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3076 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3077 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3078 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3079 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3080 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3081 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3082 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3083 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3084 GEN_VEXT_VF(vfmul_vf_h, 2)
3085 GEN_VEXT_VF(vfmul_vf_w, 4)
3086 GEN_VEXT_VF(vfmul_vf_d, 8)
3087 
3088 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3089 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3090 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3091 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3092 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3093 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3094 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3095 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3096 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3097 GEN_VEXT_VF(vfdiv_vf_h, 2)
3098 GEN_VEXT_VF(vfdiv_vf_w, 4)
3099 GEN_VEXT_VF(vfdiv_vf_d, 8)
3100 
3101 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3102 {
3103     return float16_div(b, a, s);
3104 }
3105 
3106 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3107 {
3108     return float32_div(b, a, s);
3109 }
3110 
3111 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3112 {
3113     return float64_div(b, a, s);
3114 }
3115 
3116 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3117 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3118 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3119 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3120 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3121 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3122 
3123 /* Vector Widening Floating-Point Multiply */
3124 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3125 {
3126     return float32_mul(float16_to_float32(a, true, s),
3127                        float16_to_float32(b, true, s), s);
3128 }
3129 
3130 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3131 {
3132     return float64_mul(float32_to_float64(a, s),
3133                        float32_to_float64(b, s), s);
3135 }
3136 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3137 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3138 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3139 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3140 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3141 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3142 GEN_VEXT_VF(vfwmul_vf_h, 4)
3143 GEN_VEXT_VF(vfwmul_vf_w, 8)
3144 
3145 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
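/*
 * Operand ordering for the FMA flavours, given the OPFVV3/OPFVF3 call
 * OP(s2, s1, d), with rs1 replacing vs1 for the .vf forms:
 *   vfmacc: vd =  (vs1 * vs2) + vd    vfnmacc: vd = -(vs1 * vs2) - vd
 *   vfmsac: vd =  (vs1 * vs2) - vd    vfnmsac: vd = -(vs1 * vs2) + vd
 *   vfmadd: vd =  (vs1 * vd) + vs2    vfnmadd: vd = -(vs1 * vd) - vs2
 *   vfmsub: vd =  (vs1 * vd) - vs2    vfnmsub: vd = -(vs1 * vd) + vs2
 * Each maps onto a single softfloat muladd with the appropriate
 * float_muladd_negate_* flags, so only one rounding step is performed.
 */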
3146 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3147 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3148                       CPURISCVState *env)                          \
3149 {                                                                  \
3150     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3151     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3152     TD d = *((TD *)vd + HD(i));                                    \
3153     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3154 }
3155 
3156 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3157 {
3158     return float16_muladd(a, b, d, 0, s);
3159 }
3160 
3161 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3162 {
3163     return float32_muladd(a, b, d, 0, s);
3164 }
3165 
3166 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3167 {
3168     return float64_muladd(a, b, d, 0, s);
3169 }
3170 
3171 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3172 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3173 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3174 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3175 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3176 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3177 
3178 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3179 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3180                       CPURISCVState *env)                         \
3181 {                                                                 \
3182     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3183     TD d = *((TD *)vd + HD(i));                                   \
3184     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3185 }
3186 
3187 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3188 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3189 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3190 GEN_VEXT_VF(vfmacc_vf_h, 2)
3191 GEN_VEXT_VF(vfmacc_vf_w, 4)
3192 GEN_VEXT_VF(vfmacc_vf_d, 8)
3193 
3194 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3195 {
3196     return float16_muladd(a, b, d, float_muladd_negate_c |
3197                                    float_muladd_negate_product, s);
3198 }
3199 
3200 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3201 {
3202     return float32_muladd(a, b, d, float_muladd_negate_c |
3203                                    float_muladd_negate_product, s);
3204 }
3205 
3206 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3207 {
3208     return float64_muladd(a, b, d, float_muladd_negate_c |
3209                                    float_muladd_negate_product, s);
3210 }
3211 
3212 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3213 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3214 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3215 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3216 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3217 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3218 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3219 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3220 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3221 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3222 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3223 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3224 
3225 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3226 {
3227     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3228 }
3229 
3230 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3231 {
3232     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3233 }
3234 
3235 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3236 {
3237     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3238 }
3239 
3240 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3241 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3242 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3243 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3244 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3245 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3246 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3247 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3248 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3249 GEN_VEXT_VF(vfmsac_vf_h, 2)
3250 GEN_VEXT_VF(vfmsac_vf_w, 4)
3251 GEN_VEXT_VF(vfmsac_vf_d, 8)
3252 
3253 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3254 {
3255     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3256 }
3257 
3258 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3259 {
3260     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3261 }
3262 
3263 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3264 {
3265     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3266 }
3267 
3268 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3269 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3270 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3271 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3272 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3273 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3274 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3275 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3276 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3277 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3278 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3279 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3280 
3281 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3282 {
3283     return float16_muladd(d, b, a, 0, s);
3284 }
3285 
3286 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3287 {
3288     return float32_muladd(d, b, a, 0, s);
3289 }
3290 
3291 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3292 {
3293     return float64_muladd(d, b, a, 0, s);
3294 }
3295 
3296 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3297 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3298 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3299 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3300 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3301 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3302 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3303 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3304 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3305 GEN_VEXT_VF(vfmadd_vf_h, 2)
3306 GEN_VEXT_VF(vfmadd_vf_w, 4)
3307 GEN_VEXT_VF(vfmadd_vf_d, 8)
3308 
3309 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3310 {
3311     return float16_muladd(d, b, a, float_muladd_negate_c |
3312                                    float_muladd_negate_product, s);
3313 }
3314 
3315 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3316 {
3317     return float32_muladd(d, b, a, float_muladd_negate_c |
3318                                    float_muladd_negate_product, s);
3319 }
3320 
3321 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3322 {
3323     return float64_muladd(d, b, a, float_muladd_negate_c |
3324                                    float_muladd_negate_product, s);
3325 }
3326 
3327 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3328 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3329 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3330 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3331 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3332 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3333 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3334 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3335 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3336 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3337 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3338 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3339 
3340 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3341 {
3342     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3343 }
3344 
3345 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3346 {
3347     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3348 }
3349 
3350 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3351 {
3352     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3353 }
3354 
3355 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3356 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3357 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3358 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3359 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3360 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3361 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3362 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3363 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3364 GEN_VEXT_VF(vfmsub_vf_h, 2)
3365 GEN_VEXT_VF(vfmsub_vf_w, 4)
3366 GEN_VEXT_VF(vfmsub_vf_d, 8)
3367 
3368 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3369 {
3370     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3371 }
3372 
3373 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3374 {
3375     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3376 }
3377 
3378 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3379 {
3380     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3381 }
3382 
3383 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3384 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3385 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3386 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3387 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3388 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3389 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3390 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3391 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3392 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3393 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3394 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3395 
3396 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
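/*
 * The widening FMA helpers convert both SEW multiplicands up to 2*SEW and
 * accumulate into the 2*SEW destination element with a single muladd;
 * vfwmaccbf16 does the same starting from bfloat16 sources.
 */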
3397 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3398 {
3399     return float32_muladd(float16_to_float32(a, true, s),
3400                           float16_to_float32(b, true, s), d, 0, s);
3401 }
3402 
3403 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3404 {
3405     return float64_muladd(float32_to_float64(a, s),
3406                           float32_to_float64(b, s), d, 0, s);
3407 }
3408 
3409 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3410 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3411 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3412 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3413 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3414 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3415 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3416 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3417 
3418 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3419 {
3420     return float32_muladd(bfloat16_to_float32(a, s),
3421                           bfloat16_to_float32(b, s), d, 0, s);
3422 }
3423 
3424 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3425 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3426 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3427 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3428 
3429 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3430 {
3431     return float32_muladd(float16_to_float32(a, true, s),
3432                           float16_to_float32(b, true, s), d,
3433                           float_muladd_negate_c | float_muladd_negate_product,
3434                           s);
3435 }
3436 
3437 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3438 {
3439     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3440                           d, float_muladd_negate_c |
3441                              float_muladd_negate_product, s);
3442 }
3443 
3444 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3445 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3446 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3447 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3448 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3449 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3450 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3451 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3452 
3453 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3454 {
3455     return float32_muladd(float16_to_float32(a, true, s),
3456                           float16_to_float32(b, true, s), d,
3457                           float_muladd_negate_c, s);
3458 }
3459 
3460 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3461 {
3462     return float64_muladd(float32_to_float64(a, s),
3463                           float32_to_float64(b, s), d,
3464                           float_muladd_negate_c, s);
3465 }
3466 
3467 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3468 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3469 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3470 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3471 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3472 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3473 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3474 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3475 
3476 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3477 {
3478     return float32_muladd(float16_to_float32(a, true, s),
3479                           float16_to_float32(b, true, s), d,
3480                           float_muladd_negate_product, s);
3481 }
3482 
3483 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3484 {
3485     return float64_muladd(float32_to_float64(a, s),
3486                           float32_to_float64(b, s), d,
3487                           float_muladd_negate_product, s);
3488 }
3489 
3490 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3491 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3492 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3493 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3494 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3495 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3496 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3497 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3498 
3499 /* Vector Floating-Point Square-Root Instruction */
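/*
 * OPFVV1/GEN_VEXT_V_ENV are the single-source counterparts of OPFVV2/
 * GEN_VEXT_VV_ENV above: one vs2 element in, one vd element out, with the
 * same mask-agnostic and tail-agnostic fill behaviour.
 */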
3500 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3501 static void do_##NAME(void *vd, void *vs2, int i,      \
3502                       CPURISCVState *env)              \
3503 {                                                      \
3504     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3505     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3506 }
3507 
3508 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3509 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3510                   CPURISCVState *env, uint32_t desc)   \
3511 {                                                      \
3512     uint32_t vm = vext_vm(desc);                       \
3513     uint32_t vl = env->vl;                             \
3514     uint32_t total_elems =                             \
3515         vext_get_total_elems(env, desc, ESZ);          \
3516     uint32_t vta = vext_vta(desc);                     \
3517     uint32_t vma = vext_vma(desc);                     \
3518     uint32_t i;                                        \
3519                                                        \
3520     VSTART_CHECK_EARLY_EXIT(env);                      \
3521                                                        \
3522     if (vl == 0) {                                     \
3523         return;                                        \
3524     }                                                  \
3525     for (i = env->vstart; i < vl; i++) {               \
3526         if (!vm && !vext_elem_mask(v0, i)) {           \
3527             /* set masked-off elements to 1s */        \
3528             vext_set_elems_1s(vd, vma, i * ESZ,        \
3529                               (i + 1) * ESZ);          \
3530             continue;                                  \
3531         }                                              \
3532         do_##NAME(vd, vs2, i, env);                    \
3533     }                                                  \
3534     env->vstart = 0;                                   \
3535     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3536                       total_elems * ESZ);              \
3537 }
3538 
3539 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3540 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3541 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3542 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3543 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3544 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3545 
3546 /*
3547  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3548  *
3549  * Adapted from riscv-v-spec recip.c:
3550  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3551  */
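/*
 * frsqrt7() computes a 7-bit-accurate estimate of 1/sqrt(x) directly on
 * the IEEE encoding: subnormals are normalized first, the table is
 * indexed by the exponent's least-significant bit and the top six
 * fraction bits, and the result exponent is (3 * B - 1 - exp) / 2, where
 * B is the exponent bias (the ~exp below is the two's-complement
 * -exp - 1 folded into that expression).
 */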
3552 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3553 {
3554     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3555     uint64_t exp = extract64(f, frac_size, exp_size);
3556     uint64_t frac = extract64(f, 0, frac_size);
3557 
3558     const uint8_t lookup_table[] = {
3559         52, 51, 50, 48, 47, 46, 44, 43,
3560         42, 41, 40, 39, 38, 36, 35, 34,
3561         33, 32, 31, 30, 30, 29, 28, 27,
3562         26, 25, 24, 23, 23, 22, 21, 20,
3563         19, 19, 18, 17, 16, 16, 15, 14,
3564         14, 13, 12, 12, 11, 10, 10, 9,
3565         9, 8, 7, 7, 6, 6, 5, 4,
3566         4, 3, 3, 2, 2, 1, 1, 0,
3567         127, 125, 123, 121, 119, 118, 116, 114,
3568         113, 111, 109, 108, 106, 105, 103, 102,
3569         100, 99, 97, 96, 95, 93, 92, 91,
3570         90, 88, 87, 86, 85, 84, 83, 82,
3571         80, 79, 78, 77, 76, 75, 74, 73,
3572         72, 71, 70, 70, 69, 68, 67, 66,
3573         65, 64, 63, 63, 62, 61, 60, 59,
3574         59, 58, 57, 56, 56, 55, 54, 53
3575     };
3576     const int precision = 7;
3577 
3578     if (exp == 0 && frac != 0) { /* subnormal */
3579         /* Normalize the subnormal. */
3580         while (extract64(frac, frac_size - 1, 1) == 0) {
3581             exp--;
3582             frac <<= 1;
3583         }
3584 
3585         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3586     }
3587 
3588     int idx = ((exp & 1) << (precision - 1)) |
3589               (frac >> (frac_size - precision + 1));
3590     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3591                         (frac_size - precision);
3592     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3593 
3594     uint64_t val = 0;
3595     val = deposit64(val, 0, frac_size, out_frac);
3596     val = deposit64(val, frac_size, exp_size, out_exp);
3597     val = deposit64(val, frac_size + exp_size, 1, sign);
3598     return val;
3599 }
3600 
3601 static float16 frsqrt7_h(float16 f, float_status *s)
3602 {
3603     int exp_size = 5, frac_size = 10;
3604     bool sign = float16_is_neg(f);
3605 
3606     /*
3607      * frsqrt7(sNaN) = canonical NaN
3608      * frsqrt7(-inf) = canonical NaN
3609      * frsqrt7(-normal) = canonical NaN
3610      * frsqrt7(-subnormal) = canonical NaN
3611      */
3612     if (float16_is_signaling_nan(f, s) ||
3613         (float16_is_infinity(f) && sign) ||
3614         (float16_is_normal(f) && sign) ||
3615         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3616         s->float_exception_flags |= float_flag_invalid;
3617         return float16_default_nan(s);
3618     }
3619 
3620     /* frsqrt7(qNaN) = canonical NaN */
3621     if (float16_is_quiet_nan(f, s)) {
3622         return float16_default_nan(s);
3623     }
3624 
3625     /* frsqrt7(+-0) = +-inf */
3626     if (float16_is_zero(f)) {
3627         s->float_exception_flags |= float_flag_divbyzero;
3628         return float16_set_sign(float16_infinity, sign);
3629     }
3630 
3631     /* frsqrt7(+inf) = +0 */
3632     if (float16_is_infinity(f) && !sign) {
3633         return float16_set_sign(float16_zero, sign);
3634     }
3635 
3636     /* +normal, +subnormal */
3637     uint64_t val = frsqrt7(f, exp_size, frac_size);
3638     return make_float16(val);
3639 }
3640 
3641 static float32 frsqrt7_s(float32 f, float_status *s)
3642 {
3643     int exp_size = 8, frac_size = 23;
3644     bool sign = float32_is_neg(f);
3645 
3646     /*
3647      * frsqrt7(sNaN) = canonical NaN
3648      * frsqrt7(-inf) = canonical NaN
3649      * frsqrt7(-normal) = canonical NaN
3650      * frsqrt7(-subnormal) = canonical NaN
3651      */
3652     if (float32_is_signaling_nan(f, s) ||
3653         (float32_is_infinity(f) && sign) ||
3654         (float32_is_normal(f) && sign) ||
3655         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3656         s->float_exception_flags |= float_flag_invalid;
3657         return float32_default_nan(s);
3658     }
3659 
3660     /* frsqrt7(qNaN) = canonical NaN */
3661     if (float32_is_quiet_nan(f, s)) {
3662         return float32_default_nan(s);
3663     }
3664 
3665     /* frsqrt7(+-0) = +-inf */
3666     if (float32_is_zero(f)) {
3667         s->float_exception_flags |= float_flag_divbyzero;
3668         return float32_set_sign(float32_infinity, sign);
3669     }
3670 
3671     /* frsqrt7(+inf) = +0 */
3672     if (float32_is_infinity(f) && !sign) {
3673         return float32_set_sign(float32_zero, sign);
3674     }
3675 
3676     /* +normal, +subnormal */
3677     uint64_t val = frsqrt7(f, exp_size, frac_size);
3678     return make_float32(val);
3679 }
3680 
3681 static float64 frsqrt7_d(float64 f, float_status *s)
3682 {
3683     int exp_size = 11, frac_size = 52;
3684     bool sign = float64_is_neg(f);
3685 
3686     /*
3687      * frsqrt7(sNaN) = canonical NaN
3688      * frsqrt7(-inf) = canonical NaN
3689      * frsqrt7(-normal) = canonical NaN
3690      * frsqrt7(-subnormal) = canonical NaN
3691      */
3692     if (float64_is_signaling_nan(f, s) ||
3693         (float64_is_infinity(f) && sign) ||
3694         (float64_is_normal(f) && sign) ||
3695         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3696         s->float_exception_flags |= float_flag_invalid;
3697         return float64_default_nan(s);
3698     }
3699 
3700     /* frsqrt7(qNaN) = canonical NaN */
3701     if (float64_is_quiet_nan(f, s)) {
3702         return float64_default_nan(s);
3703     }
3704 
3705     /* frsqrt7(+-0) = +-inf */
3706     if (float64_is_zero(f)) {
3707         s->float_exception_flags |= float_flag_divbyzero;
3708         return float64_set_sign(float64_infinity, sign);
3709     }
3710 
3711     /* frsqrt7(+inf) = +0 */
3712     if (float64_is_infinity(f) && !sign) {
3713         return float64_set_sign(float64_zero, sign);
3714     }
3715 
3716     /* +normal, +subnormal */
3717     uint64_t val = frsqrt7(f, exp_size, frac_size);
3718     return make_float64(val);
3719 }
3720 
3721 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3722 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3723 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3724 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3725 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3726 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3727 
3728 /*
3729  * Vector Floating-Point Reciprocal Estimate Instruction
3730  *
3731  * Adapted from riscv-v-spec recip.c:
3732  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3733  */
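/*
 * frec7() is the corresponding 7-bit reciprocal estimate: the table is
 * indexed by the top seven fraction bits and the result exponent is
 * 2 * B - 1 - exp (B is the exponent bias).  Subnormal inputs small
 * enough that 1/x exceeds the finite range overflow to +-inf or to the
 * largest finite value of the same sign, depending on the rounding mode;
 * results whose biased exponent comes out as 0 or -1 are re-encoded as
 * subnormals.
 */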
3734 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3735                       float_status *s)
3736 {
3737     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3738     uint64_t exp = extract64(f, frac_size, exp_size);
3739     uint64_t frac = extract64(f, 0, frac_size);
3740 
3741     const uint8_t lookup_table[] = {
3742         127, 125, 123, 121, 119, 117, 116, 114,
3743         112, 110, 109, 107, 105, 104, 102, 100,
3744         99, 97, 96, 94, 93, 91, 90, 88,
3745         87, 85, 84, 83, 81, 80, 79, 77,
3746         76, 75, 74, 72, 71, 70, 69, 68,
3747         66, 65, 64, 63, 62, 61, 60, 59,
3748         58, 57, 56, 55, 54, 53, 52, 51,
3749         50, 49, 48, 47, 46, 45, 44, 43,
3750         42, 41, 40, 40, 39, 38, 37, 36,
3751         35, 35, 34, 33, 32, 31, 31, 30,
3752         29, 28, 28, 27, 26, 25, 25, 24,
3753         23, 23, 22, 21, 21, 20, 19, 19,
3754         18, 17, 17, 16, 15, 15, 14, 14,
3755         13, 12, 12, 11, 11, 10, 9, 9,
3756         8, 8, 7, 7, 6, 5, 5, 4,
3757         4, 3, 3, 2, 2, 1, 1, 0
3758     };
3759     const int precision = 7;
3760 
3761     if (exp == 0 && frac != 0) { /* subnormal */
3762         /* Normalize the subnormal. */
3763         while (extract64(frac, frac_size - 1, 1) == 0) {
3764             exp--;
3765             frac <<= 1;
3766         }
3767 
3768         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3769 
3770         if (exp != 0 && exp != UINT64_MAX) {
3771             /*
3772              * Overflow to inf or max value of same sign,
3773              * depending on sign and rounding mode.
3774              */
3775             s->float_exception_flags |= (float_flag_inexact |
3776                                          float_flag_overflow);
3777 
3778             if ((s->float_rounding_mode == float_round_to_zero) ||
3779                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3780                 ((s->float_rounding_mode == float_round_up) && sign)) {
3781                 /* Return the largest finite value of the same sign. */
3782                 return (sign << (exp_size + frac_size)) |
3783                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3784             } else {
3785                 /* Return +-inf. */
3786                 return (sign << (exp_size + frac_size)) |
3787                        MAKE_64BIT_MASK(frac_size, exp_size);
3788             }
3789         }
3790     }
3791 
3792     int idx = frac >> (frac_size - precision);
3793     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3794                         (frac_size - precision);
3795     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3796 
3797     if (out_exp == 0 || out_exp == UINT64_MAX) {
3798         /*
3799          * The result is subnormal, but don't raise the underflow exception,
3800          * because there's no additional loss of precision.
3801          */
3802         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3803         if (out_exp == UINT64_MAX) {
3804             out_frac >>= 1;
3805             out_exp = 0;
3806         }
3807     }
3808 
3809     uint64_t val = 0;
3810     val = deposit64(val, 0, frac_size, out_frac);
3811     val = deposit64(val, frac_size, exp_size, out_exp);
3812     val = deposit64(val, frac_size + exp_size, 1, sign);
3813     return val;
3814 }
3815 
3816 static float16 frec7_h(float16 f, float_status *s)
3817 {
3818     int exp_size = 5, frac_size = 10;
3819     bool sign = float16_is_neg(f);
3820 
3821     /* frec7(+-inf) = +-0 */
3822     if (float16_is_infinity(f)) {
3823         return float16_set_sign(float16_zero, sign);
3824     }
3825 
3826     /* frec7(+-0) = +-inf */
3827     if (float16_is_zero(f)) {
3828         s->float_exception_flags |= float_flag_divbyzero;
3829         return float16_set_sign(float16_infinity, sign);
3830     }
3831 
3832     /* frec7(sNaN) = canonical NaN */
3833     if (float16_is_signaling_nan(f, s)) {
3834         s->float_exception_flags |= float_flag_invalid;
3835         return float16_default_nan(s);
3836     }
3837 
3838     /* frec7(qNaN) = canonical NaN */
3839     if (float16_is_quiet_nan(f, s)) {
3840         return float16_default_nan(s);
3841     }
3842 
3843     /* +-normal, +-subnormal */
3844     uint64_t val = frec7(f, exp_size, frac_size, s);
3845     return make_float16(val);
3846 }
3847 
3848 static float32 frec7_s(float32 f, float_status *s)
3849 {
3850     int exp_size = 8, frac_size = 23;
3851     bool sign = float32_is_neg(f);
3852 
3853     /* frec7(+-inf) = +-0 */
3854     if (float32_is_infinity(f)) {
3855         return float32_set_sign(float32_zero, sign);
3856     }
3857 
3858     /* frec7(+-0) = +-inf */
3859     if (float32_is_zero(f)) {
3860         s->float_exception_flags |= float_flag_divbyzero;
3861         return float32_set_sign(float32_infinity, sign);
3862     }
3863 
3864     /* frec7(sNaN) = canonical NaN */
3865     if (float32_is_signaling_nan(f, s)) {
3866         s->float_exception_flags |= float_flag_invalid;
3867         return float32_default_nan(s);
3868     }
3869 
3870     /* frec7(qNaN) = canonical NaN */
3871     if (float32_is_quiet_nan(f, s)) {
3872         return float32_default_nan(s);
3873     }
3874 
3875     /* +-normal, +-subnormal */
3876     uint64_t val = frec7(f, exp_size, frac_size, s);
3877     return make_float32(val);
3878 }
3879 
3880 static float64 frec7_d(float64 f, float_status *s)
3881 {
3882     int exp_size = 11, frac_size = 52;
3883     bool sign = float64_is_neg(f);
3884 
3885     /* frec7(+-inf) = +-0 */
3886     if (float64_is_infinity(f)) {
3887         return float64_set_sign(float64_zero, sign);
3888     }
3889 
3890     /* frec7(+-0) = +-inf */
3891     if (float64_is_zero(f)) {
3892         s->float_exception_flags |= float_flag_divbyzero;
3893         return float64_set_sign(float64_infinity, sign);
3894     }
3895 
3896     /* frec7(sNaN) = canonical NaN */
3897     if (float64_is_signaling_nan(f, s)) {
3898         s->float_exception_flags |= float_flag_invalid;
3899         return float64_default_nan(s);
3900     }
3901 
3902     /* frec7(qNaN) = canonical NaN */
3903     if (float64_is_quiet_nan(f, s)) {
3904         return float64_default_nan(s);
3905     }
3906 
3907     /* +-normal, +-subnormal */
3908     uint64_t val = frec7(f, exp_size, frac_size, s);
3909     return make_float64(val);
3910 }
3911 
3912 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3913 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3914 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3915 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
3916 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
3917 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
3918 
3919 /* Vector Floating-Point MIN/MAX Instructions */
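/*
 * vfmin/vfmax use softfloat's *_minimum_number/*_maximum_number, i.e. the
 * IEEE 754-2019 minimumNumber/maximumNumber operations: if exactly one
 * operand is NaN the numeric operand is returned, and only an all-NaN
 * input yields the canonical NaN.
 */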
3920 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3921 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3922 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3923 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
3924 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
3925 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
3926 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3927 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3928 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3929 GEN_VEXT_VF(vfmin_vf_h, 2)
3930 GEN_VEXT_VF(vfmin_vf_w, 4)
3931 GEN_VEXT_VF(vfmin_vf_d, 8)
3932 
3933 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3934 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3935 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3936 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
3937 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
3938 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
3939 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3940 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3941 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3942 GEN_VEXT_VF(vfmax_vf_h, 2)
3943 GEN_VEXT_VF(vfmax_vf_w, 4)
3944 GEN_VEXT_VF(vfmax_vf_d, 8)
3945 
3946 /* Vector Floating-Point Sign-Injection Instructions */
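/*
 * Sign injection works on the raw encodings with deposit64(): the result
 * keeps the magnitude (low SEW-1 bits) of vs2 and takes its sign bit from
 * vs1/rs1 (vfsgnj), from the inverted vs1/rs1 sign (vfsgnjn), or from the
 * XOR of both signs (vfsgnjx).  No fp_status flags are raised.
 */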
3947 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3948 {
3949     return deposit64(b, 0, 15, a);
3950 }
3951 
3952 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3953 {
3954     return deposit64(b, 0, 31, a);
3955 }
3956 
3957 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3958 {
3959     return deposit64(b, 0, 63, a);
3960 }
3961 
3962 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3963 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3964 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3965 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
3966 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
3967 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
3968 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3969 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3970 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3971 GEN_VEXT_VF(vfsgnj_vf_h, 2)
3972 GEN_VEXT_VF(vfsgnj_vf_w, 4)
3973 GEN_VEXT_VF(vfsgnj_vf_d, 8)
3974 
3975 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3976 {
3977     return deposit64(~b, 0, 15, a);
3978 }
3979 
3980 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3981 {
3982     return deposit64(~b, 0, 31, a);
3983 }
3984 
3985 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3986 {
3987     return deposit64(~b, 0, 63, a);
3988 }
3989 
3990 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
3991 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
3992 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
3993 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
3994 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
3995 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
3996 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
3997 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
3998 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
3999 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4000 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4001 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4002 
4003 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4004 {
4005     return deposit64(b ^ a, 0, 15, a);
4006 }
4007 
4008 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4009 {
4010     return deposit64(b ^ a, 0, 31, a);
4011 }
4012 
4013 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4014 {
4015     return deposit64(b ^ a, 0, 63, a);
4016 }
4017 
4018 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4019 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4020 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4021 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4022 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4023 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4024 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4025 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4026 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4027 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4028 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4029 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4030 
4031 /* Vector Floating-Point Compare Instructions */
4032 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4033 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4034                   CPURISCVState *env, uint32_t desc)          \
4035 {                                                             \
4036     uint32_t vm = vext_vm(desc);                              \
4037     uint32_t vl = env->vl;                                    \
4038     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4039     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4040     uint32_t vma = vext_vma(desc);                            \
4041     uint32_t i;                                               \
4042                                                               \
4043     VSTART_CHECK_EARLY_EXIT(env);                             \
4044                                                               \
4045     for (i = env->vstart; i < vl; i++) {                      \
4046         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4047         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4048         if (!vm && !vext_elem_mask(v0, i)) {                  \
4049             /* set masked-off elements to 1s */               \
4050             if (vma) {                                        \
4051                 vext_set_elem_mask(vd, i, 1);                 \
4052             }                                                 \
4053             continue;                                         \
4054         }                                                     \
4055         vext_set_elem_mask(vd, i,                             \
4056                            DO_OP(s2, s1, &env->fp_status));   \
4057     }                                                         \
4058     env->vstart = 0;                                          \
4059     /*
4060      * mask destination register is always tail-agnostic
4061      * set tail elements to 1s
4062      */                                                       \
4063     if (vta_all_1s) {                                         \
4064         for (; i < total_elems; i++) {                        \
4065             vext_set_elem_mask(vd, i, 1);                     \
4066         }                                                     \
4067     }                                                         \
4068 }
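/*
 * Each compare helper generated here writes one mask bit per element:
 * active elements get DO_OP(vs2[i], vs1[i]), masked-off elements are
 * either left untouched or set to 1 according to the mask-agnostic
 * (vma) policy, and when the tail-agnostic all-1s option is in effect
 * the bits from vl up to VLEN are filled with 1s.
 */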
4069 
4070 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4071 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4072 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4073 
4074 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4075 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4076                   CPURISCVState *env, uint32_t desc)                \
4077 {                                                                   \
4078     uint32_t vm = vext_vm(desc);                                    \
4079     uint32_t vl = env->vl;                                          \
4080     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4081     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4082     uint32_t vma = vext_vma(desc);                                  \
4083     uint32_t i;                                                     \
4084                                                                     \
4085     VSTART_CHECK_EARLY_EXIT(env);                                   \
4086                                                                     \
4087     for (i = env->vstart; i < vl; i++) {                            \
4088         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4089         if (!vm && !vext_elem_mask(v0, i)) {                        \
4090             /* set masked-off elements to 1s */                     \
4091             if (vma) {                                              \
4092                 vext_set_elem_mask(vd, i, 1);                       \
4093             }                                                       \
4094             continue;                                               \
4095         }                                                           \
4096         vext_set_elem_mask(vd, i,                                   \
4097                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4098     }                                                               \
4099     env->vstart = 0;                                                \
4100     /*
4101      * mask destination register is always tail-agnostic
4102      * set tail elements to 1s
4103      */                                                             \
4104     if (vta_all_1s) {                                               \
4105         for (; i < total_elems; i++) {                              \
4106             vext_set_elem_mask(vd, i, 1);                           \
4107         }                                                           \
4108     }                                                               \
4109 }
4110 
4111 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4112 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4113 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4114 
4115 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4116 {
4117     FloatRelation compare = float16_compare_quiet(a, b, s);
4118     return compare != float_relation_equal;
4119 }
4120 
4121 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4122 {
4123     FloatRelation compare = float32_compare_quiet(a, b, s);
4124     return compare != float_relation_equal;
4125 }
4126 
4127 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4128 {
4129     FloatRelation compare = float64_compare_quiet(a, b, s);
4130     return compare != float_relation_equal;
4131 }
4132 
4133 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4134 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4135 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4136 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4137 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4138 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4139 
4140 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4141 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4142 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4143 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4144 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4145 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4146 
4147 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4148 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4149 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4150 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4151 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4152 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4153 
4154 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4155 {
4156     FloatRelation compare = float16_compare(a, b, s);
4157     return compare == float_relation_greater;
4158 }
4159 
4160 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4161 {
4162     FloatRelation compare = float32_compare(a, b, s);
4163     return compare == float_relation_greater;
4164 }
4165 
4166 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4167 {
4168     FloatRelation compare = float64_compare(a, b, s);
4169     return compare == float_relation_greater;
4170 }
4171 
4172 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4173 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4174 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4175 
4176 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4177 {
4178     FloatRelation compare = float16_compare(a, b, s);
4179     return compare == float_relation_greater ||
4180            compare == float_relation_equal;
4181 }
4182 
4183 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4184 {
4185     FloatRelation compare = float32_compare(a, b, s);
4186     return compare == float_relation_greater ||
4187            compare == float_relation_equal;
4188 }
4189 
4190 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4191 {
4192     FloatRelation compare = float64_compare(a, b, s);
4193     return compare == float_relation_greater ||
4194            compare == float_relation_equal;
4195 }
4196 
4197 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4198 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4199 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4200 
4201 /* Vector Floating-Point Classify Instruction */
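/*
 * The classify helpers return a ten-bit mask with exactly one bit set:
 *   bit 0: negative infinity     bit 5: positive subnormal
 *   bit 1: negative normal       bit 6: positive normal
 *   bit 2: negative subnormal    bit 7: positive infinity
 *   bit 3: negative zero         bit 8: signaling NaN
 *   bit 4: positive zero         bit 9: quiet NaN
 */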
4202 target_ulong fclass_h(uint64_t frs1)
4203 {
4204     float16 f = frs1;
4205     bool sign = float16_is_neg(f);
4206 
4207     if (float16_is_infinity(f)) {
4208         return sign ? 1 << 0 : 1 << 7;
4209     } else if (float16_is_zero(f)) {
4210         return sign ? 1 << 3 : 1 << 4;
4211     } else if (float16_is_zero_or_denormal(f)) {
4212         return sign ? 1 << 2 : 1 << 5;
4213     } else if (float16_is_any_nan(f)) {
4214         float_status s = { }; /* for snan_bit_is_one */
4215         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4216     } else {
4217         return sign ? 1 << 1 : 1 << 6;
4218     }
4219 }
4220 
4221 target_ulong fclass_s(uint64_t frs1)
4222 {
4223     float32 f = frs1;
4224     bool sign = float32_is_neg(f);
4225 
4226     if (float32_is_infinity(f)) {
4227         return sign ? 1 << 0 : 1 << 7;
4228     } else if (float32_is_zero(f)) {
4229         return sign ? 1 << 3 : 1 << 4;
4230     } else if (float32_is_zero_or_denormal(f)) {
4231         return sign ? 1 << 2 : 1 << 5;
4232     } else if (float32_is_any_nan(f)) {
4233         float_status s = { }; /* for snan_bit_is_one */
4234         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4235     } else {
4236         return sign ? 1 << 1 : 1 << 6;
4237     }
4238 }
4239 
4240 target_ulong fclass_d(uint64_t frs1)
4241 {
4242     float64 f = frs1;
4243     bool sign = float64_is_neg(f);
4244 
4245     if (float64_is_infinity(f)) {
4246         return sign ? 1 << 0 : 1 << 7;
4247     } else if (float64_is_zero(f)) {
4248         return sign ? 1 << 3 : 1 << 4;
4249     } else if (float64_is_zero_or_denormal(f)) {
4250         return sign ? 1 << 2 : 1 << 5;
4251     } else if (float64_is_any_nan(f)) {
4252         float_status s = { }; /* for snan_bit_is_one */
4253         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4254     } else {
4255         return sign ? 1 << 1 : 1 << 6;
4256     }
4257 }
4258 
4259 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4260 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4261 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4262 GEN_VEXT_V(vfclass_v_h, 2)
4263 GEN_VEXT_V(vfclass_v_w, 4)
4264 GEN_VEXT_V(vfclass_v_d, 8)
4265 
4266 /* Vector Floating-Point Merge Instruction */
4267 
4268 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4269 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4270                   CPURISCVState *env, uint32_t desc)          \
4271 {                                                             \
4272     uint32_t vm = vext_vm(desc);                              \
4273     uint32_t vl = env->vl;                                    \
4274     uint32_t esz = sizeof(ETYPE);                             \
4275     uint32_t total_elems =                                    \
4276         vext_get_total_elems(env, desc, esz);                 \
4277     uint32_t vta = vext_vta(desc);                            \
4278     uint32_t i;                                               \
4279                                                               \
4280     VSTART_CHECK_EARLY_EXIT(env);                             \
4281                                                               \
4282     for (i = env->vstart; i < vl; i++) {                      \
4283         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4284         *((ETYPE *)vd + H(i)) =                               \
4285             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4286     }                                                         \
4287     env->vstart = 0;                                          \
4288     /* set tail elements to 1s */                             \
4289     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4290 }
4291 
4292 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4293 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4294 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4295 
4296 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4297 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4298 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4299 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4300 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4301 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4302 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4303 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4304 
4305 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4306 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4307 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4308 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4309 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4310 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4311 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4312 
4313 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4314 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4315 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4316 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4317 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4318 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4319 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4320 
4321 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4322 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4323 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4324 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4325 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4326 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4327 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4328 
4329 /* Widening Floating-Point/Integer Type-Convert Instructions */
4330 /* (TD, T2, TX2) */
4331 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4332 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4333 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4334 /*
4335  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4336  */
4337 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4338 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4339 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4340 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4341 
4342 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4343 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4344 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4345 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4346 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4347 
4348 /*
4349  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4350  */
4351 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4352 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4353 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4354 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4355 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4356 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4357 
4358 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4359 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4360 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4361 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4362 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4363 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4364 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4365 
4366 /*
4367  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4368  */
4369 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4370 {
4371     return float16_to_float32(a, true, s);
4372 }
4373 
4374 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4375 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4376 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4377 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4378 
4379 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4380 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4381 
4382 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4383 /* (TD, T2, TX2) */
4384 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4385 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4386 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4387 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4388 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4389 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4390 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4391 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4392 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4393 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4394 
4395 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4396 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4397 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4398 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4399 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4400 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4401 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4402 
4403 /*
4404  * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4405  */
4406 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4407 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4408 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4409 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4410 
4411 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4412 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4413 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4414 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4415 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4416 
4417 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4418 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4419 {
4420     return float32_to_float16(a, true, s);
4421 }
4422 
4423 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4424 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4425 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4426 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4427 
4428 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4429 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4430 
4431 /*
4432  * Vector Reduction Operations
4433  */
4434 /* Vector Single-Width Integer Reduction Instructions */
4435 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4436 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4437                   void *vs2, CPURISCVState *env,          \
4438                   uint32_t desc)                          \
4439 {                                                         \
4440     uint32_t vm = vext_vm(desc);                          \
4441     uint32_t vl = env->vl;                                \
4442     uint32_t esz = sizeof(TD);                            \
4443     uint32_t vlenb = simd_maxsz(desc);                    \
4444     uint32_t vta = vext_vta(desc);                        \
4445     uint32_t i;                                           \
4446     TD s1 =  *((TD *)vs1 + HD(0));                        \
4447                                                           \
4448     for (i = env->vstart; i < vl; i++) {                  \
4449         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4450         if (!vm && !vext_elem_mask(v0, i)) {              \
4451             continue;                                     \
4452         }                                                 \
4453         s1 = OP(s1, (TD)s2);                              \
4454     }                                                     \
4455     *((TD *)vd + HD(0)) = s1;                             \
4456     env->vstart = 0;                                      \
4457     /* set tail elements to 1s */                         \
4458     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4459 }
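/*
 * Reductions fold the whole body into a scalar: s1 starts from vs1[0],
 * every active element of vs2 is accumulated with OP, and only vd[0]
 * is written back.  For example, GEN_VEXT_RED(vredsum_vs_b, ...) below
 * yields a helper computing vd[0] = vs1[0] + sum of active vs2 bytes.
 * The bytes of vd from esz up to vlenb are treated as tail.
 */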
4460 
4461 /* vd[0] = sum(vs1[0], vs2[*]) */
4462 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4463 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4464 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4465 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4466 
4467 /* vd[0] = maxu(vs1[0], vs2[*]) */
4468 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4469 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4470 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4471 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4472 
4473 /* vd[0] = max(vs1[0], vs2[*]) */
4474 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4475 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4476 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4477 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4478 
4479 /* vd[0] = minu(vs1[0], vs2[*]) */
4480 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4481 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4482 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4483 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4484 
4485 /* vd[0] = min(vs1[0], vs2[*]) */
4486 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4487 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4488 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4489 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4490 
4491 /* vd[0] = and(vs1[0], vs2[*]) */
4492 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4493 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4494 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4495 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4496 
4497 /* vd[0] = or(vs1[0], vs2[*]) */
4498 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4499 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4500 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4501 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4502 
4503 /* vd[0] = xor(vs1[0], vs2[*]) */
4504 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4505 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4506 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4507 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4508 
4509 /* Vector Widening Integer Reduction Instructions */
4510 /* Signed sum reduction into double-width accumulator */
4511 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4512 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4513 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4514 
4515 /* Unsigned sum reduction into double-width accumulator */
4516 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4517 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4518 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4519 
4520 /* Vector Single-Width Floating-Point Reduction Instructions */
4521 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4522 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4523                   void *vs2, CPURISCVState *env,           \
4524                   uint32_t desc)                           \
4525 {                                                          \
4526     uint32_t vm = vext_vm(desc);                           \
4527     uint32_t vl = env->vl;                                 \
4528     uint32_t esz = sizeof(TD);                             \
4529     uint32_t vlenb = simd_maxsz(desc);                     \
4530     uint32_t vta = vext_vta(desc);                         \
4531     uint32_t i;                                            \
4532     TD s1 =  *((TD *)vs1 + HD(0));                         \
4533                                                            \
4534     for (i = env->vstart; i < vl; i++) {                   \
4535         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4536         if (!vm && !vext_elem_mask(v0, i)) {               \
4537             continue;                                      \
4538         }                                                  \
4539         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4540     }                                                      \
4541     *((TD *)vd + HD(0)) = s1;                              \
4542     env->vstart = 0;                                       \
4543     /* set tail elements to 1s */                          \
4544     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4545 }
4546 
4547 /* Unordered sum */
4548 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4549 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4550 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4551 
4552 /* Ordered sum */
4553 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4554 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4555 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
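/*
 * Note that the unordered and ordered sums share GEN_VEXT_FRED and the
 * same element-order loop, so both reduce strictly in element order in
 * this implementation; the distinction only matters for hardware that
 * is allowed to reassociate the unordered form.
 */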
4556 
4557 /* Maximum value */
4558 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4559               float16_maximum_number)
4560 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4561               float32_maximum_number)
4562 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4563               float64_maximum_number)
4564 
4565 /* Minimum value */
4566 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4567               float16_minimum_number)
4568 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4569               float32_minimum_number)
4570 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4571               float64_minimum_number)
4572 
4573 /* Vector Widening Floating-Point Add Instructions */
4574 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4575 {
4576     return float32_add(a, float16_to_float32(b, true, s), s);
4577 }
4578 
4579 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4580 {
4581     return float64_add(a, float32_to_float64(b, s), s);
4582 }
4583 
4584 /* Vector Widening Floating-Point Reduction Instructions */
4585 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4586 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4587 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4588 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4589 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4590 
4591 /*
4592  * Vector Mask Operations
4593  */
4594 /* Vector Mask-Register Logical Instructions */
4595 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4596 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4597                   void *vs2, CPURISCVState *env,          \
4598                   uint32_t desc)                          \
4599 {                                                         \
4600     uint32_t vl = env->vl;                                \
4601     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4602     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4603     uint32_t i;                                           \
4604     int a, b;                                             \
4605                                                           \
4606     VSTART_CHECK_EARLY_EXIT(env);                         \
4607                                                           \
4608     for (i = env->vstart; i < vl; i++) {                  \
4609         a = vext_elem_mask(vs1, i);                       \
4610         b = vext_elem_mask(vs2, i);                       \
4611         vext_set_elem_mask(vd, i, OP(b, a));              \
4612     }                                                     \
4613     env->vstart = 0;                                      \
4614     /*
4615      * mask destination register is always tail-agnostic
4616      * set tail elements to 1s
4617      */                                                   \
4618     if (vta_all_1s) {                                     \
4619         for (; i < total_elems; i++) {                    \
4620             vext_set_elem_mask(vd, i, 1);                 \
4621         }                                                 \
4622     }                                                     \
4623 }
4624 
4625 #define DO_NAND(N, M)  (!(N & M))
4626 #define DO_ANDNOT(N, M)  (N & !M)
4627 #define DO_NOR(N, M)  (!(N | M))
4628 #define DO_ORNOT(N, M)  (N | !M)
4629 #define DO_XNOR(N, M)  (!(N ^ M))
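/*
 * The operands here are single mask bits (0 or 1) read via
 * vext_elem_mask(), so logical '!' serves as the one-bit complement in
 * DO_ANDNOT and DO_ORNOT.
 */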
4630 
4631 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4632 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4633 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4634 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4635 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4636 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4637 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4638 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4639 
4640 /* Vector count population in mask (vcpop.m) */
4641 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4642                              uint32_t desc)
4643 {
4644     target_ulong cnt = 0;
4645     uint32_t vm = vext_vm(desc);
4646     uint32_t vl = env->vl;
4647     int i;
4648 
4649     for (i = env->vstart; i < vl; i++) {
4650         if (vm || vext_elem_mask(v0, i)) {
4651             if (vext_elem_mask(vs2, i)) {
4652                 cnt++;
4653             }
4654         }
4655     }
4656     env->vstart = 0;
4657     return cnt;
4658 }
4659 
4660 /* vfirst find-first-set mask bit */
4661 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4662                               uint32_t desc)
4663 {
4664     uint32_t vm = vext_vm(desc);
4665     uint32_t vl = env->vl;
4666     int i;
4667 
4668     for (i = env->vstart; i < vl; i++) {
4669         if (vm || vext_elem_mask(v0, i)) {
4670             if (vext_elem_mask(vs2, i)) {
4671                 return i;
4672             }
4673         }
4674     }
4675     env->vstart = 0;
4676     return -1LL;
4677 }
4678 
4679 enum set_mask_type {
4680     ONLY_FIRST = 1,
4681     INCLUDE_FIRST,
4682     BEFORE_FIRST,
4683 };
4684 
4685 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4686                    uint32_t desc, enum set_mask_type type)
4687 {
4688     uint32_t vm = vext_vm(desc);
4689     uint32_t vl = env->vl;
4690     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4691     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4692     uint32_t vma = vext_vma(desc);
4693     int i;
4694     bool first_mask_bit = false;
4695 
4696     for (i = env->vstart; i < vl; i++) {
4697         if (!vm && !vext_elem_mask(v0, i)) {
4698             /* set masked-off elements to 1s */
4699             if (vma) {
4700                 vext_set_elem_mask(vd, i, 1);
4701             }
4702             continue;
4703         }
4704         /* write a zero to all following active elements */
4705         if (first_mask_bit) {
4706             vext_set_elem_mask(vd, i, 0);
4707             continue;
4708         }
4709         if (vext_elem_mask(vs2, i)) {
4710             first_mask_bit = true;
4711             if (type == BEFORE_FIRST) {
4712                 vext_set_elem_mask(vd, i, 0);
4713             } else {
4714                 vext_set_elem_mask(vd, i, 1);
4715             }
4716         } else {
4717             if (type == ONLY_FIRST) {
4718                 vext_set_elem_mask(vd, i, 0);
4719             } else {
4720                 vext_set_elem_mask(vd, i, 1);
4721             }
4722         }
4723     }
4724     env->vstart = 0;
4725     /*
4726      * mask destination register is always tail-agnostic
4727      * set tail elements to 1s
4728      */
4729     if (vta_all_1s) {
4730         for (; i < total_elems; i++) {
4731             vext_set_elem_mask(vd, i, 1);
4732         }
4733     }
4734 }
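/*
 * For illustration, with vs2 = ...00100 (first set bit at element 2)
 * and all elements active:
 *   vmsbf.m -> ...00011   (set Before the First set bit)
 *   vmsif.m -> ...00111   (set up to and Including the First set bit)
 *   vmsof.m -> ...00100   (set Only the First set bit)
 */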
4735 
4736 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4737                      uint32_t desc)
4738 {
4739     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4740 }
4741 
4742 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4743                      uint32_t desc)
4744 {
4745     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4746 }
4747 
4748 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4749                      uint32_t desc)
4750 {
4751     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4752 }
4753 
4754 /* Vector Iota Instruction */
4755 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4756 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4757                   uint32_t desc)                                          \
4758 {                                                                         \
4759     uint32_t vm = vext_vm(desc);                                          \
4760     uint32_t vl = env->vl;                                                \
4761     uint32_t esz = sizeof(ETYPE);                                         \
4762     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4763     uint32_t vta = vext_vta(desc);                                        \
4764     uint32_t vma = vext_vma(desc);                                        \
4765     uint32_t sum = 0;                                                     \
4766     int i;                                                                \
4767                                                                           \
4768     for (i = env->vstart; i < vl; i++) {                                  \
4769         if (!vm && !vext_elem_mask(v0, i)) {                              \
4770             /* set masked-off elements to 1s */                           \
4771             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4772             continue;                                                     \
4773         }                                                                 \
4774         *((ETYPE *)vd + H(i)) = sum;                                      \
4775         if (vext_elem_mask(vs2, i)) {                                     \
4776             sum++;                                                        \
4777         }                                                                 \
4778     }                                                                     \
4779     env->vstart = 0;                                                      \
4780     /* set tail elements to 1s */                                         \
4781     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4782 }
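/*
 * viota writes a running count of the mask bits seen so far: for an
 * unmasked viota.m with vs2 = {1, 0, 1, 1}, vd becomes {0, 1, 1, 2},
 * i.e. each element receives the number of set vs2 bits at lower
 * indices.
 */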
4783 
4784 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4785 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4786 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4787 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4788 
4789 /* Vector Element Index Instruction */
4790 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4791 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4792 {                                                                         \
4793     uint32_t vm = vext_vm(desc);                                          \
4794     uint32_t vl = env->vl;                                                \
4795     uint32_t esz = sizeof(ETYPE);                                         \
4796     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4797     uint32_t vta = vext_vta(desc);                                        \
4798     uint32_t vma = vext_vma(desc);                                        \
4799     int i;                                                                \
4800                                                                           \
4801     VSTART_CHECK_EARLY_EXIT(env);                                         \
4802                                                                           \
4803     for (i = env->vstart; i < vl; i++) {                                  \
4804         if (!vm && !vext_elem_mask(v0, i)) {                              \
4805             /* set masked-off elements to 1s */                           \
4806             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4807             continue;                                                     \
4808         }                                                                 \
4809         *((ETYPE *)vd + H(i)) = i;                                        \
4810     }                                                                     \
4811     env->vstart = 0;                                                      \
4812     /* set tail elements to 1s */                                         \
4813     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4814 }
4815 
4816 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4817 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4818 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4819 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4820 
4821 /*
4822  * Vector Permutation Instructions
4823  */
4824 
4825 /* Vector Slide Instructions */
4826 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4827 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4828                   CPURISCVState *env, uint32_t desc)                      \
4829 {                                                                         \
4830     uint32_t vm = vext_vm(desc);                                          \
4831     uint32_t vl = env->vl;                                                \
4832     uint32_t esz = sizeof(ETYPE);                                         \
4833     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4834     uint32_t vta = vext_vta(desc);                                        \
4835     uint32_t vma = vext_vma(desc);                                        \
4836     target_ulong offset = s1, i_min, i;                                   \
4837                                                                           \
4838     VSTART_CHECK_EARLY_EXIT(env);                                         \
4839                                                                           \
4840     i_min = MAX(env->vstart, offset);                                     \
4841     for (i = i_min; i < vl; i++) {                                        \
4842         if (!vm && !vext_elem_mask(v0, i)) {                              \
4843             /* set masked-off elements to 1s */                           \
4844             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4845             continue;                                                     \
4846         }                                                                 \
4847         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4848     }                                                                     \
4849     env->vstart = 0;                                                      \
4850     /* set tail elements to 1s */                                         \
4851     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4852 }
4853 
4854 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4855 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4856 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4857 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4858 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4859 
4860 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4861 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4862                   CPURISCVState *env, uint32_t desc)                      \
4863 {                                                                         \
4864     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4865     uint32_t vm = vext_vm(desc);                                          \
4866     uint32_t vl = env->vl;                                                \
4867     uint32_t esz = sizeof(ETYPE);                                         \
4868     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4869     uint32_t vta = vext_vta(desc);                                        \
4870     uint32_t vma = vext_vma(desc);                                        \
4871     target_ulong i_max, i_min, i;                                         \
4872                                                                           \
4873     VSTART_CHECK_EARLY_EXIT(env);                                         \
4874                                                                           \
4875     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
4876     i_max = MAX(i_min, env->vstart);                                      \
4877     for (i = env->vstart; i < i_max; ++i) {                               \
4878         if (!vm && !vext_elem_mask(v0, i)) {                              \
4879             /* set masked-off elements to 1s */                           \
4880             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4881             continue;                                                     \
4882         }                                                                 \
4883         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
4884     }                                                                     \
4885                                                                           \
4886     for (i = i_max; i < vl; ++i) {                                        \
4887         if (vm || vext_elem_mask(v0, i)) {                                \
4888             *((ETYPE *)vd + H(i)) = 0;                                    \
4889         }                                                                 \
4890     }                                                                     \
4891                                                                           \
4892     env->vstart = 0;                                                      \
4893     /* set tail elements to 1s */                                         \
4894     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4895 }
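/*
 * In the slidedown helper above, i_min/i_max bound the destination
 * indices whose source element i + s1 still lies inside the register
 * group; active elements beyond that point (up to vl) are zeroed by
 * the second loop instead of being read out of range.
 */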
4896 
4897 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4898 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4899 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4900 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4901 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4902 
4903 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
4904 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4905                                  void *vs2, CPURISCVState *env,             \
4906                                  uint32_t desc)                             \
4907 {                                                                           \
4908     typedef uint##BITWIDTH##_t ETYPE;                                       \
4909     uint32_t vm = vext_vm(desc);                                            \
4910     uint32_t vl = env->vl;                                                  \
4911     uint32_t esz = sizeof(ETYPE);                                           \
4912     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
4913     uint32_t vta = vext_vta(desc);                                          \
4914     uint32_t vma = vext_vma(desc);                                          \
4915     uint32_t i;                                                             \
4916                                                                             \
4917     VSTART_CHECK_EARLY_EXIT(env);                                           \
4918                                                                             \
4919     for (i = env->vstart; i < vl; i++) {                                    \
4920         if (!vm && !vext_elem_mask(v0, i)) {                                \
4921             /* set masked-off elements to 1s */                             \
4922             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
4923             continue;                                                       \
4924         }                                                                   \
4925         if (i == 0) {                                                       \
4926             *((ETYPE *)vd + H(i)) = s1;                                     \
4927         } else {                                                            \
4928             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4929         }                                                                   \
4930     }                                                                       \
4931     env->vstart = 0;                                                        \
4932     /* set tail elements to 1s */                                           \
4933     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
4934 }
4935 
4936 GEN_VEXT_VSLIDE1UP(8,  H1)
4937 GEN_VEXT_VSLIDE1UP(16, H2)
4938 GEN_VEXT_VSLIDE1UP(32, H4)
4939 GEN_VEXT_VSLIDE1UP(64, H8)
4940 
4941 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4942 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4943                   CPURISCVState *env, uint32_t desc)              \
4944 {                                                                 \
4945     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4946 }
4947 
4948 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4949 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4950 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4951 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4952 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4953 
4954 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4955 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4956                                    void *vs2, CPURISCVState *env,             \
4957                                    uint32_t desc)                             \
4958 {                                                                             \
4959     typedef uint##BITWIDTH##_t ETYPE;                                         \
4960     uint32_t vm = vext_vm(desc);                                              \
4961     uint32_t vl = env->vl;                                                    \
4962     uint32_t esz = sizeof(ETYPE);                                             \
4963     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
4964     uint32_t vta = vext_vta(desc);                                            \
4965     uint32_t vma = vext_vma(desc);                                            \
4966     uint32_t i;                                                               \
4967                                                                               \
4968     VSTART_CHECK_EARLY_EXIT(env);                                             \
4969                                                                               \
4970     for (i = env->vstart; i < vl; i++) {                                      \
4971         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4972             /* set masked-off elements to 1s */                               \
4973             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
4974             continue;                                                         \
4975         }                                                                     \
4976         if (i == vl - 1) {                                                    \
4977             *((ETYPE *)vd + H(i)) = s1;                                       \
4978         } else {                                                              \
4979             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4980         }                                                                     \
4981     }                                                                         \
4982     env->vstart = 0;                                                          \
4983     /* set tail elements to 1s */                                             \
4984     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
4985 }
4986 
4987 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4988 GEN_VEXT_VSLIDE1DOWN(16, H2)
4989 GEN_VEXT_VSLIDE1DOWN(32, H4)
4990 GEN_VEXT_VSLIDE1DOWN(64, H8)
4991 
4992 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4993 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4994                   CPURISCVState *env, uint32_t desc)              \
4995 {                                                                 \
4996     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4997 }
4998 
4999 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5000 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5001 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5002 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5003 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5004 
5005 /* Vector Floating-Point Slide Instructions */
5006 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5007 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5008                   CPURISCVState *env, uint32_t desc)          \
5009 {                                                             \
5010     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5011 }
5012 
5013 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5014 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5015 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5016 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5017 
5018 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5019 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5020                   CPURISCVState *env, uint32_t desc)          \
5021 {                                                             \
5022     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5023 }
5024 
5025 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5026 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5027 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5028 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5029 
5030 /* Vector Register Gather Instruction */
5031 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5032 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5033                   CPURISCVState *env, uint32_t desc)                      \
5034 {                                                                         \
5035     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5036     uint32_t vm = vext_vm(desc);                                          \
5037     uint32_t vl = env->vl;                                                \
5038     uint32_t esz = sizeof(TS2);                                           \
5039     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5040     uint32_t vta = vext_vta(desc);                                        \
5041     uint32_t vma = vext_vma(desc);                                        \
5042     uint64_t index;                                                       \
5043     uint32_t i;                                                           \
5044                                                                           \
5045     VSTART_CHECK_EARLY_EXIT(env);                                         \
5046                                                                           \
5047     for (i = env->vstart; i < vl; i++) {                                  \
5048         if (!vm && !vext_elem_mask(v0, i)) {                              \
5049             /* set masked-off elements to 1s */                           \
5050             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5051             continue;                                                     \
5052         }                                                                 \
5053         index = *((TS1 *)vs1 + HS1(i));                                   \
5054         if (index >= vlmax) {                                             \
5055             *((TS2 *)vd + HS2(i)) = 0;                                    \
5056         } else {                                                          \
5057             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5058         }                                                                 \
5059     }                                                                     \
5060     env->vstart = 0;                                                      \
5061     /* set tail elements to 1s */                                         \
5062     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5063 }
5064 
5065 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5066 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5067 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5068 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5069 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5070 
5071 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5072 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5073 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5074 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5075 
5076 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5077 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5078                   CPURISCVState *env, uint32_t desc)                      \
5079 {                                                                         \
5080     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5081     uint32_t vm = vext_vm(desc);                                          \
5082     uint32_t vl = env->vl;                                                \
5083     uint32_t esz = sizeof(ETYPE);                                         \
5084     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5085     uint32_t vta = vext_vta(desc);                                        \
5086     uint32_t vma = vext_vma(desc);                                        \
5087     uint64_t index = s1;                                                  \
5088     uint32_t i;                                                           \
5089                                                                           \
5090     VSTART_CHECK_EARLY_EXIT(env);                                         \
5091                                                                           \
5092     for (i = env->vstart; i < vl; i++) {                                  \
5093         if (!vm && !vext_elem_mask(v0, i)) {                              \
5094             /* set masked-off elements to 1s */                           \
5095             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5096             continue;                                                     \
5097         }                                                                 \
5098         if (index >= vlmax) {                                             \
5099             *((ETYPE *)vd + H(i)) = 0;                                    \
5100         } else {                                                          \
5101             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5102         }                                                                 \
5103     }                                                                     \
5104     env->vstart = 0;                                                      \
5105     /* set tail elements to 1s */                                         \
5106     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5107 }
5108 
5109 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
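/*
 * Example: with x[rs1] = 5 and vlmax = 8, every active element of vd is
 * set to vs2[5]; if x[rs1] >= vlmax, every active element is set to 0.
 */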
5110 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5111 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5112 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5113 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5114 
5115 /* Vector Compress Instruction */
5116 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5117 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5118                   CPURISCVState *env, uint32_t desc)                      \
5119 {                                                                         \
5120     uint32_t vl = env->vl;                                                \
5121     uint32_t esz = sizeof(ETYPE);                                         \
5122     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5123     uint32_t vta = vext_vta(desc);                                        \
5124     uint32_t num = 0, i;                                                  \
5125                                                                           \
5126     for (i = env->vstart; i < vl; i++) {                                  \
5127         if (!vext_elem_mask(vs1, i)) {                                    \
5128             continue;                                                     \
5129         }                                                                 \
5130         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5131         num++;                                                            \
5132     }                                                                     \
5133     env->vstart = 0;                                                      \
5134     /* set tail elements to 1s */                                         \
5135     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5136 }
5137 
5138 /* Compress into vd the elements of vs2 whose mask bit in vs1 is set */
5139 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5140 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5141 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5142 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
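
/*
 * Minimal scalar sketch of the compress semantics for SEW = 8 with the
 * mask given as a packed bit array, ignoring tail handling; the function
 * name is hypothetical and not used by the helper above.
 */
static inline uint32_t vcompress_ref_u8(uint8_t *vd, const uint64_t *mask,
                                        const uint8_t *vs2, uint32_t vl)
{
    uint32_t num = 0;

    for (uint32_t i = 0; i < vl; i++) {
        /* pack only the elements whose mask bit is set */
        if ((mask[i / 64] >> (i % 64)) & 1) {
            vd[num++] = vs2[i];
        }
    }
    return num; /* number of active elements written */
}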
5143 
5144 /* Vector Whole Register Move */
5145 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5146 {
5147     /* EEW = SEW */
5148     uint32_t maxsz = simd_maxsz(desc);
5149     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5150     uint32_t startb = env->vstart * sewb;
5151     uint32_t i = startb;
5152 
5153     if (startb >= maxsz) {
5154         env->vstart = 0;
5155         return;
5156     }
5157 
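    /*
     * On big-endian hosts, H1() permutes bytes within 8-byte groups, so
     * when the resume point falls inside a group, the remainder of that
     * group (contiguous in host memory from H1(j - 1)) is copied first.
     */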
5158     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5159         uint32_t j = ROUND_UP(i, 8);
5160         memcpy((uint8_t *)vd + H1(j - 1),
5161                (uint8_t *)vs2 + H1(j - 1),
5162                j - i);
5163         i = j;
5164     }
5165 
5166     memcpy((uint8_t *)vd + H1(i),
5167            (uint8_t *)vs2 + H1(i),
5168            maxsz - i);
5169 
5170     env->vstart = 0;
5171 }
5172 
5173 /* Vector Integer Extension */
5174 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5175 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5176                   CPURISCVState *env, uint32_t desc)             \
5177 {                                                                \
5178     uint32_t vl = env->vl;                                       \
5179     uint32_t vm = vext_vm(desc);                                 \
5180     uint32_t esz = sizeof(ETYPE);                                \
5181     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5182     uint32_t vta = vext_vta(desc);                               \
5183     uint32_t vma = vext_vma(desc);                               \
5184     uint32_t i;                                                  \
5185                                                                  \
5186     VSTART_CHECK_EARLY_EXIT(env);                                \
5187                                                                  \
5188     for (i = env->vstart; i < vl; i++) {                         \
5189         if (!vm && !vext_elem_mask(v0, i)) {                     \
5190             /* set masked-off elements to 1s */                  \
5191             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5192             continue;                                            \
5193         }                                                        \
5194         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5195     }                                                            \
5196     env->vstart = 0;                                             \
5197     /* set tail elements to 1s */                                \
5198     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5199 }
5200 
5201 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5202 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5203 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5204 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5205 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5206 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5207 
5208 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5209 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5210 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5211 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5212 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5213 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5214
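/*
 * Minimal scalar sketch of the vf2 integer extension for SEW = 16
 * (source EEW = 8), ignoring masking, vstart and tail handling; the
 * function name is hypothetical.  E.g. vzext.vf2 widens 0xff to 0x00ff,
 * while vsext.vf2 widens it to 0xffff.
 */
static inline void int_ext_vf2_ref(uint16_t *vd, const uint8_t *vs2,
                                   uint32_t vl, bool sign_extend)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = sign_extend ? (uint16_t)(int16_t)(int8_t)vs2[i]
                            : (uint16_t)vs2[i];
    }
}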