xref: /openbmc/qemu/target/riscv/vector_helper.c (revision fe1a3ace13a8b53fc20c74fb7e3337f754396e6b)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "accel/tcg/cpu-ldst.h"
26 #include "accel/tcg/probe.h"
27 #include "exec/page-protection.h"
28 #include "exec/helper-proto.h"
29 #include "exec/tlb-flags.h"
30 #include "exec/target_page.h"
31 #include "exec/tswap.h"
32 #include "fpu/softfloat.h"
33 #include "tcg/tcg-gvec-desc.h"
34 #include "internals.h"
35 #include "vector_internals.h"
36 #include <math.h>
37 
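/*
 * Helper behind the vsetvl family of instructions: s1 is the requested AVL
 * and s2 the requested vtype.  If the configuration is unsupported
 * (SEW > ELEN, an illegal fractional LMUL, vediv != 0, reserved vtype bits
 * or the vill bit set), only vill is set, vtype/vl/vstart are cleared and 0
 * is returned.  Otherwise the new vl is derived from AVL and VLMAX and the
 * new vtype is installed.
 */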
38 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
39                             target_ulong s2)
40 {
41     int vlmax, vl;
42     RISCVCPU *cpu = env_archcpu(env);
43     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
44     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
45     uint16_t sew = 8 << vsew;
46     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
47     int xlen = riscv_cpu_xlen(env);
48     bool vill = (s2 >> (xlen - 1)) & 0x1;
49     target_ulong reserved = s2 &
50                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
51                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
52     uint16_t vlen = cpu->cfg.vlenb << 3;
53     int8_t lmul;
54 
55     if (vlmul & 4) {
56         /*
57          * Fractional LMUL, check:
58          *
59          * VLEN * LMUL >= SEW
60          * VLEN >> (8 - lmul) >= sew
61          * (vlenb << 3) >> (8 - lmul) >= sew
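         *
         * e.g. VLEN = 128, vlmul = 7 (LMUL = 1/2): 128 >> (8 - 7) = 64,
         * so any SEW up to 64 passes this check.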
62          */
63         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
64             vill = true;
65         }
66     }
67 
68     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
69         /* only set vill bit. */
70         env->vill = 1;
71         env->vtype = 0;
72         env->vl = 0;
73         env->vstart = 0;
74         return 0;
75     }
76 
77     /* lmul encoded as in DisasContext::lmul */
78     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
79     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
80     if (s1 <= vlmax) {
81         vl = s1;
82     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
83         vl = (s1 + 1) >> 1;
84     } else {
85         vl = vlmax;
86     }
87     env->vl = vl;
88     env->vtype = s2;
89     env->vstart = 0;
90     env->vill = 0;
91     return vl;
92 }
93 
94 /*
95  * Get the maximum number of elements that can be operated on.
96  *
97  * log2_esz: log2 of element size in bytes.
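 *
 * e.g. vlenb = 16 (VLEN = 128 bits), LMUL = 2, SEW = 16 (log2_esz = 1):
 * VLMAX = VLEN * LMUL / SEW = 16, computed below as 16 << (1 - 1).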
98  */
99 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
100 {
101     /*
102      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
103      * so vlen in bytes (vlenb) is encoded as maxsz.
104      */
105     uint32_t vlenb = simd_maxsz(desc);
106 
107     /* Return VLMAX */
108     int scale = vext_lmul(desc) - log2_esz;
109     return scale < 0 ? vlenb >> -scale : vlenb << scale;
110 }
111 
112 /*
113  * This function checks watchpoints before the real load operation.
114  *
115  * In system mode, the TLB API probe_access is enough for the watchpoint
116  * check. In user mode, there is no watchpoint support for now.
117  *
118  * It will trigger an exception if there is no mapping in the TLB and the
119  * page table walk can't fill the TLB entry. The guest software can then
120  * return here after processing the exception, or never return.
121  */
122 static void probe_pages(CPURISCVState *env, target_ulong addr,
123                         target_ulong len, uintptr_t ra,
124                         MMUAccessType access_type)
125 {
126     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
127     target_ulong curlen = MIN(pagelen, len);
128     int mmu_index = riscv_env_mmu_index(env, false);
129 
130     probe_access(env, adjust_addr(env, addr), curlen, access_type,
131                  mmu_index, ra);
132     if (len > curlen) {
133         addr += curlen;
134         curlen = len - curlen;
135         probe_access(env, adjust_addr(env, addr), curlen, access_type,
136                      mmu_index, ra);
137     }
138 }
139 
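/*
 * Set bit 'index' of the mask register layout (one mask bit per element,
 * packed into host uint64_t words) to 'value'.
 */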
140 static inline void vext_set_elem_mask(void *v0, int index,
141                                       uint8_t value)
142 {
143     int idx = index / 64;
144     int pos = index % 64;
145     uint64_t old = ((uint64_t *)v0)[idx];
146     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
147 }
148 
149 /* element load and store operations */
150 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
151                                    uint32_t idx, void *vd, uintptr_t retaddr);
152 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
153 
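/*
 * Each element accessor comes in two flavors: the _tlb variant performs a
 * full cpu_*_data_ra() access (and may fault), while the _host variant
 * accesses guest memory directly through a host pointer that a prior probe
 * has already validated.
 */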
154 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
155 static inline QEMU_ALWAYS_INLINE                            \
156 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
157                 uint32_t idx, void *vd, uintptr_t retaddr)  \
158 {                                                           \
159     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
160     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
161 }                                                           \
162                                                             \
163 static inline QEMU_ALWAYS_INLINE                            \
164 void NAME##_host(void *vd, uint32_t idx, void *host)        \
165 {                                                           \
166     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
167     *cur = (ETYPE)LDSUF##_p(host);                          \
168 }
169 
170 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
171 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
172 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
173 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
174 
175 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
176 static inline QEMU_ALWAYS_INLINE                            \
177 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
178                 uint32_t idx, void *vd, uintptr_t retaddr)  \
179 {                                                           \
180     ETYPE data = *((ETYPE *)vd + H(idx));                   \
181     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
182 }                                                           \
183                                                             \
184 static inline QEMU_ALWAYS_INLINE                            \
185 void NAME##_host(void *vd, uint32_t idx, void *host)        \
186 {                                                           \
187     ETYPE data = *((ETYPE *)vd + H(idx));                   \
188     STSUF##_p(host, data);                                  \
189 }
190 
191 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
192 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
193 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
194 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
195 
196 static inline QEMU_ALWAYS_INLINE void
197 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
198                        void *vd, uint32_t evl, target_ulong addr,
199                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
200                        bool is_load)
201 {
202     uint32_t i;
203     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
204         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
205     }
206 }
207 
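/*
 * Contiguous host-memory path: on big-endian hosts, and for element sizes
 * larger than one byte, each element goes through the per-element accessor;
 * on little-endian hosts, byte-sized elements are copied in bulk with
 * memcpy().
 */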
208 static inline QEMU_ALWAYS_INLINE void
209 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
210                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
211                         uint32_t esz, bool is_load)
212 {
213 #if HOST_BIG_ENDIAN
214     for (; reg_start < evl; reg_start++, host += esz) {
215         ldst_host(vd, reg_start, host);
216     }
217 #else
218     if (esz == 1) {
219         uint32_t byte_offset = reg_start * esz;
220         uint32_t size = (evl - reg_start) * esz;
221 
222         if (is_load) {
223             memcpy(vd + byte_offset, host, size);
224         } else {
225             memcpy(host, vd + byte_offset, size);
226         }
227     } else {
228         for (; reg_start < evl; reg_start++, host += esz) {
229             ldst_host(vd, reg_start, host);
230         }
231     }
232 #endif
233 }
234 
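/*
 * If the operation is tail-agnostic (vta != 0), fill the tail of each of the
 * nf register fields (elements vl..max_elems-1) with all 1s.
 */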
235 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
236                                    uint32_t desc, uint32_t nf,
237                                    uint32_t esz, uint32_t max_elems)
238 {
239     uint32_t vta = vext_vta(desc);
240     int k;
241 
242     if (vta == 0) {
243         return;
244     }
245 
246     for (k = 0; k < nf; ++k) {
247         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
248                           (k * max_elems + max_elems) * esz);
249     }
250 }
251 
252 /*
253  * stride: access vector elements from strided memory
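 *
 * Element i of field k is accessed at address base + stride * i + k * esz.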
254  */
255 static void
256 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
257                  CPURISCVState *env, uint32_t desc, uint32_t vm,
258                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
259                  uintptr_t ra)
260 {
261     uint32_t i, k;
262     uint32_t nf = vext_nf(desc);
263     uint32_t max_elems = vext_max_elems(desc, log2_esz);
264     uint32_t esz = 1 << log2_esz;
265     uint32_t vma = vext_vma(desc);
266 
267     VSTART_CHECK_EARLY_EXIT(env, env->vl);
268 
269     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
270         k = 0;
271         while (k < nf) {
272             if (!vm && !vext_elem_mask(v0, i)) {
273                 /* set masked-off elements to 1s */
274                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
275                                   (i + k * max_elems + 1) * esz);
276                 k++;
277                 continue;
278             }
279             target_ulong addr = base + stride * i + (k << log2_esz);
280             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
281             k++;
282         }
283     }
284     env->vstart = 0;
285 
286     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
287 }
288 
289 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
290 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
291                   target_ulong stride, CPURISCVState *env,              \
292                   uint32_t desc)                                        \
293 {                                                                       \
294     uint32_t vm = vext_vm(desc);                                        \
295     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
296                      ctzl(sizeof(ETYPE)), GETPC());                     \
297 }
298 
299 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
300 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
301 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
302 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
303 
304 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
305 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
306                   target_ulong stride, CPURISCVState *env,              \
307                   uint32_t desc)                                        \
308 {                                                                       \
309     uint32_t vm = vext_vm(desc);                                        \
310     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
311                      ctzl(sizeof(ETYPE)), GETPC());                     \
312 }
313 
314 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
315 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
316 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
317 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
318 
319 /*
320  * unit-stride: access elements stored contiguously in memory
321  */
322 
323 /* unmasked unit-stride load and store operation */
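/*
 * vext_page_ldst_us() only handles elements known to reside on a single
 * target page: when probe_access_flags() reports a clean mapping the
 * elements are accessed directly through the returned host pointer,
 * otherwise each element goes through the TLB accessor.
 */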
324 static inline QEMU_ALWAYS_INLINE void
325 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
326                   uint32_t elems, uint32_t nf, uint32_t max_elems,
327                   uint32_t log2_esz, bool is_load, int mmu_index,
328                   vext_ldst_elem_fn_tlb *ldst_tlb,
329                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
330 {
331     void *host;
332     int i, k, flags;
333     uint32_t esz = 1 << log2_esz;
334     uint32_t size = (elems * nf) << log2_esz;
335     uint32_t evl = env->vstart + elems;
336     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
337 
338     /* Check page permission/pmp/watchpoint/etc. */
339     flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
340                                mmu_index, true, &host, ra);
341 
342     if (flags == 0) {
343         if (nf == 1) {
344             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
345                                       host, esz, is_load);
346         } else {
347             for (i = env->vstart; i < evl; ++i) {
348                 k = 0;
349                 while (k < nf) {
350                     ldst_host(vd, i + k * max_elems, host);
351                     host += esz;
352                     k++;
353                 }
354             }
355         }
356         env->vstart += elems;
357     } else {
358         if (nf == 1) {
359             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
360                                    ra, esz, is_load);
361         } else {
362             /* load bytes from guest memory */
363             for (i = env->vstart; i < evl; env->vstart = ++i) {
364                 k = 0;
365                 while (k < nf) {
366                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
367                              vd, ra);
368                     addr += esz;
369                     k++;
370                 }
371             }
372         }
373     }
374 }
375 
376 static inline QEMU_ALWAYS_INLINE void
377 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
378              vext_ldst_elem_fn_tlb *ldst_tlb,
379              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
380              uint32_t evl, uintptr_t ra, bool is_load)
381 {
382     uint32_t k;
383     target_ulong page_split, elems, addr;
384     uint32_t nf = vext_nf(desc);
385     uint32_t max_elems = vext_max_elems(desc, log2_esz);
386     uint32_t esz = 1 << log2_esz;
387     uint32_t msize = nf * esz;
388     int mmu_index = riscv_env_mmu_index(env, false);
389 
390     VSTART_CHECK_EARLY_EXIT(env, evl);
391 
392 #if defined(CONFIG_USER_ONLY)
393     /*
394      * For data sizes <= 6 bytes we get better performance by simply calling
395      * vext_continuous_ldst_tlb
396      */
397     if (nf == 1 && (evl << log2_esz) <= 6) {
398         addr = base + (env->vstart << log2_esz);
399         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
400                                  esz, is_load);
401 
402         env->vstart = 0;
403         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
404         return;
405     }
406 #endif
407 
408     /* Calculate the page range of first page */
409     addr = base + ((env->vstart * nf) << log2_esz);
410     page_split = -(addr | TARGET_PAGE_MASK);
411     /* Get number of elements */
412     elems = page_split / msize;
413     if (unlikely(env->vstart + elems >= evl)) {
414         elems = evl - env->vstart;
415     }
416 
417     /* Load/store elements in the first page */
418     if (likely(elems)) {
419         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
420                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
421     }
422 
423     /* Load/store elements in the second page */
424     if (unlikely(env->vstart < evl)) {
425         /* Cross page element */
426         if (unlikely(page_split % msize)) {
427             for (k = 0; k < nf; k++) {
428                 addr = base + ((env->vstart * nf + k) << log2_esz);
429                 ldst_tlb(env, adjust_addr(env, addr),
430                         env->vstart + k * max_elems, vd, ra);
431             }
432             env->vstart++;
433         }
434 
435         addr = base + ((env->vstart * nf) << log2_esz);
436         /* Get number of elements of second page */
437         elems = evl - env->vstart;
438 
439         /* Load/store elements in the second page */
440         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
441                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
442     }
443 
444     env->vstart = 0;
445     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
446 }
447 
448 /*
449  * masked unit-stride load and store operations are a special case of
450  * strided operations, with stride = NF * sizeof(ETYPE)
451  */
452 
453 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
454 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
455                          CPURISCVState *env, uint32_t desc)         \
456 {                                                                   \
457     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
458     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
459                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
460 }                                                                   \
461                                                                     \
462 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
463                   CPURISCVState *env, uint32_t desc)                \
464 {                                                                   \
465     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
466                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
467 }
468 
469 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
470 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
471 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
472 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
473 
474 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
475 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
476                          CPURISCVState *env, uint32_t desc)              \
477 {                                                                        \
478     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
479     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
480                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
481 }                                                                        \
482                                                                          \
483 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
484                   CPURISCVState *env, uint32_t desc)                     \
485 {                                                                        \
486     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
487                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
488 }
489 
490 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
491 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
492 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
493 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
494 
495 /*
496  * unit stride mask load and store, EEW = 1
497  */
498 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
499                     CPURISCVState *env, uint32_t desc)
500 {
501     /* evl = ceil(vl/8) */
502     uint8_t evl = (env->vl + 7) >> 3;
503     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
504                  0, evl, GETPC(), true);
505 }
506 
507 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
508                     CPURISCVState *env, uint32_t desc)
509 {
510     /* evl = ceil(vl/8) */
511     uint8_t evl = (env->vl + 7) >> 3;
512     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
513                  0, evl, GETPC(), false);
514 }
515 
516 /*
517  * index: access vector elements from indexed memory
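 *
 * The address of element i of field k is base plus the zero-extended index
 * element vs2[i], plus k * esz.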
518  */
519 typedef target_ulong vext_get_index_addr(target_ulong base,
520         uint32_t idx, void *vs2);
521 
522 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
523 static target_ulong NAME(target_ulong base,            \
524                          uint32_t idx, void *vs2)      \
525 {                                                      \
526     return (base + *((ETYPE *)vs2 + H(idx)));          \
527 }
528 
529 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
530 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
531 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
532 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
533 
534 static inline void
535 vext_ldst_index(void *vd, void *v0, target_ulong base,
536                 void *vs2, CPURISCVState *env, uint32_t desc,
537                 vext_get_index_addr get_index_addr,
538                 vext_ldst_elem_fn_tlb *ldst_elem,
539                 uint32_t log2_esz, uintptr_t ra)
540 {
541     uint32_t i, k;
542     uint32_t nf = vext_nf(desc);
543     uint32_t vm = vext_vm(desc);
544     uint32_t max_elems = vext_max_elems(desc, log2_esz);
545     uint32_t esz = 1 << log2_esz;
546     uint32_t vma = vext_vma(desc);
547 
548     VSTART_CHECK_EARLY_EXIT(env, env->vl);
549 
550     /* load bytes from guest memory */
551     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
552         k = 0;
553         while (k < nf) {
554             if (!vm && !vext_elem_mask(v0, i)) {
555                 /* set masked-off elements to 1s */
556                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
557                                   (i + k * max_elems + 1) * esz);
558                 k++;
559                 continue;
560             }
561             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
562             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
563             k++;
564         }
565     }
566     env->vstart = 0;
567 
568     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
569 }
570 
571 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
572 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
573                   void *vs2, CPURISCVState *env, uint32_t desc)            \
574 {                                                                          \
575     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
576                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
577 }
578 
579 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
580 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
581 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
582 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
583 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
584 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
585 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
586 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
587 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
588 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
589 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
590 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
591 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
592 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
593 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
594 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
595 
596 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
597 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
598                   void *vs2, CPURISCVState *env, uint32_t desc)  \
599 {                                                                \
600     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
601                     STORE_FN, ctzl(sizeof(ETYPE)),               \
602                     GETPC());                                    \
603 }
604 
605 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
606 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
607 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
608 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
609 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
610 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
611 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
612 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
613 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
614 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
615 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
616 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
617 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
618 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
619 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
620 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
621 
622 /*
623  * unit-stride fault-only-first load instructions
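 *
 * A fault on any element other than element 0 does not trap; instead, vl is
 * reduced to the number of elements that were accessed successfully.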
624  */
625 static inline void
626 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
627           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
628           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
629 {
630     uint32_t i, k, vl = 0;
631     uint32_t nf = vext_nf(desc);
632     uint32_t vm = vext_vm(desc);
633     uint32_t max_elems = vext_max_elems(desc, log2_esz);
634     uint32_t esz = 1 << log2_esz;
635     uint32_t msize = nf * esz;
636     uint32_t vma = vext_vma(desc);
637     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
638     int mmu_index = riscv_env_mmu_index(env, false);
639     int flags;
640     void *host;
641 
642     VSTART_CHECK_EARLY_EXIT(env, env->vl);
643 
644     addr = base + ((env->vstart * nf) << log2_esz);
645     page_split = -(addr | TARGET_PAGE_MASK);
646     /* Get number of elements */
647     elems = page_split / msize;
648     if (unlikely(env->vstart + elems >= env->vl)) {
649         elems = env->vl - env->vstart;
650     }
651 
652     /* Check page permission/pmp/watchpoint/etc. */
653     flags = probe_access_flags(env, adjust_addr(env, addr), elems * msize,
654                                MMU_DATA_LOAD, mmu_index, true, &host, ra);
655 
656     /* If we are crossing a page, also check the second page. */
657     if (env->vl > elems) {
658         addr_probe = addr + (elems << log2_esz);
659         flags |= probe_access_flags(env, adjust_addr(env, addr_probe),
660                                     elems * msize, MMU_DATA_LOAD, mmu_index,
661                                     true, &host, ra);
662     }
663 
664     if (flags & ~TLB_WATCHPOINT) {
665         /* probe every access */
666         for (i = env->vstart; i < env->vl; i++) {
667             if (!vm && !vext_elem_mask(v0, i)) {
668                 continue;
669             }
670             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
671             if (i == 0) {
672                 /* Allow fault on first element. */
673                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD);
674             } else {
675                 remain = nf << log2_esz;
676                 while (remain > 0) {
677                     offset = -(addr_i | TARGET_PAGE_MASK);
678 
679                     /* Probe nonfault on subsequent elements. */
680                     flags = probe_access_flags(env, addr_i, offset,
681                                                MMU_DATA_LOAD, mmu_index, true,
682                                                &host, 0);
683 
684                     /*
685                      * Stop if invalid (unmapped) or mmio (transaction may
686                      * fail). Do not stop if watchpoint, as the spec says that
687                      * first-fault should continue to access the same
688                      * elements regardless of any watchpoint.
689                      */
690                     if (flags & ~TLB_WATCHPOINT) {
691                         vl = i;
692                         goto ProbeSuccess;
693                     }
694                     if (remain <= offset) {
695                         break;
696                     }
697                     remain -= offset;
698                     addr_i = adjust_addr(env, addr_i + offset);
699                 }
700             }
701         }
702     }
703 ProbeSuccess:
704     /* load bytes from guest memory */
705     if (vl != 0) {
706         env->vl = vl;
707     }
708 
709     if (env->vstart < env->vl) {
710         if (vm) {
711             /* Load/store elements in the first page */
712             if (likely(elems)) {
713                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
714                                   log2_esz, true, mmu_index, ldst_tlb,
715                                   ldst_host, ra);
716             }
717 
718             /* Load/store elements in the second page */
719             if (unlikely(env->vstart < env->vl)) {
720                 /* Cross page element */
721                 if (unlikely(page_split % msize)) {
722                     for (k = 0; k < nf; k++) {
723                         addr = base + ((env->vstart * nf + k) << log2_esz);
724                         ldst_tlb(env, adjust_addr(env, addr),
725                                  env->vstart + k * max_elems, vd, ra);
726                     }
727                     env->vstart++;
728                 }
729 
730                 addr = base + ((env->vstart * nf) << log2_esz);
731                 /* Get number of elements of second page */
732                 elems = env->vl - env->vstart;
733 
734                 /* Load/store elements in the second page */
735                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
736                                   log2_esz, true, mmu_index, ldst_tlb,
737                                   ldst_host, ra);
738             }
739         } else {
740             for (i = env->vstart; i < env->vl; i++) {
741                 k = 0;
742                 while (k < nf) {
743                     if (!vext_elem_mask(v0, i)) {
744                         /* set masked-off elements to 1s */
745                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
746                                           (i + k * max_elems + 1) * esz);
747                         k++;
748                         continue;
749                     }
750                     addr = base + ((i * nf + k) << log2_esz);
751                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
752                              vd, ra);
753                     k++;
754                 }
755             }
756         }
757     }
758     env->vstart = 0;
759 
760     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
761 }
762 
763 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
764 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
765                   CPURISCVState *env, uint32_t desc)            \
766 {                                                               \
767     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
768               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
769 }
770 
771 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
772 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
773 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
774 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
775 
776 #define DO_SWAP(N, M) (M)
777 #define DO_AND(N, M)  (N & M)
778 #define DO_XOR(N, M)  (N ^ M)
779 #define DO_OR(N, M)   (N | M)
780 #define DO_ADD(N, M)  (N + M)
781 
782 /* Signed min/max */
783 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
784 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
785 
786 /*
787  * load and store whole register instructions
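 *
 * These operate on whole registers regardless of vl: NF is the number of
 * registers and the effective element count is nf * (vlenb >> log2_esz).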
788  */
789 static inline QEMU_ALWAYS_INLINE void
790 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
791                 vext_ldst_elem_fn_tlb *ldst_tlb,
792                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
793                 uintptr_t ra, bool is_load)
794 {
795     target_ulong page_split, elems, addr;
796     uint32_t nf = vext_nf(desc);
797     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
798     uint32_t max_elems = vlenb >> log2_esz;
799     uint32_t evl = nf * max_elems;
800     uint32_t esz = 1 << log2_esz;
801     int mmu_index = riscv_env_mmu_index(env, false);
802 
803     /* Calculate the page range of first page */
804     addr = base + (env->vstart << log2_esz);
805     page_split = -(addr | TARGET_PAGE_MASK);
806     /* Get number of elements */
807     elems = page_split / esz;
808     if (unlikely(env->vstart + elems >= evl)) {
809         elems = evl - env->vstart;
810     }
811 
812     /* Load/store elements in the first page */
813     if (likely(elems)) {
814         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
815                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
816     }
817 
818     /* Load/store elements in the second page */
819     if (unlikely(env->vstart < evl)) {
820         /* Cross page element */
821         if (unlikely(page_split % esz)) {
822             addr = base + (env->vstart << log2_esz);
823             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
824             env->vstart++;
825         }
826 
827         addr = base + (env->vstart << log2_esz);
828         /* Get number of elements of second page */
829         elems = evl - env->vstart;
830 
831         /* Load/store elements in the second page */
832         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
833                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
834     }
835 
836     env->vstart = 0;
837 }
838 
839 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
840 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
841                   uint32_t desc)                                    \
842 {                                                                   \
843     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
844                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
845 }
846 
847 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
848 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
849 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
850 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
851 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
852 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
853 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
854 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
855 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
856 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
857 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
858 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
859 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
860 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
861 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
862 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
863 
864 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
865 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
866                   uint32_t desc)                                        \
867 {                                                                       \
868     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
869                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
870 }
871 
872 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
873 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
874 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
875 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
876 
877 /*
878  * Vector Integer Arithmetic Instructions
879  */
880 
881 /* (TD, T1, T2, TX1, TX2) */
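/*
 * The three letters give the signedness of TD, T1 and T2 (S = signed,
 * U = unsigned); TX1/TX2 are the types the operands are converted to before
 * OP is applied.  The WOP_ lists have a double-width destination, the NOP_
 * lists a double-width second source.
 */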
882 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
883 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
884 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
885 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
886 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
887 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
888 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
889 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
890 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
891 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
892 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
893 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
894 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
895 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
896 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
897 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
898 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
899 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
900 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
901 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
902 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
903 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
904 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
905 
906 #define DO_SUB(N, M) (N - M)
907 #define DO_RSUB(N, M) (M - N)
908 
909 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
910 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
911 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
912 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
913 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
914 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
915 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
916 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
917 
918 GEN_VEXT_VV(vadd_vv_b, 1)
919 GEN_VEXT_VV(vadd_vv_h, 2)
920 GEN_VEXT_VV(vadd_vv_w, 4)
921 GEN_VEXT_VV(vadd_vv_d, 8)
922 GEN_VEXT_VV(vsub_vv_b, 1)
923 GEN_VEXT_VV(vsub_vv_h, 2)
924 GEN_VEXT_VV(vsub_vv_w, 4)
925 GEN_VEXT_VV(vsub_vv_d, 8)
926 
927 
928 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
929 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
930 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
931 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
932 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
933 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
934 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
935 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
936 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
937 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
938 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
939 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
940 
941 GEN_VEXT_VX(vadd_vx_b, 1)
942 GEN_VEXT_VX(vadd_vx_h, 2)
943 GEN_VEXT_VX(vadd_vx_w, 4)
944 GEN_VEXT_VX(vadd_vx_d, 8)
945 GEN_VEXT_VX(vsub_vx_b, 1)
946 GEN_VEXT_VX(vsub_vx_h, 2)
947 GEN_VEXT_VX(vsub_vx_w, 4)
948 GEN_VEXT_VX(vsub_vx_d, 8)
949 GEN_VEXT_VX(vrsub_vx_b, 1)
950 GEN_VEXT_VX(vrsub_vx_h, 2)
951 GEN_VEXT_VX(vrsub_vx_w, 4)
952 GEN_VEXT_VX(vrsub_vx_d, 8)
953 
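/*
 * gvec out-of-line helpers computing d[i] = b - a[i] (reverse subtract of a
 * vector from a scalar) over the whole operand; used by the vrsub.vx/vrsub.vi
 * expansions when the operation is not expanded inline.
 */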
954 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
955 {
956     intptr_t oprsz = simd_oprsz(desc);
957     intptr_t i;
958 
959     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
960         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
961     }
962 }
963 
964 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
965 {
966     intptr_t oprsz = simd_oprsz(desc);
967     intptr_t i;
968 
969     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
970         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
971     }
972 }
973 
974 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
975 {
976     intptr_t oprsz = simd_oprsz(desc);
977     intptr_t i;
978 
979     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
980         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
981     }
982 }
983 
984 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
985 {
986     intptr_t oprsz = simd_oprsz(desc);
987     intptr_t i;
988 
989     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
990         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
991     }
992 }
993 
994 /* Vector Widening Integer Add/Subtract */
995 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
996 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
997 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
998 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
999 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
1000 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
1001 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
1002 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1003 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1004 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1005 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1006 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1007 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1008 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1009 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1010 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1011 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1012 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1013 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1014 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1015 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1016 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1017 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1018 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1019 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1020 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1021 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1022 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1023 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1024 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1025 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1026 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1027 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1028 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1029 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1030 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1031 GEN_VEXT_VV(vwaddu_vv_b, 2)
1032 GEN_VEXT_VV(vwaddu_vv_h, 4)
1033 GEN_VEXT_VV(vwaddu_vv_w, 8)
1034 GEN_VEXT_VV(vwsubu_vv_b, 2)
1035 GEN_VEXT_VV(vwsubu_vv_h, 4)
1036 GEN_VEXT_VV(vwsubu_vv_w, 8)
1037 GEN_VEXT_VV(vwadd_vv_b, 2)
1038 GEN_VEXT_VV(vwadd_vv_h, 4)
1039 GEN_VEXT_VV(vwadd_vv_w, 8)
1040 GEN_VEXT_VV(vwsub_vv_b, 2)
1041 GEN_VEXT_VV(vwsub_vv_h, 4)
1042 GEN_VEXT_VV(vwsub_vv_w, 8)
1043 GEN_VEXT_VV(vwaddu_wv_b, 2)
1044 GEN_VEXT_VV(vwaddu_wv_h, 4)
1045 GEN_VEXT_VV(vwaddu_wv_w, 8)
1046 GEN_VEXT_VV(vwsubu_wv_b, 2)
1047 GEN_VEXT_VV(vwsubu_wv_h, 4)
1048 GEN_VEXT_VV(vwsubu_wv_w, 8)
1049 GEN_VEXT_VV(vwadd_wv_b, 2)
1050 GEN_VEXT_VV(vwadd_wv_h, 4)
1051 GEN_VEXT_VV(vwadd_wv_w, 8)
1052 GEN_VEXT_VV(vwsub_wv_b, 2)
1053 GEN_VEXT_VV(vwsub_wv_h, 4)
1054 GEN_VEXT_VV(vwsub_wv_w, 8)
1055 
1056 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1057 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1058 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1059 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1060 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1061 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1062 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1063 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1064 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1065 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1066 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1067 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1068 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1069 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1070 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1071 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1072 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1073 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1074 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1075 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1076 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1077 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1078 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1079 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1080 GEN_VEXT_VX(vwaddu_vx_b, 2)
1081 GEN_VEXT_VX(vwaddu_vx_h, 4)
1082 GEN_VEXT_VX(vwaddu_vx_w, 8)
1083 GEN_VEXT_VX(vwsubu_vx_b, 2)
1084 GEN_VEXT_VX(vwsubu_vx_h, 4)
1085 GEN_VEXT_VX(vwsubu_vx_w, 8)
1086 GEN_VEXT_VX(vwadd_vx_b, 2)
1087 GEN_VEXT_VX(vwadd_vx_h, 4)
1088 GEN_VEXT_VX(vwadd_vx_w, 8)
1089 GEN_VEXT_VX(vwsub_vx_b, 2)
1090 GEN_VEXT_VX(vwsub_vx_h, 4)
1091 GEN_VEXT_VX(vwsub_vx_w, 8)
1092 GEN_VEXT_VX(vwaddu_wx_b, 2)
1093 GEN_VEXT_VX(vwaddu_wx_h, 4)
1094 GEN_VEXT_VX(vwaddu_wx_w, 8)
1095 GEN_VEXT_VX(vwsubu_wx_b, 2)
1096 GEN_VEXT_VX(vwsubu_wx_h, 4)
1097 GEN_VEXT_VX(vwsubu_wx_w, 8)
1098 GEN_VEXT_VX(vwadd_wx_b, 2)
1099 GEN_VEXT_VX(vwadd_wx_h, 4)
1100 GEN_VEXT_VX(vwadd_wx_w, 8)
1101 GEN_VEXT_VX(vwsub_wx_b, 2)
1102 GEN_VEXT_VX(vwsub_wx_h, 4)
1103 GEN_VEXT_VX(vwsub_wx_w, 8)
1104 
1105 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1106 #define DO_VADC(N, M, C) (N + M + C)
1107 #define DO_VSBC(N, M, C) (N - M - C)
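/* The carry/borrow-in for element i is bit i of the mask register v0. */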
1108 
1109 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1110 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1111                   CPURISCVState *env, uint32_t desc)          \
1112 {                                                             \
1113     uint32_t vl = env->vl;                                    \
1114     uint32_t esz = sizeof(ETYPE);                             \
1115     uint32_t total_elems =                                    \
1116         vext_get_total_elems(env, desc, esz);                 \
1117     uint32_t vta = vext_vta(desc);                            \
1118     uint32_t i;                                               \
1119                                                               \
1120     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1121                                                               \
1122     for (i = env->vstart; i < vl; i++) {                      \
1123         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1124         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1125         ETYPE carry = vext_elem_mask(v0, i);                  \
1126                                                               \
1127         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1128     }                                                         \
1129     env->vstart = 0;                                          \
1130     /* set tail elements to 1s */                             \
1131     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1132 }
1133 
1134 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1135 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1136 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1137 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1138 
1139 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1140 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1141 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1142 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1143 
1144 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1145 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1146                   CPURISCVState *env, uint32_t desc)                     \
1147 {                                                                        \
1148     uint32_t vl = env->vl;                                               \
1149     uint32_t esz = sizeof(ETYPE);                                        \
1150     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1151     uint32_t vta = vext_vta(desc);                                       \
1152     uint32_t i;                                                          \
1153                                                                          \
1154     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1155                                                                          \
1156     for (i = env->vstart; i < vl; i++) {                                 \
1157         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1158         ETYPE carry = vext_elem_mask(v0, i);                             \
1159                                                                          \
1160         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1161     }                                                                    \
1162     env->vstart = 0;                                                     \
1163     /* set tail elements to 1s */                                        \
1164     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1165 }
1166 
1167 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1168 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1169 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1170 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1171 
1172 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1173 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1174 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1175 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1176 
1177 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1178                           (__typeof(N))(N + M) < N)
1179 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
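/*
 * DO_MADC computes the carry-out of N + M + C and DO_MSBC the borrow-out of
 * N - M - C by testing for unsigned wrap-around.
 */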
1180 
1181 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1182 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1183                   CPURISCVState *env, uint32_t desc)          \
1184 {                                                             \
1185     uint32_t vl = env->vl;                                    \
1186     uint32_t vm = vext_vm(desc);                              \
1187     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1188     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1189     uint32_t i;                                               \
1190                                                               \
1191     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1192                                                               \
1193     for (i = env->vstart; i < vl; i++) {                      \
1194         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1195         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1196         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1197         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1198     }                                                         \
1199     env->vstart = 0;                                          \
1200     /*
1201      * the mask destination register is always tail-agnostic, so
1202      * set the tail elements to 1s
1203      */                                                       \
1204     if (vta_all_1s) {                                         \
1205         for (; i < total_elems; i++) {                        \
1206             vext_set_elem_mask(vd, i, 1);                     \
1207         }                                                     \
1208     }                                                         \
1209 }
1210 
1211 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1212 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1213 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1214 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1215 
1216 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1217 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1218 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1219 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1220 
1221 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1222 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1223                   void *vs2, CPURISCVState *env, uint32_t desc) \
1224 {                                                               \
1225     uint32_t vl = env->vl;                                      \
1226     uint32_t vm = vext_vm(desc);                                \
1227     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1228     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1229     uint32_t i;                                                 \
1230                                                                 \
1231     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1232                                                                 \
1233     for (i = env->vstart; i < vl; i++) {                        \
1234         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1235         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1236         vext_set_elem_mask(vd, i,                               \
1237                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1238     }                                                           \
1239     env->vstart = 0;                                            \
1240     /*
1241      * the mask destination register is always tail-agnostic, so
1242      * set the tail elements to 1s
1243      */                                                         \
1244     if (vta_all_1s) {                                           \
1245         for (; i < total_elems; i++) {                          \
1246             vext_set_elem_mask(vd, i, 1);                       \
1247         }                                                       \
1248     }                                                           \
1249 }
1250 
1251 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1252 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1253 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1254 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1255 
1256 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1257 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1258 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1259 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1260 
1261 /* Vector Bitwise Logical Instructions */
1262 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1263 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1264 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1265 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1266 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1267 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1268 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1269 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1270 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1271 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1272 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1273 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1274 GEN_VEXT_VV(vand_vv_b, 1)
1275 GEN_VEXT_VV(vand_vv_h, 2)
1276 GEN_VEXT_VV(vand_vv_w, 4)
1277 GEN_VEXT_VV(vand_vv_d, 8)
1278 GEN_VEXT_VV(vor_vv_b, 1)
1279 GEN_VEXT_VV(vor_vv_h, 2)
1280 GEN_VEXT_VV(vor_vv_w, 4)
1281 GEN_VEXT_VV(vor_vv_d, 8)
1282 GEN_VEXT_VV(vxor_vv_b, 1)
1283 GEN_VEXT_VV(vxor_vv_h, 2)
1284 GEN_VEXT_VV(vxor_vv_w, 4)
1285 GEN_VEXT_VV(vxor_vv_d, 8)
1286 
1287 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1288 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1289 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1290 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1291 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1292 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1293 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1294 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1295 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1296 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1297 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1298 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1299 GEN_VEXT_VX(vand_vx_b, 1)
1300 GEN_VEXT_VX(vand_vx_h, 2)
1301 GEN_VEXT_VX(vand_vx_w, 4)
1302 GEN_VEXT_VX(vand_vx_d, 8)
1303 GEN_VEXT_VX(vor_vx_b, 1)
1304 GEN_VEXT_VX(vor_vx_h, 2)
1305 GEN_VEXT_VX(vor_vx_w, 4)
1306 GEN_VEXT_VX(vor_vx_d, 8)
1307 GEN_VEXT_VX(vxor_vx_b, 1)
1308 GEN_VEXT_VX(vxor_vx_h, 2)
1309 GEN_VEXT_VX(vxor_vx_w, 4)
1310 GEN_VEXT_VX(vxor_vx_d, 8)
1311 
1312 /* Vector Single-Width Bit Shift Instructions */
1313 #define DO_SLL(N, M)  (N << (M))
1314 #define DO_SRL(N, M)  (N >> (M))
1315 
1316 /* generate the helpers for shift instructions with two vector operands */
1317 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1318 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1319                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1320 {                                                                         \
1321     uint32_t vm = vext_vm(desc);                                          \
1322     uint32_t vl = env->vl;                                                \
1323     uint32_t esz = sizeof(TS1);                                           \
1324     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1325     uint32_t vta = vext_vta(desc);                                        \
1326     uint32_t vma = vext_vma(desc);                                        \
1327     uint32_t i;                                                           \
1328                                                                           \
1329     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1330                                                                           \
1331     for (i = env->vstart; i < vl; i++) {                                  \
1332         if (!vm && !vext_elem_mask(v0, i)) {                              \
1333             /* set masked-off elements to 1s */                           \
1334             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1335             continue;                                                     \
1336         }                                                                 \
1337         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1338         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1339         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1340     }                                                                     \
1341     env->vstart = 0;                                                      \
1342     /* set tail elements to 1s */                                         \
1343     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1344 }
1345 
1346 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1347 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1348 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1349 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1350 
1351 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1352 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1353 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1354 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1355 
1356 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1357 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1358 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1359 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
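/*
 * A note on the shift instantiations in this section: the MASK argument
 * keeps only the low log2(SEW) bits of the shift amount (e.g. 0x1f for
 * SEW=32), as the RVV spec requires.  The vsra_* variants reuse DO_SRL but
 * give TS2 a signed type, so ">>" behaves as an arithmetic right shift on
 * the two's-complement compilers QEMU targets, e.g.
 *     (int8_t)0x80 >> 2  ==  (int8_t)0xe0   (that is, -128 >> 2 == -32)
 */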
1360 
1361 /*
1362  * generate the helpers for shift instructions with one vector and one scalar operand
1363  */
1364 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1365 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1366                   void *vs2, CPURISCVState *env,            \
1367                   uint32_t desc)                            \
1368 {                                                           \
1369     uint32_t vm = vext_vm(desc);                            \
1370     uint32_t vl = env->vl;                                  \
1371     uint32_t esz = sizeof(TD);                              \
1372     uint32_t total_elems =                                  \
1373         vext_get_total_elems(env, desc, esz);               \
1374     uint32_t vta = vext_vta(desc);                          \
1375     uint32_t vma = vext_vma(desc);                          \
1376     uint32_t i;                                             \
1377                                                             \
1378     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1379                                                             \
1380     for (i = env->vstart; i < vl; i++) {                    \
1381         if (!vm && !vext_elem_mask(v0, i)) {                \
1382             /* set masked-off elements to 1s */             \
1383             vext_set_elems_1s(vd, vma, i * esz,             \
1384                               (i + 1) * esz);               \
1385             continue;                                       \
1386         }                                                   \
1387         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1388         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1389     }                                                       \
1390     env->vstart = 0;                                        \
1391     /* set tail elements to 1s */                           \
1392     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1393 }
1394 
1395 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1396 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1397 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1398 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1399 
1400 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1401 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1402 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1403 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1404 
1405 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1406 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1407 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1408 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1409 
1410 /* Vector Narrowing Integer Right Shift Instructions */
1411 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1412 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1413 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1414 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1415 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1416 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1417 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1418 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1419 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1420 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1421 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1422 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
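/*
 * For the narrowing shifts the two element types differ: the source element
 * is 2*SEW wide and the result SEW wide, so the shift-amount MASK spans
 * log2(2*SEW) bits (e.g. 0xf for the _b forms, which shift 16-bit source
 * elements).  Only the low SEW bits of the shifted value survive, because
 * the store goes through the narrow TS1/TD pointer.
 */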
1423 
1424 /* Vector Integer Comparison Instructions */
1425 #define DO_MSEQ(N, M) (N == M)
1426 #define DO_MSNE(N, M) (N != M)
1427 #define DO_MSLT(N, M) (N < M)
1428 #define DO_MSLE(N, M) (N <= M)
1429 #define DO_MSGT(N, M) (N > M)
1430 
1431 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1432 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1433                   CPURISCVState *env, uint32_t desc)          \
1434 {                                                             \
1435     uint32_t vm = vext_vm(desc);                              \
1436     uint32_t vl = env->vl;                                    \
1437     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1438     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1439     uint32_t vma = vext_vma(desc);                            \
1440     uint32_t i;                                               \
1441                                                               \
1442     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1443                                                               \
1444     for (i = env->vstart; i < vl; i++) {                      \
1445         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1446         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1447         if (!vm && !vext_elem_mask(v0, i)) {                  \
1448             /* set masked-off elements to 1s */               \
1449             if (vma) {                                        \
1450                 vext_set_elem_mask(vd, i, 1);                 \
1451             }                                                 \
1452             continue;                                         \
1453         }                                                     \
1454         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1455     }                                                         \
1456     env->vstart = 0;                                          \
1457     /*
1458      * mask destination register is always tail-agnostic
1459      * set tail elements to 1s
1460      */                                                       \
1461     if (vta_all_1s) {                                         \
1462         for (; i < total_elems; i++) {                        \
1463             vext_set_elem_mask(vd, i, 1);                     \
1464         }                                                     \
1465     }                                                         \
1466 }
1467 
1468 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1469 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1470 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1471 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1472 
1473 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1474 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1475 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1476 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1477 
1478 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1479 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1480 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1481 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1482 
1483 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1484 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1485 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1486 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1487 
1488 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1489 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1490 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1491 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1492 
1493 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1494 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1495 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1496 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1497 
1498 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1499 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1500                   CPURISCVState *env, uint32_t desc)                \
1501 {                                                                   \
1502     uint32_t vm = vext_vm(desc);                                    \
1503     uint32_t vl = env->vl;                                          \
1504     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1505     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1506     uint32_t vma = vext_vma(desc);                                  \
1507     uint32_t i;                                                     \
1508                                                                     \
1509     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1510                                                                     \
1511     for (i = env->vstart; i < vl; i++) {                            \
1512         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1513         if (!vm && !vext_elem_mask(v0, i)) {                        \
1514             /* set masked-off elements to 1s */                     \
1515             if (vma) {                                              \
1516                 vext_set_elem_mask(vd, i, 1);                       \
1517             }                                                       \
1518             continue;                                               \
1519         }                                                           \
1520         vext_set_elem_mask(vd, i,                                   \
1521                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1522     }                                                               \
1523     env->vstart = 0;                                                \
1524     /*
1525      * mask destination register is always tail-agnostic
1526      * set tail elements to 1s
1527      */                                                             \
1528     if (vta_all_1s) {                                               \
1529         for (; i < total_elems; i++) {                              \
1530             vext_set_elem_mask(vd, i, 1);                           \
1531         }                                                           \
1532     }                                                               \
1533 }
1534 
1535 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1536 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1537 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1538 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1539 
1540 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1541 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1542 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1543 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1544 
1545 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1546 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1547 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1548 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1549 
1550 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1551 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1552 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1553 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1554 
1555 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1556 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1557 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1558 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1559 
1560 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1561 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1562 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1563 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1564 
1565 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1566 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1567 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1568 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1569 
1570 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1571 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1572 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1573 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1574 
1575 /* Vector Integer Min/Max Instructions */
1576 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1577 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1578 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1579 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1580 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1581 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1582 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1583 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1584 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1585 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1586 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1587 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1588 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1589 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1590 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1591 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1592 GEN_VEXT_VV(vminu_vv_b, 1)
1593 GEN_VEXT_VV(vminu_vv_h, 2)
1594 GEN_VEXT_VV(vminu_vv_w, 4)
1595 GEN_VEXT_VV(vminu_vv_d, 8)
1596 GEN_VEXT_VV(vmin_vv_b, 1)
1597 GEN_VEXT_VV(vmin_vv_h, 2)
1598 GEN_VEXT_VV(vmin_vv_w, 4)
1599 GEN_VEXT_VV(vmin_vv_d, 8)
1600 GEN_VEXT_VV(vmaxu_vv_b, 1)
1601 GEN_VEXT_VV(vmaxu_vv_h, 2)
1602 GEN_VEXT_VV(vmaxu_vv_w, 4)
1603 GEN_VEXT_VV(vmaxu_vv_d, 8)
1604 GEN_VEXT_VV(vmax_vv_b, 1)
1605 GEN_VEXT_VV(vmax_vv_h, 2)
1606 GEN_VEXT_VV(vmax_vv_w, 4)
1607 GEN_VEXT_VV(vmax_vv_d, 8)
1608 
1609 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1610 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1611 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1612 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1613 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1614 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1615 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1616 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1617 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1618 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1619 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1620 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1621 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1622 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1623 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1624 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1625 GEN_VEXT_VX(vminu_vx_b, 1)
1626 GEN_VEXT_VX(vminu_vx_h, 2)
1627 GEN_VEXT_VX(vminu_vx_w, 4)
1628 GEN_VEXT_VX(vminu_vx_d, 8)
1629 GEN_VEXT_VX(vmin_vx_b, 1)
1630 GEN_VEXT_VX(vmin_vx_h, 2)
1631 GEN_VEXT_VX(vmin_vx_w, 4)
1632 GEN_VEXT_VX(vmin_vx_d, 8)
1633 GEN_VEXT_VX(vmaxu_vx_b, 1)
1634 GEN_VEXT_VX(vmaxu_vx_h, 2)
1635 GEN_VEXT_VX(vmaxu_vx_w, 4)
1636 GEN_VEXT_VX(vmaxu_vx_d, 8)
1637 GEN_VEXT_VX(vmax_vx_b, 1)
1638 GEN_VEXT_VX(vmax_vx_h, 2)
1639 GEN_VEXT_VX(vmax_vx_w, 4)
1640 GEN_VEXT_VX(vmax_vx_d, 8)
1641 
1642 /* Vector Single-Width Integer Multiply Instructions */
1643 #define DO_MUL(N, M) (N * M)
1644 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1645 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1646 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1647 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1648 GEN_VEXT_VV(vmul_vv_b, 1)
1649 GEN_VEXT_VV(vmul_vv_h, 2)
1650 GEN_VEXT_VV(vmul_vv_w, 4)
1651 GEN_VEXT_VV(vmul_vv_d, 8)
1652 
1653 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1654 {
1655     return (int16_t)s2 * (int16_t)s1 >> 8;
1656 }
1657 
1658 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1659 {
1660     return (int32_t)s2 * (int32_t)s1 >> 16;
1661 }
1662 
1663 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1664 {
1665     return (int64_t)s2 * (int64_t)s1 >> 32;
1666 }
1667 
1668 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1669 {
1670     uint64_t hi_64, lo_64;
1671 
1672     muls64(&lo_64, &hi_64, s1, s2);
1673     return hi_64;
1674 }
1675 
1676 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1677 {
1678     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1679 }
1680 
1681 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1682 {
1683     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1684 }
1685 
1686 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1687 {
1688     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1689 }
1690 
1691 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1692 {
1693     uint64_t hi_64, lo_64;
1694 
1695     mulu64(&lo_64, &hi_64, s2, s1);
1696     return hi_64;
1697 }
1698 
1699 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1700 {
1701     return (int16_t)s2 * (uint16_t)s1 >> 8;
1702 }
1703 
1704 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1705 {
1706     return (int32_t)s2 * (uint32_t)s1 >> 16;
1707 }
1708 
1709 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1710 {
1711     return (int64_t)s2 * (uint64_t)s1 >> 32;
1712 }
1713 
1714 /*
1715  * Signed-by-unsigned high-half multiply via mulu64():
1716  *
1717  * Let  A = signed operand,
1718  *      B = unsigned operand,
1719  *      P = mulu64(A, B), the unsigned product of A's bit pattern and B.
1720  *
1721  * If A >= 0, the signed product SP is simply P.
1722  *
1723  * If A < 0, A's bit pattern read as an unsigned value is A + 2 ** 64, so
1724  *      P  = (A + 2 ** 64) * B
1725  *         = A * B + 2 ** 64 * B
1726  * and therefore
1727  *      SP = A * B = P - 2 ** 64 * B
1728  *
1729  * Subtracting 2 ** 64 * B only affects the upper 64 bits, hence:
1730  *      HI_P -= (A < 0 ? B : 0)
1731  */
1732 
1733 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1734 {
1735     uint64_t hi_64, lo_64;
1736 
1737     mulu64(&lo_64, &hi_64, s2, s1);
1738 
1739     hi_64 -= s2 < 0 ? s1 : 0;
1740     return hi_64;
1741 }
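/*
 * Worked example of the correction above, scaled down to 8-bit operands:
 *   A = -2 (bit pattern 0xfe), B = 3
 *   unsigned product   P   = 0xfe * 3 = 0x02fa   (HI = 0x02, LO = 0xfa)
 *   signed product   A * B = -6       = 0xfffa   (HI = 0xff, LO = 0xfa)
 *   HI_P - B = 0x02 - 0x03 = 0xff, which is the signed high half.
 */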
1742 
1743 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1744 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1745 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1746 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1747 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1748 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1749 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1750 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1751 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1752 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1753 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1754 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1755 GEN_VEXT_VV(vmulh_vv_b, 1)
1756 GEN_VEXT_VV(vmulh_vv_h, 2)
1757 GEN_VEXT_VV(vmulh_vv_w, 4)
1758 GEN_VEXT_VV(vmulh_vv_d, 8)
1759 GEN_VEXT_VV(vmulhu_vv_b, 1)
1760 GEN_VEXT_VV(vmulhu_vv_h, 2)
1761 GEN_VEXT_VV(vmulhu_vv_w, 4)
1762 GEN_VEXT_VV(vmulhu_vv_d, 8)
1763 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1764 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1765 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1766 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1767 
1768 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1769 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1770 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1771 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1772 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1773 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1774 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1775 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1776 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1777 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1778 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1779 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1780 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1781 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1782 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1783 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1784 GEN_VEXT_VX(vmul_vx_b, 1)
1785 GEN_VEXT_VX(vmul_vx_h, 2)
1786 GEN_VEXT_VX(vmul_vx_w, 4)
1787 GEN_VEXT_VX(vmul_vx_d, 8)
1788 GEN_VEXT_VX(vmulh_vx_b, 1)
1789 GEN_VEXT_VX(vmulh_vx_h, 2)
1790 GEN_VEXT_VX(vmulh_vx_w, 4)
1791 GEN_VEXT_VX(vmulh_vx_d, 8)
1792 GEN_VEXT_VX(vmulhu_vx_b, 1)
1793 GEN_VEXT_VX(vmulhu_vx_h, 2)
1794 GEN_VEXT_VX(vmulhu_vx_w, 4)
1795 GEN_VEXT_VX(vmulhu_vx_d, 8)
1796 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1797 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1798 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1799 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1800 
1801 /* Vector Integer Divide Instructions */
1802 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1803 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1804 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1805         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1806 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1807         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
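/*
 * These match the RISC-V corner-case rules for division:
 *   x / 0    -> all ones (-1)       x % 0    -> x
 *   MIN / -1 -> MIN (overflow)      MIN % -1 -> 0
 * (N == -N) is nonzero only for 0 and the most-negative value, so it is a
 * cheap overflow test; for N == 0 the special-cased result equals N / M
 * (and N % M) anyway.
 */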
1808 
1809 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1810 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1811 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1812 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1813 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1814 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1815 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1816 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1817 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1818 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1819 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1820 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1821 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1822 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1823 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1824 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1825 GEN_VEXT_VV(vdivu_vv_b, 1)
1826 GEN_VEXT_VV(vdivu_vv_h, 2)
1827 GEN_VEXT_VV(vdivu_vv_w, 4)
1828 GEN_VEXT_VV(vdivu_vv_d, 8)
1829 GEN_VEXT_VV(vdiv_vv_b, 1)
1830 GEN_VEXT_VV(vdiv_vv_h, 2)
1831 GEN_VEXT_VV(vdiv_vv_w, 4)
1832 GEN_VEXT_VV(vdiv_vv_d, 8)
1833 GEN_VEXT_VV(vremu_vv_b, 1)
1834 GEN_VEXT_VV(vremu_vv_h, 2)
1835 GEN_VEXT_VV(vremu_vv_w, 4)
1836 GEN_VEXT_VV(vremu_vv_d, 8)
1837 GEN_VEXT_VV(vrem_vv_b, 1)
1838 GEN_VEXT_VV(vrem_vv_h, 2)
1839 GEN_VEXT_VV(vrem_vv_w, 4)
1840 GEN_VEXT_VV(vrem_vv_d, 8)
1841 
1842 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1843 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1844 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1845 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1846 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1847 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1848 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1849 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1850 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1851 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1852 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1853 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1854 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1855 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1856 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1857 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1858 GEN_VEXT_VX(vdivu_vx_b, 1)
1859 GEN_VEXT_VX(vdivu_vx_h, 2)
1860 GEN_VEXT_VX(vdivu_vx_w, 4)
1861 GEN_VEXT_VX(vdivu_vx_d, 8)
1862 GEN_VEXT_VX(vdiv_vx_b, 1)
1863 GEN_VEXT_VX(vdiv_vx_h, 2)
1864 GEN_VEXT_VX(vdiv_vx_w, 4)
1865 GEN_VEXT_VX(vdiv_vx_d, 8)
1866 GEN_VEXT_VX(vremu_vx_b, 1)
1867 GEN_VEXT_VX(vremu_vx_h, 2)
1868 GEN_VEXT_VX(vremu_vx_w, 4)
1869 GEN_VEXT_VX(vremu_vx_d, 8)
1870 GEN_VEXT_VX(vrem_vx_b, 1)
1871 GEN_VEXT_VX(vrem_vx_h, 2)
1872 GEN_VEXT_VX(vrem_vx_w, 4)
1873 GEN_VEXT_VX(vrem_vx_d, 8)
1874 
1875 /* Vector Widening Integer Multiply Instructions */
1876 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1877 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1878 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1879 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1880 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1881 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1882 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1883 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1884 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1885 GEN_VEXT_VV(vwmul_vv_b, 2)
1886 GEN_VEXT_VV(vwmul_vv_h, 4)
1887 GEN_VEXT_VV(vwmul_vv_w, 8)
1888 GEN_VEXT_VV(vwmulu_vv_b, 2)
1889 GEN_VEXT_VV(vwmulu_vv_h, 4)
1890 GEN_VEXT_VV(vwmulu_vv_w, 8)
1891 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1892 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1893 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1894 
1895 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1896 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1897 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1898 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1899 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1900 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1901 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1902 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1903 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1904 GEN_VEXT_VX(vwmul_vx_b, 2)
1905 GEN_VEXT_VX(vwmul_vx_h, 4)
1906 GEN_VEXT_VX(vwmul_vx_w, 8)
1907 GEN_VEXT_VX(vwmulu_vx_b, 2)
1908 GEN_VEXT_VX(vwmulu_vx_h, 4)
1909 GEN_VEXT_VX(vwmulu_vx_w, 8)
1910 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1911 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1912 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1913 
1914 /* Vector Single-Width Integer Multiply-Add Instructions */
1915 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1916 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1917 {                                                                  \
1918     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1919     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1920     TD d = *((TD *)vd + HD(i));                                    \
1921     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1922 }
1923 
1924 #define DO_MACC(N, M, D) (M * N + D)
1925 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1926 #define DO_MADD(N, M, D) (M * D + N)
1927 #define DO_NMSUB(N, M, D) (-(M * D) + N)
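/*
 * In OP(s2, s1, d) above: N = vs2, M = vs1 (rs1 for the _vx forms), D = vd.
 * MACC/NMSAC multiply vs1 by vs2 and accumulate into vd, while MADD/NMSUB
 * multiply vs1 by vd and use vs2 as the addend, per the spec:
 *   vmacc:  vd = +(vs1 * vs2) + vd     vmadd:  vd = +(vs1 * vd) + vs2
 *   vnmsac: vd = -(vs1 * vs2) + vd     vnmsub: vd = -(vs1 * vd) + vs2
 */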
1928 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1929 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1930 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1931 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1932 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1933 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1934 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1935 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1936 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1937 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1938 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1939 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1940 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1941 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1942 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1943 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1944 GEN_VEXT_VV(vmacc_vv_b, 1)
1945 GEN_VEXT_VV(vmacc_vv_h, 2)
1946 GEN_VEXT_VV(vmacc_vv_w, 4)
1947 GEN_VEXT_VV(vmacc_vv_d, 8)
1948 GEN_VEXT_VV(vnmsac_vv_b, 1)
1949 GEN_VEXT_VV(vnmsac_vv_h, 2)
1950 GEN_VEXT_VV(vnmsac_vv_w, 4)
1951 GEN_VEXT_VV(vnmsac_vv_d, 8)
1952 GEN_VEXT_VV(vmadd_vv_b, 1)
1953 GEN_VEXT_VV(vmadd_vv_h, 2)
1954 GEN_VEXT_VV(vmadd_vv_w, 4)
1955 GEN_VEXT_VV(vmadd_vv_d, 8)
1956 GEN_VEXT_VV(vnmsub_vv_b, 1)
1957 GEN_VEXT_VV(vnmsub_vv_h, 2)
1958 GEN_VEXT_VV(vnmsub_vv_w, 4)
1959 GEN_VEXT_VV(vnmsub_vv_d, 8)
1960 
1961 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1962 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1963 {                                                                   \
1964     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1965     TD d = *((TD *)vd + HD(i));                                     \
1966     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1967 }
1968 
1969 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1970 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1971 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1972 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1973 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1974 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1975 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1976 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1977 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1978 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1979 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1980 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1981 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1982 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1983 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1984 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1985 GEN_VEXT_VX(vmacc_vx_b, 1)
1986 GEN_VEXT_VX(vmacc_vx_h, 2)
1987 GEN_VEXT_VX(vmacc_vx_w, 4)
1988 GEN_VEXT_VX(vmacc_vx_d, 8)
1989 GEN_VEXT_VX(vnmsac_vx_b, 1)
1990 GEN_VEXT_VX(vnmsac_vx_h, 2)
1991 GEN_VEXT_VX(vnmsac_vx_w, 4)
1992 GEN_VEXT_VX(vnmsac_vx_d, 8)
1993 GEN_VEXT_VX(vmadd_vx_b, 1)
1994 GEN_VEXT_VX(vmadd_vx_h, 2)
1995 GEN_VEXT_VX(vmadd_vx_w, 4)
1996 GEN_VEXT_VX(vmadd_vx_d, 8)
1997 GEN_VEXT_VX(vnmsub_vx_b, 1)
1998 GEN_VEXT_VX(vnmsub_vx_h, 2)
1999 GEN_VEXT_VX(vnmsub_vx_w, 4)
2000 GEN_VEXT_VX(vnmsub_vx_d, 8)
2001 
2002 /* Vector Widening Integer Multiply-Add Instructions */
2003 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2004 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2005 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2006 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2007 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2008 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2009 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2010 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2011 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2012 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2013 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2014 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2015 GEN_VEXT_VV(vwmacc_vv_b, 2)
2016 GEN_VEXT_VV(vwmacc_vv_h, 4)
2017 GEN_VEXT_VV(vwmacc_vv_w, 8)
2018 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2019 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2020 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2021 
2022 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2023 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2024 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2025 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2026 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2027 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2028 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2029 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2030 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2031 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2032 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2033 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2034 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2035 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2036 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2037 GEN_VEXT_VX(vwmacc_vx_b, 2)
2038 GEN_VEXT_VX(vwmacc_vx_h, 4)
2039 GEN_VEXT_VX(vwmacc_vx_w, 8)
2040 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2041 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2042 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2043 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2044 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2045 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2046 
2047 /* Vector Integer Merge and Move Instructions */
2048 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2049 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2050                   uint32_t desc)                                     \
2051 {                                                                    \
2052     uint32_t vl = env->vl;                                           \
2053     uint32_t esz = sizeof(ETYPE);                                    \
2054     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2055     uint32_t vta = vext_vta(desc);                                   \
2056     uint32_t i;                                                      \
2057                                                                      \
2058     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2059                                                                      \
2060     for (i = env->vstart; i < vl; i++) {                             \
2061         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2062         *((ETYPE *)vd + H(i)) = s1;                                  \
2063     }                                                                \
2064     env->vstart = 0;                                                 \
2065     /* set tail elements to 1s */                                    \
2066     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2067 }
2068 
2069 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2070 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2071 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2072 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2073 
2074 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2075 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2076                   uint32_t desc)                                     \
2077 {                                                                    \
2078     uint32_t vl = env->vl;                                           \
2079     uint32_t esz = sizeof(ETYPE);                                    \
2080     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2081     uint32_t vta = vext_vta(desc);                                   \
2082     uint32_t i;                                                      \
2083                                                                      \
2084     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2085                                                                      \
2086     for (i = env->vstart; i < vl; i++) {                             \
2087         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2088     }                                                                \
2089     env->vstart = 0;                                                 \
2090     /* set tail elements to 1s */                                    \
2091     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2092 }
2093 
2094 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2095 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2096 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2097 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2098 
2099 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2100 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2101                   CPURISCVState *env, uint32_t desc)                 \
2102 {                                                                    \
2103     uint32_t vl = env->vl;                                           \
2104     uint32_t esz = sizeof(ETYPE);                                    \
2105     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2106     uint32_t vta = vext_vta(desc);                                   \
2107     uint32_t i;                                                      \
2108                                                                      \
2109     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2110                                                                      \
2111     for (i = env->vstart; i < vl; i++) {                             \
2112         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2113         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2114     }                                                                \
2115     env->vstart = 0;                                                 \
2116     /* set tail elements to 1s */                                    \
2117     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2118 }
2119 
2120 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2121 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2122 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2123 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2124 
2125 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2126 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2127                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2128 {                                                                    \
2129     uint32_t vl = env->vl;                                           \
2130     uint32_t esz = sizeof(ETYPE);                                    \
2131     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2132     uint32_t vta = vext_vta(desc);                                   \
2133     uint32_t i;                                                      \
2134                                                                      \
2135     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2136                                                                      \
2137     for (i = env->vstart; i < vl; i++) {                             \
2138         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2139         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2140                    (ETYPE)(target_long)s1);                          \
2141         *((ETYPE *)vd + H(i)) = d;                                   \
2142     }                                                                \
2143     env->vstart = 0;                                                 \
2144     /* set tail elements to 1s */                                    \
2145     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2146 }
2147 
2148 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2149 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2150 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2151 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2152 
2153 /*
2154  * Vector Fixed-Point Arithmetic Instructions
2155  */
2156 
2157 /* Vector Single-Width Saturating Add and Subtract */
2158 
2159 /*
2160  * Fixed-point instructions take a rounding mode and may saturate, so
2161  * define common macros for fixed point here.
2162  */
2163 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2164                           CPURISCVState *env, int vxrm);
2165 
2166 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2167 static inline void                                                  \
2168 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2169           CPURISCVState *env, int vxrm)                             \
2170 {                                                                   \
2171     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2172     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2173     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2174 }
2175 
2176 static inline void
2177 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2178              CPURISCVState *env,
2179              uint32_t vl, uint32_t vm, int vxrm,
2180              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2181 {
2182     for (uint32_t i = env->vstart; i < vl; i++) {
2183         if (!vm && !vext_elem_mask(v0, i)) {
2184             /* set masked-off elements to 1s */
2185             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2186             continue;
2187         }
2188         fn(vd, vs1, vs2, i, env, vxrm);
2189     }
2190     env->vstart = 0;
2191 }
2192 
2193 static inline void
2194 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2195              CPURISCVState *env,
2196              uint32_t desc,
2197              opivv2_rm_fn *fn, uint32_t esz)
2198 {
2199     uint32_t vm = vext_vm(desc);
2200     uint32_t vl = env->vl;
2201     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2202     uint32_t vta = vext_vta(desc);
2203     uint32_t vma = vext_vma(desc);
2204 
2205     VSTART_CHECK_EARLY_EXIT(env, vl);
2206 
2207     switch (env->vxrm) {
2208     case 0: /* rnu */
2209         vext_vv_rm_1(vd, v0, vs1, vs2,
2210                      env, vl, vm, 0, fn, vma, esz);
2211         break;
2212     case 1: /* rne */
2213         vext_vv_rm_1(vd, v0, vs1, vs2,
2214                      env, vl, vm, 1, fn, vma, esz);
2215         break;
2216     case 2: /* rdn */
2217         vext_vv_rm_1(vd, v0, vs1, vs2,
2218                      env, vl, vm, 2, fn, vma, esz);
2219         break;
2220     default: /* rod */
2221         vext_vv_rm_1(vd, v0, vs1, vs2,
2222                      env, vl, vm, 3, fn, vma, esz);
2223         break;
2224     }
2225     /* set tail elements to 1s */
2226     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2227 }
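/*
 * The switch above duplicates the vext_vv_rm_1() call so that each arm
 * passes the rounding mode as a literal constant; presumably this lets the
 * compiler fold vxrm into the inlined per-element loop instead of
 * re-reading env->vxrm for every element.
 */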
2228 
2229 /* generate helpers for fixed point instructions with OPIVV format */
2230 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2231 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2232                   CPURISCVState *env, uint32_t desc)            \
2233 {                                                               \
2234     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2235                  do_##NAME, ESZ);                               \
2236 }
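/*
 * For illustration, GEN_VEXT_VV_RM(vsaddu_vv_b, 1) expands (roughly, modulo
 * the HELPER() name decoration) to:
 *
 *   void helper_vsaddu_vv_b(void *vd, void *v0, void *vs1, void *vs2,
 *                           CPURISCVState *env, uint32_t desc)
 *   {
 *       vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_vsaddu_vv_b, 1);
 *   }
 *
 * where do_vsaddu_vv_b is generated by the matching
 * RVVCALL(OPIVV2_RM, vsaddu_vv_b, ...) line below and wraps saddu8().
 */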
2237 
2238 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2239                              uint8_t b)
2240 {
2241     uint8_t res = a + b;
2242     if (res < a) {
2243         res = UINT8_MAX;
2244         env->vxsat = 0x1;
2245     }
2246     return res;
2247 }
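/*
 * If the unsigned addition wraps, the result is smaller than both operands,
 * so "res < a" is a complete overflow test.  For illustration:
 * 0xf0 + 0x20 wraps to 0x10; 0x10 < 0xf0, so the result saturates to
 * UINT8_MAX and vxsat is set.
 */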
2248 
2249 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2250                                uint16_t b)
2251 {
2252     uint16_t res = a + b;
2253     if (res < a) {
2254         res = UINT16_MAX;
2255         env->vxsat = 0x1;
2256     }
2257     return res;
2258 }
2259 
2260 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2261                                uint32_t b)
2262 {
2263     uint32_t res = a + b;
2264     if (res < a) {
2265         res = UINT32_MAX;
2266         env->vxsat = 0x1;
2267     }
2268     return res;
2269 }
2270 
2271 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2272                                uint64_t b)
2273 {
2274     uint64_t res = a + b;
2275     if (res < a) {
2276         res = UINT64_MAX;
2277         env->vxsat = 0x1;
2278     }
2279     return res;
2280 }
2281 
2282 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2283 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2284 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2285 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2286 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2287 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2288 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2289 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2290 
2291 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2292                           CPURISCVState *env, int vxrm);
2293 
2294 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2295 static inline void                                                  \
2296 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2297           CPURISCVState *env, int vxrm)                             \
2298 {                                                                   \
2299     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2300     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2301 }
2302 
2303 static inline void
2304 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2305              CPURISCVState *env,
2306              uint32_t vl, uint32_t vm, int vxrm,
2307              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2308 {
2309     for (uint32_t i = env->vstart; i < vl; i++) {
2310         if (!vm && !vext_elem_mask(v0, i)) {
2311             /* set masked-off elements to 1s */
2312             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2313             continue;
2314         }
2315         fn(vd, s1, vs2, i, env, vxrm);
2316     }
2317     env->vstart = 0;
2318 }
2319 
2320 static inline void
2321 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2322              CPURISCVState *env,
2323              uint32_t desc,
2324              opivx2_rm_fn *fn, uint32_t esz)
2325 {
2326     uint32_t vm = vext_vm(desc);
2327     uint32_t vl = env->vl;
2328     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2329     uint32_t vta = vext_vta(desc);
2330     uint32_t vma = vext_vma(desc);
2331 
2332     VSTART_CHECK_EARLY_EXIT(env, vl);
2333 
2334     switch (env->vxrm) {
2335     case 0: /* rnu */
2336         vext_vx_rm_1(vd, v0, s1, vs2,
2337                      env, vl, vm, 0, fn, vma, esz);
2338         break;
2339     case 1: /* rne */
2340         vext_vx_rm_1(vd, v0, s1, vs2,
2341                      env, vl, vm, 1, fn, vma, esz);
2342         break;
2343     case 2: /* rdn */
2344         vext_vx_rm_1(vd, v0, s1, vs2,
2345                      env, vl, vm, 2, fn, vma, esz);
2346         break;
2347     default: /* rod */
2348         vext_vx_rm_1(vd, v0, s1, vs2,
2349                      env, vl, vm, 3, fn, vma, esz);
2350         break;
2351     }
2352     /* set tail elements to 1s */
2353     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2354 }
2355 
2356 /* generate helpers for fixed point instructions with OPIVX format */
2357 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2358 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2359                   void *vs2, CPURISCVState *env,          \
2360                   uint32_t desc)                          \
2361 {                                                         \
2362     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2363                  do_##NAME, ESZ);                         \
2364 }
2365 
2366 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2367 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2368 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2369 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2370 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2371 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2372 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2373 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2374 
2375 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2376 {
2377     int8_t res = a + b;
2378     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2379         res = a > 0 ? INT8_MAX : INT8_MIN;
2380         env->vxsat = 0x1;
2381     }
2382     return res;
2383 }
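/*
 * Two's-complement addition overflows exactly when both operands have the
 * same sign and the result's sign differs; (res ^ a) & (res ^ b) has its
 * sign bit set precisely in that case, which the INT8_MIN mask isolates.
 * For illustration: 0x7f + 0x01 wraps to 0x80, and
 * (0x80 ^ 0x7f) & (0x80 ^ 0x01) & 0x80 == 0x80, so the result saturates to
 * INT8_MAX.
 */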
2384 
2385 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2386                              int16_t b)
2387 {
2388     int16_t res = a + b;
2389     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2390         res = a > 0 ? INT16_MAX : INT16_MIN;
2391         env->vxsat = 0x1;
2392     }
2393     return res;
2394 }
2395 
2396 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2397                              int32_t b)
2398 {
2399     int32_t res = a + b;
2400     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2401         res = a > 0 ? INT32_MAX : INT32_MIN;
2402         env->vxsat = 0x1;
2403     }
2404     return res;
2405 }
2406 
2407 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2408                              int64_t b)
2409 {
2410     int64_t res = a + b;
2411     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2412         res = a > 0 ? INT64_MAX : INT64_MIN;
2413         env->vxsat = 0x1;
2414     }
2415     return res;
2416 }
2417 
2418 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2419 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2420 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2421 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2422 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2423 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2424 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2425 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2426 
2427 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2428 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2429 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2430 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2431 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2432 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2433 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2434 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2435 
2436 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2437                              uint8_t b)
2438 {
2439     uint8_t res = a - b;
2440     if (res > a) {
2441         res = 0;
2442         env->vxsat = 0x1;
2443     }
2444     return res;
2445 }
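/*
 * Mirrors saddu8: an unsigned subtraction borrows exactly when b > a, in
 * which case the wrapped result exceeds a ("res > a" is then always true),
 * and the saturated result is 0.
 */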
2446 
2447 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2448                                uint16_t b)
2449 {
2450     uint16_t res = a - b;
2451     if (res > a) {
2452         res = 0;
2453         env->vxsat = 0x1;
2454     }
2455     return res;
2456 }
2457 
2458 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2459                                uint32_t b)
2460 {
2461     uint32_t res = a - b;
2462     if (res > a) {
2463         res = 0;
2464         env->vxsat = 0x1;
2465     }
2466     return res;
2467 }
2468 
2469 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2470                                uint64_t b)
2471 {
2472     uint64_t res = a - b;
2473     if (res > a) {
2474         res = 0;
2475         env->vxsat = 0x1;
2476     }
2477     return res;
2478 }
2479 
2480 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2481 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2482 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2483 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2484 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2485 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2486 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2487 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2488 
2489 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2490 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2491 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2492 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2493 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2494 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2495 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2496 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2497 
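/*
 * Signed saturating subtract: overflow is only possible when the operands
 * have different signs and the result's sign differs from the minuend's,
 * which is what (res ^ a) & (a ^ b) & INT*_MIN tests.  On overflow the
 * result saturates towards the sign of 'a' and vxsat is set.
 */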
2498 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2499 {
2500     int8_t res = a - b;
2501     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2502         res = a >= 0 ? INT8_MAX : INT8_MIN;
2503         env->vxsat = 0x1;
2504     }
2505     return res;
2506 }
2507 
2508 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2509                              int16_t b)
2510 {
2511     int16_t res = a - b;
2512     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2513         res = a >= 0 ? INT16_MAX : INT16_MIN;
2514         env->vxsat = 0x1;
2515     }
2516     return res;
2517 }
2518 
2519 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2520                              int32_t b)
2521 {
2522     int32_t res = a - b;
2523     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2524         res = a >= 0 ? INT32_MAX : INT32_MIN;
2525         env->vxsat = 0x1;
2526     }
2527     return res;
2528 }
2529 
2530 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2531                              int64_t b)
2532 {
2533     int64_t res = a - b;
2534     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2535         res = a >= 0 ? INT64_MAX : INT64_MIN;
2536         env->vxsat = 0x1;
2537     }
2538     return res;
2539 }
2540 
2541 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2542 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2543 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2544 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2545 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2546 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2547 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2548 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2549 
2550 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2551 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2552 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2553 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2554 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2555 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2556 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2557 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2558 
2559 /* Vector Single-Width Averaging Add and Subtract */
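/*
 * get_round() returns the increment (0 or 1) to add after shifting 'v'
 * right by 'shift' bits, according to the fixed-point rounding mode vxrm:
 *   0 (rnu) round-to-nearest-up, 1 (rne) round-to-nearest-even,
 *   2 (rdn) round-down/truncate, 3 (rod) round-to-odd.
 * E.g. v = 0b1011, shift = 2, rnu: bit 1 of v is set, so the rounded
 * result is (v >> 2) + 1 = 0b11.
 */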
2560 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2561 {
2562     uint8_t d = extract64(v, shift, 1);
2563     uint8_t d1;
2564     uint64_t D1, D2;
2565 
2566     if (shift == 0 || shift > 64) {
2567         return 0;
2568     }
2569 
2570     d1 = extract64(v, shift - 1, 1);
2571     D1 = extract64(v, 0, shift);
2572     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2573         return d1;
2574     } else if (vxrm == 1) { /* round-to-nearest-even */
2575         if (shift > 1) {
2576             D2 = extract64(v, 0, shift - 1);
2577             return d1 & ((D2 != 0) | d);
2578         } else {
2579             return d1 & d;
2580         }
2581     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2582         return !d & (D1 != 0);
2583     }
2584     return 0; /* round-down (truncate) */
2585 }
2586 
2587 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2588                              int32_t b)
2589 {
2590     int64_t res = (int64_t)a + b;
2591     uint8_t round = get_round(vxrm, res, 1);
2592 
2593     return (res >> 1) + round;
2594 }
2595 
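/*
 * For the 64-bit averaging add there is no wider type to hold the sum, so
 * the (possibly lost) bit 64 is reconstructed from the signed-overflow
 * condition and folded back into the shifted result below.
 */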
2596 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2597                              int64_t b)
2598 {
2599     int64_t res = a + b;
2600     uint8_t round = get_round(vxrm, res, 1);
2601     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2602 
2603     /* With signed overflow, bit 64 is inverse of bit 63. */
2604     return ((res >> 1) ^ over) + round;
2605 }
2606 
2607 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2608 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2609 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2610 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2611 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2612 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2613 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2614 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2615 
2616 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2617 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2618 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2619 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2620 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2621 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2622 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2623 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2624 
2625 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2626                                uint32_t a, uint32_t b)
2627 {
2628     uint64_t res = (uint64_t)a + b;
2629     uint8_t round = get_round(vxrm, res, 1);
2630 
2631     return (res >> 1) + round;
2632 }
2633 
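/*
 * Unsigned 64-bit averaging add: the carry out of bit 63 is recovered from
 * the wraparound test (res < a) and ORed back in as bit 63 of the shifted
 * result.
 */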
2634 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2635                                uint64_t a, uint64_t b)
2636 {
2637     uint64_t res = a + b;
2638     uint8_t round = get_round(vxrm, res, 1);
2639     uint64_t over = (uint64_t)(res < a) << 63;
2640 
2641     return ((res >> 1) | over) + round;
2642 }
2643 
2644 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2645 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2646 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2647 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2648 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2649 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2650 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2651 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2652 
2653 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2654 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2655 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2656 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2657 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2658 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2659 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2660 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2661 
2662 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2663                              int32_t b)
2664 {
2665     int64_t res = (int64_t)a - b;
2666     uint8_t round = get_round(vxrm, res, 1);
2667 
2668     return (res >> 1) + round;
2669 }
2670 
2671 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2672                              int64_t b)
2673 {
2674     int64_t res = (int64_t)a - b;
2675     uint8_t round = get_round(vxrm, res, 1);
2676     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2677 
2678     /* With signed overflow, bit 64 is inverse of bit 63. */
2679     return ((res >> 1) ^ over) + round;
2680 }
2681 
2682 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2683 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2684 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2685 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2686 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2687 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2688 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2689 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2690 
2691 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2692 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2693 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2694 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2695 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2696 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2697 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2698 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2699 
2700 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2701                                uint32_t a, uint32_t b)
2702 {
2703     int64_t res = (int64_t)a - b;
2704     uint8_t round = get_round(vxrm, res, 1);
2705 
2706     return (res >> 1) + round;
2707 }
2708 
2709 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2710                                uint64_t a, uint64_t b)
2711 {
2712     uint64_t res = (uint64_t)a - b;
2713     uint8_t round = get_round(vxrm, res, 1);
2714     uint64_t over = (uint64_t)(res > a) << 63;
2715 
2716     return ((res >> 1) | over) + round;
2717 }
2718 
2719 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2720 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2721 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2722 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2723 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2724 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2725 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2726 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2727 
2728 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2729 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2730 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2731 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2732 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2733 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2734 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2735 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2736 
2737 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
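/*
 * vsmul computes (a * b) >> (SEW - 1) with rounding, i.e. a saturating
 * signed fractional multiply.  The only product that can overflow the
 * destination is INT_MIN * INT_MIN ("-1.0 * -1.0"), which saturates to
 * INT_MAX with vxsat set.
 * E.g. for SEW = 8, 0x40 * 0x40 (0.5 * 0.5) yields 0x20 (0.25).
 */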
2738 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2739 {
2740     uint8_t round;
2741     int16_t res;
2742 
2743     res = (int16_t)a * (int16_t)b;
2744     round = get_round(vxrm, res, 7);
2745     res = (res >> 7) + round;
2746 
2747     if (res > INT8_MAX) {
2748         env->vxsat = 0x1;
2749         return INT8_MAX;
2750     } else if (res < INT8_MIN) {
2751         env->vxsat = 0x1;
2752         return INT8_MIN;
2753     } else {
2754         return res;
2755     }
2756 }
2757 
2758 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2759 {
2760     uint8_t round;
2761     int32_t res;
2762 
2763     res = (int32_t)a * (int32_t)b;
2764     round = get_round(vxrm, res, 15);
2765     res = (res >> 15) + round;
2766 
2767     if (res > INT16_MAX) {
2768         env->vxsat = 0x1;
2769         return INT16_MAX;
2770     } else if (res < INT16_MIN) {
2771         env->vxsat = 0x1;
2772         return INT16_MIN;
2773     } else {
2774         return res;
2775     }
2776 }
2777 
2778 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2779 {
2780     uint8_t round;
2781     int64_t res;
2782 
2783     res = (int64_t)a * (int64_t)b;
2784     round = get_round(vxrm, res, 31);
2785     res = (res >> 31) + round;
2786 
2787     if (res > INT32_MAX) {
2788         env->vxsat = 0x1;
2789         return INT32_MAX;
2790     } else if (res < INT32_MIN) {
2791         env->vxsat = 0x1;
2792         return INT32_MIN;
2793     } else {
2794         return res;
2795     }
2796 }
2797 
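/*
 * The 64-bit case has no 128-bit integer type to widen into, so the full
 * product is obtained with muls64() and the result is taken from bits
 * [126:63] of the 128-bit product, with the rounding increment derived
 * from the low half.
 */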
2798 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2799 {
2800     uint8_t round;
2801     uint64_t hi_64, lo_64;
2802     int64_t res;
2803 
2804     if (a == INT64_MIN && b == INT64_MIN) {
2805         env->vxsat = 1;
2806         return INT64_MAX;
2807     }
2808 
2809     muls64(&lo_64, &hi_64, a, b);
2810     round = get_round(vxrm, lo_64, 63);
2811     /*
2812      * Cannot overflow, as there are always
2813      * 2 sign bits after multiply.
2814      */
2815     res = (hi_64 << 1) | (lo_64 >> 63);
2816     if (round) {
2817         if (res == INT64_MAX) {
2818             env->vxsat = 1;
2819         } else {
2820             res += 1;
2821         }
2822     }
2823     return res;
2824 }
2825 
2826 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2827 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2828 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2829 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2830 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2831 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2832 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2833 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2834 
2835 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2836 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2837 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2838 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2839 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2840 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2841 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2842 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2843 
2844 /* Vector Single-Width Scaling Shift Instructions */
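/*
 * vssrl/vssra shift right by the low log2(SEW) bits of 'b' and add the
 * rounding increment from get_round(); no saturation is needed because
 * the shifted result always fits in the element type.
 */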
2845 static inline uint8_t
2846 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2847 {
2848     uint8_t round, shift = b & 0x7;
2849     uint8_t res;
2850 
2851     round = get_round(vxrm, a, shift);
2852     res = (a >> shift) + round;
2853     return res;
2854 }

2855 static inline uint16_t
2856 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2857 {
2858     uint8_t round, shift = b & 0xf;
2859 
2860     round = get_round(vxrm, a, shift);
2861     return (a >> shift) + round;
2862 }

2863 static inline uint32_t
2864 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2865 {
2866     uint8_t round, shift = b & 0x1f;
2867 
2868     round = get_round(vxrm, a, shift);
2869     return (a >> shift) + round;
2870 }

2871 static inline uint64_t
2872 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2873 {
2874     uint8_t round, shift = b & 0x3f;
2875 
2876     round = get_round(vxrm, a, shift);
2877     return (a >> shift) + round;
2878 }

2879 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2880 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2881 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2882 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2883 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2884 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2885 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2886 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2887 
2888 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2889 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2890 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2891 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2892 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2893 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2894 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2895 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2896 
2897 static inline int8_t
2898 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2899 {
2900     uint8_t round, shift = b & 0x7;
2901 
2902     round = get_round(vxrm, a, shift);
2903     return (a >> shift) + round;
2904 }

2905 static inline int16_t
2906 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2907 {
2908     uint8_t round, shift = b & 0xf;
2909 
2910     round = get_round(vxrm, a, shift);
2911     return (a >> shift) + round;
2912 }

2913 static inline int32_t
2914 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2915 {
2916     uint8_t round, shift = b & 0x1f;
2917 
2918     round = get_round(vxrm, a, shift);
2919     return (a >> shift) + round;
2920 }

2921 static inline int64_t
2922 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2923 {
2924     uint8_t round, shift = b & 0x3f;
2925 
2926     round = get_round(vxrm, a, shift);
2927     return (a >> shift) + round;
2928 }
2929 
2930 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2931 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2932 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2933 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2934 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2935 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2936 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2937 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2938 
2939 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2940 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2941 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2942 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2943 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2944 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2945 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2946 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2947 
2948 /* Vector Narrowing Fixed-Point Clip Instructions */
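/*
 * vnclip/vnclipu take a 2*SEW-wide source, shift it right by the low
 * log2(2*SEW) bits of 'b' with rounding, and saturate the result to the
 * destination's signed/unsigned range, setting vxsat on saturation.
 */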
2949 static inline int8_t
2950 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2951 {
2952     uint8_t round, shift = b & 0xf;
2953     int16_t res;
2954 
2955     round = get_round(vxrm, a, shift);
2956     res = (a >> shift) + round;
2957     if (res > INT8_MAX) {
2958         env->vxsat = 0x1;
2959         return INT8_MAX;
2960     } else if (res < INT8_MIN) {
2961         env->vxsat = 0x1;
2962         return INT8_MIN;
2963     } else {
2964         return res;
2965     }
2966 }
2967 
2968 static inline int16_t
2969 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2970 {
2971     uint8_t round, shift = b & 0x1f;
2972     int32_t res;
2973 
2974     round = get_round(vxrm, a, shift);
2975     res = (a >> shift) + round;
2976     if (res > INT16_MAX) {
2977         env->vxsat = 0x1;
2978         return INT16_MAX;
2979     } else if (res < INT16_MIN) {
2980         env->vxsat = 0x1;
2981         return INT16_MIN;
2982     } else {
2983         return res;
2984     }
2985 }
2986 
2987 static inline int32_t
2988 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2989 {
2990     uint8_t round, shift = b & 0x3f;
2991     int64_t res;
2992 
2993     round = get_round(vxrm, a, shift);
2994     res = (a >> shift) + round;
2995     if (res > INT32_MAX) {
2996         env->vxsat = 0x1;
2997         return INT32_MAX;
2998     } else if (res < INT32_MIN) {
2999         env->vxsat = 0x1;
3000         return INT32_MIN;
3001     } else {
3002         return res;
3003     }
3004 }
3005 
3006 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3007 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3008 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3009 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3010 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3011 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3012 
3013 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3014 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3015 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3016 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3017 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3018 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3019 
3020 static inline uint8_t
3021 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3022 {
3023     uint8_t round, shift = b & 0xf;
3024     uint16_t res;
3025 
3026     round = get_round(vxrm, a, shift);
3027     res = (a >> shift) + round;
3028     if (res > UINT8_MAX) {
3029         env->vxsat = 0x1;
3030         return UINT8_MAX;
3031     } else {
3032         return res;
3033     }
3034 }
3035 
3036 static inline uint16_t
3037 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3038 {
3039     uint8_t round, shift = b & 0x1f;
3040     uint32_t res;
3041 
3042     round = get_round(vxrm, a, shift);
3043     res = (a >> shift) + round;
3044     if (res > UINT16_MAX) {
3045         env->vxsat = 0x1;
3046         return UINT16_MAX;
3047     } else {
3048         return res;
3049     }
3050 }
3051 
3052 static inline uint32_t
3053 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3054 {
3055     uint8_t round, shift = b & 0x3f;
3056     uint64_t res;
3057 
3058     round = get_round(vxrm, a, shift);
3059     res = (a >> shift) + round;
3060     if (res > UINT32_MAX) {
3061         env->vxsat = 0x1;
3062         return UINT32_MAX;
3063     } else {
3064         return res;
3065     }
3066 }
3067 
3068 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3069 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3070 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3071 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3072 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3073 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3074 
3075 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3076 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3077 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3078 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3079 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3080 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3081 
3082 /*
3083  * Vector Floating-Point Arithmetic Instructions
3084  */
3085 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
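/*
 * OPFVV2 defines the per-element operation for a vector-vector FP helper;
 * GEN_VEXT_VV_ENV generates the helper body, which iterates from vstart to
 * vl, applies the mask policy (masked-off elements are written with all 1s
 * when vma is agnostic) and finally applies the tail policy via
 * vext_set_elems_1s().
 */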
3086 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3087 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3088                       CPURISCVState *env)                      \
3089 {                                                              \
3090     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3091     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3092     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3093 }
3094 
3095 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3096 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3097                   void *vs2, CPURISCVState *env,          \
3098                   uint32_t desc)                          \
3099 {                                                         \
3100     uint32_t vm = vext_vm(desc);                          \
3101     uint32_t vl = env->vl;                                \
3102     uint32_t total_elems =                                \
3103         vext_get_total_elems(env, desc, ESZ);             \
3104     uint32_t vta = vext_vta(desc);                        \
3105     uint32_t vma = vext_vma(desc);                        \
3106     uint32_t i;                                           \
3107                                                           \
3108     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3109                                                           \
3110     for (i = env->vstart; i < vl; i++) {                  \
3111         if (!vm && !vext_elem_mask(v0, i)) {              \
3112             /* set masked-off elements to 1s */           \
3113             vext_set_elems_1s(vd, vma, i * ESZ,           \
3114                               (i + 1) * ESZ);             \
3115             continue;                                     \
3116         }                                                 \
3117         do_##NAME(vd, vs1, vs2, i, env);                  \
3118     }                                                     \
3119     env->vstart = 0;                                      \
3120     /* set tail elements to 1s */                         \
3121     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3122                       total_elems * ESZ);                 \
3123 }
3124 
3125 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3126 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3127 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3128 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3129 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3130 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3131 
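/*
 * OPFVF2/GEN_VEXT_VF are the vector-scalar variants: the f[rs1] operand is
 * passed in 's1' and reused for every element instead of being loaded from
 * a second vector register.
 */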
3132 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3133 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3134                       CPURISCVState *env)                      \
3135 {                                                              \
3136     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3137     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3138 }
3139 
3140 #define GEN_VEXT_VF(NAME, ESZ)                            \
3141 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3142                   void *vs2, CPURISCVState *env,          \
3143                   uint32_t desc)                          \
3144 {                                                         \
3145     uint32_t vm = vext_vm(desc);                          \
3146     uint32_t vl = env->vl;                                \
3147     uint32_t total_elems =                                \
3148         vext_get_total_elems(env, desc, ESZ);             \
3149     uint32_t vta = vext_vta(desc);                        \
3150     uint32_t vma = vext_vma(desc);                        \
3151     uint32_t i;                                           \
3152                                                           \
3153     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3154                                                           \
3155     for (i = env->vstart; i < vl; i++) {                  \
3156         if (!vm && !vext_elem_mask(v0, i)) {              \
3157             /* set masked-off elements to 1s */           \
3158             vext_set_elems_1s(vd, vma, i * ESZ,           \
3159                               (i + 1) * ESZ);             \
3160             continue;                                     \
3161         }                                                 \
3162         do_##NAME(vd, s1, vs2, i, env);                   \
3163     }                                                     \
3164     env->vstart = 0;                                      \
3165     /* set tail elements to 1s */                         \
3166     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3167                       total_elems * ESZ);                 \
3168 }
3169 
3170 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3171 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3172 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3173 GEN_VEXT_VF(vfadd_vf_h, 2)
3174 GEN_VEXT_VF(vfadd_vf_w, 4)
3175 GEN_VEXT_VF(vfadd_vf_d, 8)
3176 
3177 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3178 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3179 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3180 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3181 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3182 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3183 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3184 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3185 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3186 GEN_VEXT_VF(vfsub_vf_h, 2)
3187 GEN_VEXT_VF(vfsub_vf_w, 4)
3188 GEN_VEXT_VF(vfsub_vf_d, 8)
3189 
3190 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3191 {
3192     return float16_sub(b, a, s);
3193 }
3194 
3195 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3196 {
3197     return float32_sub(b, a, s);
3198 }
3199 
3200 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3201 {
3202     return float64_sub(b, a, s);
3203 }
3204 
3205 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3206 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3207 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3208 GEN_VEXT_VF(vfrsub_vf_h, 2)
3209 GEN_VEXT_VF(vfrsub_vf_w, 4)
3210 GEN_VEXT_VF(vfrsub_vf_d, 8)
3211 
3212 /* Vector Widening Floating-Point Add/Subtract Instructions */
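/*
 * The widening forms convert each SEW operand to 2*SEW (half to single,
 * single to double) and perform the operation at the wider precision; the
 * .wv/.wf variants further below take an already-widened first operand and
 * only convert the narrow one.
 */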
3213 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3214 {
3215     return float32_add(float16_to_float32(a, true, s),
3216                        float16_to_float32(b, true, s), s);
3217 }
3218 
3219 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3220 {
3221     return float64_add(float32_to_float64(a, s),
3222                        float32_to_float64(b, s), s);
3224 }
3225 
3226 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3227 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3228 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3229 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3230 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3231 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3232 GEN_VEXT_VF(vfwadd_vf_h, 4)
3233 GEN_VEXT_VF(vfwadd_vf_w, 8)
3234 
3235 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3236 {
3237     return float32_sub(float16_to_float32(a, true, s),
3238                        float16_to_float32(b, true, s), s);
3239 }
3240 
3241 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3242 {
3243     return float64_sub(float32_to_float64(a, s),
3244                        float32_to_float64(b, s), s);
3246 }
3247 
3248 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3249 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3250 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3251 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3252 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3253 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3254 GEN_VEXT_VF(vfwsub_vf_h, 4)
3255 GEN_VEXT_VF(vfwsub_vf_w, 8)
3256 
3257 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3258 {
3259     return float32_add(a, float16_to_float32(b, true, s), s);
3260 }
3261 
3262 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3263 {
3264     return float64_add(a, float32_to_float64(b, s), s);
3265 }
3266 
3267 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3268 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3269 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3270 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3271 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3272 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3273 GEN_VEXT_VF(vfwadd_wf_h, 4)
3274 GEN_VEXT_VF(vfwadd_wf_w, 8)
3275 
3276 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3277 {
3278     return float32_sub(a, float16_to_float32(b, true, s), s);
3279 }
3280 
3281 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3282 {
3283     return float64_sub(a, float32_to_float64(b, s), s);
3284 }
3285 
3286 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3287 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3288 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3289 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3290 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3291 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3292 GEN_VEXT_VF(vfwsub_wf_h, 4)
3293 GEN_VEXT_VF(vfwsub_wf_w, 8)
3294 
3295 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3296 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3297 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3298 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3299 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3300 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3301 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3302 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3303 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3304 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3305 GEN_VEXT_VF(vfmul_vf_h, 2)
3306 GEN_VEXT_VF(vfmul_vf_w, 4)
3307 GEN_VEXT_VF(vfmul_vf_d, 8)
3308 
3309 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3310 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3311 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3312 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3313 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3314 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3315 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3316 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3317 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3318 GEN_VEXT_VF(vfdiv_vf_h, 2)
3319 GEN_VEXT_VF(vfdiv_vf_w, 4)
3320 GEN_VEXT_VF(vfdiv_vf_d, 8)
3321 
3322 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3323 {
3324     return float16_div(b, a, s);
3325 }
3326 
3327 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3328 {
3329     return float32_div(b, a, s);
3330 }
3331 
3332 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3333 {
3334     return float64_div(b, a, s);
3335 }
3336 
3337 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3338 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3339 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3340 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3341 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3342 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3343 
3344 /* Vector Widening Floating-Point Multiply */
3345 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3346 {
3347     return float32_mul(float16_to_float32(a, true, s),
3348                        float16_to_float32(b, true, s), s);
3349 }
3350 
3351 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3352 {
3353     return float64_mul(float32_to_float64(a, s),
3354                        float32_to_float64(b, s), s);
3356 }
3357 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3358 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3359 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3360 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3361 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3362 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3363 GEN_VEXT_VF(vfwmul_vf_h, 4)
3364 GEN_VEXT_VF(vfwmul_vf_w, 8)
3365 
3366 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
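/*
 * Operand ordering: the *macc/*msac forms accumulate into vd
 * (vd = +-(vs1 * vs2) +- vd), while the *madd/*msub forms multiply by vd
 * (vd = +-(vs1 * vd) +- vs2).  The helpers below encode this by choosing
 * which operand is passed as the addend to float*_muladd().
 */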
3367 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3368 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3369                       CPURISCVState *env)                          \
3370 {                                                                  \
3371     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3372     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3373     TD d = *((TD *)vd + HD(i));                                    \
3374     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3375 }
3376 
3377 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3378 {
3379     return float16_muladd(a, b, d, 0, s);
3380 }
3381 
3382 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3383 {
3384     return float32_muladd(a, b, d, 0, s);
3385 }
3386 
3387 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3388 {
3389     return float64_muladd(a, b, d, 0, s);
3390 }
3391 
3392 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3393 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3394 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3395 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3396 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3397 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3398 
3399 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3400 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3401                       CPURISCVState *env)                         \
3402 {                                                                 \
3403     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3404     TD d = *((TD *)vd + HD(i));                                   \
3405     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3406 }
3407 
3408 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3409 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3410 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3411 GEN_VEXT_VF(vfmacc_vf_h, 2)
3412 GEN_VEXT_VF(vfmacc_vf_w, 4)
3413 GEN_VEXT_VF(vfmacc_vf_d, 8)
3414 
3415 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3416 {
3417     return float16_muladd(a, b, d, float_muladd_negate_c |
3418                                    float_muladd_negate_product, s);
3419 }
3420 
3421 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3422 {
3423     return float32_muladd(a, b, d, float_muladd_negate_c |
3424                                    float_muladd_negate_product, s);
3425 }
3426 
3427 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3428 {
3429     return float64_muladd(a, b, d, float_muladd_negate_c |
3430                                    float_muladd_negate_product, s);
3431 }
3432 
3433 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3434 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3435 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3436 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3437 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3438 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3439 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3440 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3441 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3442 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3443 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3444 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3445 
3446 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3447 {
3448     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3449 }
3450 
3451 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3452 {
3453     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3454 }
3455 
3456 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3457 {
3458     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3459 }
3460 
3461 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3462 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3463 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3464 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3465 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3466 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3467 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3468 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3469 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3470 GEN_VEXT_VF(vfmsac_vf_h, 2)
3471 GEN_VEXT_VF(vfmsac_vf_w, 4)
3472 GEN_VEXT_VF(vfmsac_vf_d, 8)
3473 
3474 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3475 {
3476     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3477 }
3478 
3479 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3480 {
3481     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3482 }
3483 
3484 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3485 {
3486     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3487 }
3488 
3489 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3490 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3491 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3492 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3493 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3494 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3495 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3496 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3497 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3498 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3499 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3500 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3501 
3502 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3503 {
3504     return float16_muladd(d, b, a, 0, s);
3505 }
3506 
3507 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3508 {
3509     return float32_muladd(d, b, a, 0, s);
3510 }
3511 
3512 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3513 {
3514     return float64_muladd(d, b, a, 0, s);
3515 }
3516 
3517 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3518 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3519 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3520 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3521 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3522 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3523 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3524 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3525 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3526 GEN_VEXT_VF(vfmadd_vf_h, 2)
3527 GEN_VEXT_VF(vfmadd_vf_w, 4)
3528 GEN_VEXT_VF(vfmadd_vf_d, 8)
3529 
3530 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3531 {
3532     return float16_muladd(d, b, a, float_muladd_negate_c |
3533                                    float_muladd_negate_product, s);
3534 }
3535 
3536 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3537 {
3538     return float32_muladd(d, b, a, float_muladd_negate_c |
3539                                    float_muladd_negate_product, s);
3540 }
3541 
3542 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3543 {
3544     return float64_muladd(d, b, a, float_muladd_negate_c |
3545                                    float_muladd_negate_product, s);
3546 }
3547 
3548 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3549 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3550 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3551 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3552 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3553 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3554 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3555 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3556 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3557 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3558 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3559 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3560 
3561 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3562 {
3563     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3564 }
3565 
3566 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3567 {
3568     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3569 }
3570 
3571 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3572 {
3573     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3574 }
3575 
3576 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3577 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3578 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3579 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3580 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3581 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3582 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3583 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3584 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3585 GEN_VEXT_VF(vfmsub_vf_h, 2)
3586 GEN_VEXT_VF(vfmsub_vf_w, 4)
3587 GEN_VEXT_VF(vfmsub_vf_d, 8)
3588 
3589 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3590 {
3591     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3592 }
3593 
3594 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3595 {
3596     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3597 }
3598 
3599 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3600 {
3601     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3602 }
3603 
3604 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3605 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3606 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3607 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3608 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3609 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3610 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3611 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3612 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3613 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3614 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3615 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3616 
3617 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3618 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3619 {
3620     return float32_muladd(float16_to_float32(a, true, s),
3621                           float16_to_float32(b, true, s), d, 0, s);
3622 }
3623 
3624 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3625 {
3626     return float64_muladd(float32_to_float64(a, s),
3627                           float32_to_float64(b, s), d, 0, s);
3628 }
3629 
3630 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3631 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3632 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3633 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3634 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3635 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3636 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3637 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3638 
3639 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3640 {
3641     return float32_muladd(bfloat16_to_float32(a, s),
3642                           bfloat16_to_float32(b, s), d, 0, s);
3643 }
3644 
3645 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3646 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3647 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3648 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3649 
3650 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3651 {
3652     return float32_muladd(float16_to_float32(a, true, s),
3653                           float16_to_float32(b, true, s), d,
3654                           float_muladd_negate_c | float_muladd_negate_product,
3655                           s);
3656 }
3657 
3658 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3659 {
3660     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3661                           d, float_muladd_negate_c |
3662                              float_muladd_negate_product, s);
3663 }
3664 
3665 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3666 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3667 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3668 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3669 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3670 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3671 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3672 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3673 
3674 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3675 {
3676     return float32_muladd(float16_to_float32(a, true, s),
3677                           float16_to_float32(b, true, s), d,
3678                           float_muladd_negate_c, s);
3679 }
3680 
3681 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3682 {
3683     return float64_muladd(float32_to_float64(a, s),
3684                           float32_to_float64(b, s), d,
3685                           float_muladd_negate_c, s);
3686 }
3687 
3688 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3689 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3690 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3691 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3692 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3693 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3694 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3695 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3696 
3697 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3698 {
3699     return float32_muladd(float16_to_float32(a, true, s),
3700                           float16_to_float32(b, true, s), d,
3701                           float_muladd_negate_product, s);
3702 }
3703 
3704 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3705 {
3706     return float64_muladd(float32_to_float64(a, s),
3707                           float32_to_float64(b, s), d,
3708                           float_muladd_negate_product, s);
3709 }
3710 
3711 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3712 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3713 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3714 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3715 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3716 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3717 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3718 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3719 
3720 /* Vector Floating-Point Square-Root Instruction */
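/*
 * OPFVV1/GEN_VEXT_V_ENV are the unary (single source vector) counterparts
 * of OPFVV2/GEN_VEXT_VV_ENV above.
 */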
3721 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3722 static void do_##NAME(void *vd, void *vs2, int i,      \
3723                       CPURISCVState *env)              \
3724 {                                                      \
3725     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3726     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3727 }
3728 
3729 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3730 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3731                   CPURISCVState *env, uint32_t desc)   \
3732 {                                                      \
3733     uint32_t vm = vext_vm(desc);                       \
3734     uint32_t vl = env->vl;                             \
3735     uint32_t total_elems =                             \
3736         vext_get_total_elems(env, desc, ESZ);          \
3737     uint32_t vta = vext_vta(desc);                     \
3738     uint32_t vma = vext_vma(desc);                     \
3739     uint32_t i;                                        \
3740                                                        \
3741     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3742                                                        \
3743     if (vl == 0) {                                     \
3744         return;                                        \
3745     }                                                  \
3746     for (i = env->vstart; i < vl; i++) {               \
3747         if (!vm && !vext_elem_mask(v0, i)) {           \
3748             /* set masked-off elements to 1s */        \
3749             vext_set_elems_1s(vd, vma, i * ESZ,        \
3750                               (i + 1) * ESZ);          \
3751             continue;                                  \
3752         }                                              \
3753         do_##NAME(vd, vs2, i, env);                    \
3754     }                                                  \
3755     env->vstart = 0;                                   \
3756     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3757                       total_elems * ESZ);              \
3758 }
3759 
3760 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3761 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3762 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3763 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3764 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3765 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3766 
3767 /*
3768  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3769  *
3770  * Adapted from riscv-v-spec recip.c:
3771  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3772  */
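/*
 * frsqrt7() returns a 7-bit-accurate estimate of 1/sqrt(f).  The 128-entry
 * table is indexed by the low exponent bit and the top 6 fraction bits
 * (after normalizing subnormals), and the output exponent works out to
 * (3 * bias - 1 - exp) / 2; the bitwise NOT plus unsigned wraparound below
 * is equivalent to that subtraction.
 */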
3773 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3774 {
3775     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3776     uint64_t exp = extract64(f, frac_size, exp_size);
3777     uint64_t frac = extract64(f, 0, frac_size);
3778 
3779     const uint8_t lookup_table[] = {
3780         52, 51, 50, 48, 47, 46, 44, 43,
3781         42, 41, 40, 39, 38, 36, 35, 34,
3782         33, 32, 31, 30, 30, 29, 28, 27,
3783         26, 25, 24, 23, 23, 22, 21, 20,
3784         19, 19, 18, 17, 16, 16, 15, 14,
3785         14, 13, 12, 12, 11, 10, 10, 9,
3786         9, 8, 7, 7, 6, 6, 5, 4,
3787         4, 3, 3, 2, 2, 1, 1, 0,
3788         127, 125, 123, 121, 119, 118, 116, 114,
3789         113, 111, 109, 108, 106, 105, 103, 102,
3790         100, 99, 97, 96, 95, 93, 92, 91,
3791         90, 88, 87, 86, 85, 84, 83, 82,
3792         80, 79, 78, 77, 76, 75, 74, 73,
3793         72, 71, 70, 70, 69, 68, 67, 66,
3794         65, 64, 63, 63, 62, 61, 60, 59,
3795         59, 58, 57, 56, 56, 55, 54, 53
3796     };
3797     const int precision = 7;
3798 
3799     if (exp == 0 && frac != 0) { /* subnormal */
3800         /* Normalize the subnormal. */
3801         while (extract64(frac, frac_size - 1, 1) == 0) {
3802             exp--;
3803             frac <<= 1;
3804         }
3805 
3806         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3807     }
3808 
3809     int idx = ((exp & 1) << (precision - 1)) |
3810               (frac >> (frac_size - precision + 1));
3811     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3812                         (frac_size - precision);
3813     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3814 
3815     uint64_t val = 0;
3816     val = deposit64(val, 0, frac_size, out_frac);
3817     val = deposit64(val, frac_size, exp_size, out_exp);
3818     val = deposit64(val, frac_size + exp_size, 1, sign);
3819     return val;
3820 }
3821 
3822 static float16 frsqrt7_h(float16 f, float_status *s)
3823 {
3824     int exp_size = 5, frac_size = 10;
3825     bool sign = float16_is_neg(f);
3826 
3827     /*
3828      * frsqrt7(sNaN) = canonical NaN
3829      * frsqrt7(-inf) = canonical NaN
3830      * frsqrt7(-normal) = canonical NaN
3831      * frsqrt7(-subnormal) = canonical NaN
3832      */
3833     if (float16_is_signaling_nan(f, s) ||
3834         (float16_is_infinity(f) && sign) ||
3835         (float16_is_normal(f) && sign) ||
3836         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3837         s->float_exception_flags |= float_flag_invalid;
3838         return float16_default_nan(s);
3839     }
3840 
3841     /* frsqrt7(qNaN) = canonical NaN */
3842     if (float16_is_quiet_nan(f, s)) {
3843         return float16_default_nan(s);
3844     }
3845 
3846     /* frsqrt7(+-0) = +-inf */
3847     if (float16_is_zero(f)) {
3848         s->float_exception_flags |= float_flag_divbyzero;
3849         return float16_set_sign(float16_infinity, sign);
3850     }
3851 
3852     /* frsqrt7(+inf) = +0 */
3853     if (float16_is_infinity(f) && !sign) {
3854         return float16_set_sign(float16_zero, sign);
3855     }
3856 
3857     /* +normal, +subnormal */
3858     uint64_t val = frsqrt7(f, exp_size, frac_size);
3859     return make_float16(val);
3860 }
3861 
3862 static float32 frsqrt7_s(float32 f, float_status *s)
3863 {
3864     int exp_size = 8, frac_size = 23;
3865     bool sign = float32_is_neg(f);
3866 
3867     /*
3868      * frsqrt7(sNaN) = canonical NaN
3869      * frsqrt7(-inf) = canonical NaN
3870      * frsqrt7(-normal) = canonical NaN
3871      * frsqrt7(-subnormal) = canonical NaN
3872      */
3873     if (float32_is_signaling_nan(f, s) ||
3874         (float32_is_infinity(f) && sign) ||
3875         (float32_is_normal(f) && sign) ||
3876         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3877         s->float_exception_flags |= float_flag_invalid;
3878         return float32_default_nan(s);
3879     }
3880 
3881     /* frsqrt7(qNaN) = canonical NaN */
3882     if (float32_is_quiet_nan(f, s)) {
3883         return float32_default_nan(s);
3884     }
3885 
3886     /* frsqrt7(+-0) = +-inf */
3887     if (float32_is_zero(f)) {
3888         s->float_exception_flags |= float_flag_divbyzero;
3889         return float32_set_sign(float32_infinity, sign);
3890     }
3891 
3892     /* frsqrt7(+inf) = +0 */
3893     if (float32_is_infinity(f) && !sign) {
3894         return float32_set_sign(float32_zero, sign);
3895     }
3896 
3897     /* +normal, +subnormal */
3898     uint64_t val = frsqrt7(f, exp_size, frac_size);
3899     return make_float32(val);
3900 }
3901 
3902 static float64 frsqrt7_d(float64 f, float_status *s)
3903 {
3904     int exp_size = 11, frac_size = 52;
3905     bool sign = float64_is_neg(f);
3906 
3907     /*
3908      * frsqrt7(sNaN) = canonical NaN
3909      * frsqrt7(-inf) = canonical NaN
3910      * frsqrt7(-normal) = canonical NaN
3911      * frsqrt7(-subnormal) = canonical NaN
3912      */
3913     if (float64_is_signaling_nan(f, s) ||
3914         (float64_is_infinity(f) && sign) ||
3915         (float64_is_normal(f) && sign) ||
3916         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3917         s->float_exception_flags |= float_flag_invalid;
3918         return float64_default_nan(s);
3919     }
3920 
3921     /* frsqrt7(qNaN) = canonical NaN */
3922     if (float64_is_quiet_nan(f, s)) {
3923         return float64_default_nan(s);
3924     }
3925 
3926     /* frsqrt7(+-0) = +-inf */
3927     if (float64_is_zero(f)) {
3928         s->float_exception_flags |= float_flag_divbyzero;
3929         return float64_set_sign(float64_infinity, sign);
3930     }
3931 
3932     /* frsqrt7(+inf) = +0 */
3933     if (float64_is_infinity(f) && !sign) {
3934         return float64_set_sign(float64_zero, sign);
3935     }
3936 
3937     /* +normal, +subnormal */
3938     uint64_t val = frsqrt7(f, exp_size, frac_size);
3939     return make_float64(val);
3940 }
3941 
3942 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3943 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3944 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3945 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3946 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3947 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3948 
3949 /*
3950  * Vector Floating-Point Reciprocal Estimate Instruction
3951  *
3952  * Adapted from riscv-v-spec recip.c:
3953  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3954  */
3955 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3956                       float_status *s)
3957 {
3958     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3959     uint64_t exp = extract64(f, frac_size, exp_size);
3960     uint64_t frac = extract64(f, 0, frac_size);
3961 
3962     const uint8_t lookup_table[] = {
3963         127, 125, 123, 121, 119, 117, 116, 114,
3964         112, 110, 109, 107, 105, 104, 102, 100,
3965         99, 97, 96, 94, 93, 91, 90, 88,
3966         87, 85, 84, 83, 81, 80, 79, 77,
3967         76, 75, 74, 72, 71, 70, 69, 68,
3968         66, 65, 64, 63, 62, 61, 60, 59,
3969         58, 57, 56, 55, 54, 53, 52, 51,
3970         50, 49, 48, 47, 46, 45, 44, 43,
3971         42, 41, 40, 40, 39, 38, 37, 36,
3972         35, 35, 34, 33, 32, 31, 31, 30,
3973         29, 28, 28, 27, 26, 25, 25, 24,
3974         23, 23, 22, 21, 21, 20, 19, 19,
3975         18, 17, 17, 16, 15, 15, 14, 14,
3976         13, 12, 12, 11, 11, 10, 9, 9,
3977         8, 8, 7, 7, 6, 5, 5, 4,
3978         4, 3, 3, 2, 2, 1, 1, 0
3979     };
3980     const int precision = 7;
3981 
3982     if (exp == 0 && frac != 0) { /* subnormal */
3983         /* Normalize the subnormal. */
3984         while (extract64(frac, frac_size - 1, 1) == 0) {
3985             exp--;
3986             frac <<= 1;
3987         }
3988 
3989         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3990 
3991         if (exp != 0 && exp != UINT64_MAX) {
3992             /*
3993              * Overflow to inf or max value of same sign,
3994              * depending on sign and rounding mode.
3995              */
3996             s->float_exception_flags |= (float_flag_inexact |
3997                                          float_flag_overflow);
3998 
3999             if ((s->float_rounding_mode == float_round_to_zero) ||
4000                 ((s->float_rounding_mode == float_round_down) && !sign) ||
4001                 ((s->float_rounding_mode == float_round_up) && sign)) {
4002                 /* Return the largest-magnitude finite value of the same sign. */
4003                 return (sign << (exp_size + frac_size)) |
4004                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4005             } else {
4006                 /* Return +-inf. */
4007                 return (sign << (exp_size + frac_size)) |
4008                        MAKE_64BIT_MASK(frac_size, exp_size);
4009             }
4010         }
4011     }
4012 
4013     int idx = frac >> (frac_size - precision);
4014     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4015                         (frac_size - precision);
4016     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4017 
4018     if (out_exp == 0 || out_exp == UINT64_MAX) {
4019         /*
4020          * The result is subnormal, but don't raise the underflow exception,
4021          * because there's no additional loss of precision.
4022          */
4023         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4024         if (out_exp == UINT64_MAX) {
4025             out_frac >>= 1;
4026             out_exp = 0;
4027         }
4028     }
4029 
4030     uint64_t val = 0;
4031     val = deposit64(val, 0, frac_size, out_frac);
4032     val = deposit64(val, frac_size, exp_size, out_exp);
4033     val = deposit64(val, frac_size + exp_size, 1, sign);
4034     return val;
4035 }
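
/*
 * Worked example, for illustration: frec7(2.0f) with exp_size = 8 and
 * frac_size = 23 sees sign = 0, exp = 128, frac = 0, so idx = 0 and
 * lookup_table[0] = 127.  Then out_frac = 127 << 16 and
 * out_exp = 2 * 127 + ~128 = 125 (i.e. 2 * bias - exp - 1), giving
 * 1.9921875 * 2^-2 = 0.498046875, a 7-bit reciprocal estimate of 2.0,
 * as suggested by the precision constant above.
 */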
4036 
4037 static float16 frec7_h(float16 f, float_status *s)
4038 {
4039     int exp_size = 5, frac_size = 10;
4040     bool sign = float16_is_neg(f);
4041 
4042     /* frec7(+-inf) = +-0 */
4043     if (float16_is_infinity(f)) {
4044         return float16_set_sign(float16_zero, sign);
4045     }
4046 
4047     /* frec7(+-0) = +-inf */
4048     if (float16_is_zero(f)) {
4049         s->float_exception_flags |= float_flag_divbyzero;
4050         return float16_set_sign(float16_infinity, sign);
4051     }
4052 
4053     /* frec7(sNaN) = canonical NaN */
4054     if (float16_is_signaling_nan(f, s)) {
4055         s->float_exception_flags |= float_flag_invalid;
4056         return float16_default_nan(s);
4057     }
4058 
4059     /* frec7(qNaN) = canonical NaN */
4060     if (float16_is_quiet_nan(f, s)) {
4061         return float16_default_nan(s);
4062     }
4063 
4064     /* +-normal, +-subnormal */
4065     uint64_t val = frec7(f, exp_size, frac_size, s);
4066     return make_float16(val);
4067 }
4068 
4069 static float32 frec7_s(float32 f, float_status *s)
4070 {
4071     int exp_size = 8, frac_size = 23;
4072     bool sign = float32_is_neg(f);
4073 
4074     /* frec7(+-inf) = +-0 */
4075     if (float32_is_infinity(f)) {
4076         return float32_set_sign(float32_zero, sign);
4077     }
4078 
4079     /* frec7(+-0) = +-inf */
4080     if (float32_is_zero(f)) {
4081         s->float_exception_flags |= float_flag_divbyzero;
4082         return float32_set_sign(float32_infinity, sign);
4083     }
4084 
4085     /* frec7(sNaN) = canonical NaN */
4086     if (float32_is_signaling_nan(f, s)) {
4087         s->float_exception_flags |= float_flag_invalid;
4088         return float32_default_nan(s);
4089     }
4090 
4091     /* frec7(qNaN) = canonical NaN */
4092     if (float32_is_quiet_nan(f, s)) {
4093         return float32_default_nan(s);
4094     }
4095 
4096     /* +-normal, +-subnormal */
4097     uint64_t val = frec7(f, exp_size, frac_size, s);
4098     return make_float32(val);
4099 }
4100 
4101 static float64 frec7_d(float64 f, float_status *s)
4102 {
4103     int exp_size = 11, frac_size = 52;
4104     bool sign = float64_is_neg(f);
4105 
4106     /* frec7(+-inf) = +-0 */
4107     if (float64_is_infinity(f)) {
4108         return float64_set_sign(float64_zero, sign);
4109     }
4110 
4111     /* frec7(+-0) = +-inf */
4112     if (float64_is_zero(f)) {
4113         s->float_exception_flags |= float_flag_divbyzero;
4114         return float64_set_sign(float64_infinity, sign);
4115     }
4116 
4117     /* frec7(sNaN) = canonical NaN */
4118     if (float64_is_signaling_nan(f, s)) {
4119         s->float_exception_flags |= float_flag_invalid;
4120         return float64_default_nan(s);
4121     }
4122 
4123     /* frec7(qNaN) = canonical NaN */
4124     if (float64_is_quiet_nan(f, s)) {
4125         return float64_default_nan(s);
4126     }
4127 
4128     /* +-normal, +-subnormal */
4129     uint64_t val = frec7(f, exp_size, frac_size, s);
4130     return make_float64(val);
4131 }
4132 
4133 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4134 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4135 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4136 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4137 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4138 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4139 
4140 /* Vector Floating-Point MIN/MAX Instructions */
4141 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4142 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4143 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4144 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4145 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4146 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4147 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4148 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4149 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4150 GEN_VEXT_VF(vfmin_vf_h, 2)
4151 GEN_VEXT_VF(vfmin_vf_w, 4)
4152 GEN_VEXT_VF(vfmin_vf_d, 8)
4153 
4154 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4155 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4156 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4157 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4158 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4159 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4160 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4161 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4162 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4163 GEN_VEXT_VF(vfmax_vf_h, 2)
4164 GEN_VEXT_VF(vfmax_vf_w, 4)
4165 GEN_VEXT_VF(vfmax_vf_d, 8)
4166 
4167 /* Vector Floating-Point Sign-Injection Instructions */
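/*
 * As with the other two-operand helpers in this file, the first argument
 * (a) carries the vs2 element and the second (b) the vs1 element or the
 * scalar operand.  The result keeps the magnitude bits of a and takes its
 * sign from b, e.g. for float32:
 *   fsgnj32(0x3f800000, 0x80000000) = 0xbf800000 (1.0 with the sign of -0.0)
 * The fsgnjn*() variants below invert the sign taken from b, and the
 * fsgnjx*() variants XOR the two sign bits.
 */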
4168 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4169 {
4170     return deposit64(b, 0, 15, a);
4171 }
4172 
4173 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4174 {
4175     return deposit64(b, 0, 31, a);
4176 }
4177 
4178 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4179 {
4180     return deposit64(b, 0, 63, a);
4181 }
4182 
4183 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4184 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4185 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4186 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4187 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4188 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4189 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4190 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4191 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4192 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4193 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4194 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4195 
4196 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4197 {
4198     return deposit64(~b, 0, 15, a);
4199 }
4200 
4201 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4202 {
4203     return deposit64(~b, 0, 31, a);
4204 }
4205 
4206 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4207 {
4208     return deposit64(~b, 0, 63, a);
4209 }
4210 
4211 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4212 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4213 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4214 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4215 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4216 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4217 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4218 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4219 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4220 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4221 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4222 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4223 
4224 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4225 {
4226     return deposit64(b ^ a, 0, 15, a);
4227 }
4228 
4229 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4230 {
4231     return deposit64(b ^ a, 0, 31, a);
4232 }
4233 
4234 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4235 {
4236     return deposit64(b ^ a, 0, 63, a);
4237 }
4238 
4239 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4240 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4241 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4242 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4243 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4244 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4245 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4246 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4247 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4248 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4249 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4250 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4251 
4252 /* Vector Floating-Point Compare Instructions */
4253 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4254 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4255                   CPURISCVState *env, uint32_t desc)          \
4256 {                                                             \
4257     uint32_t vm = vext_vm(desc);                              \
4258     uint32_t vl = env->vl;                                    \
4259     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4260     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4261     uint32_t vma = vext_vma(desc);                            \
4262     uint32_t i;                                               \
4263                                                               \
4264     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4265                                                               \
4266     for (i = env->vstart; i < vl; i++) {                      \
4267         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4268         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4269         if (!vm && !vext_elem_mask(v0, i)) {                  \
4270             /* set masked-off elements to 1s */               \
4271             if (vma) {                                        \
4272                 vext_set_elem_mask(vd, i, 1);                 \
4273             }                                                 \
4274             continue;                                         \
4275         }                                                     \
4276         vext_set_elem_mask(vd, i,                             \
4277                            DO_OP(s2, s1, &env->fp_status));   \
4278     }                                                         \
4279     env->vstart = 0;                                          \
4280     /*
4281     /*                                                        \
4282      * mask destination registers are always tail-agnostic    \
4283      * set tail elements to 1s                                 \
4284      */                                                       \
4285         for (; i < total_elems; i++) {                        \
4286             vext_set_elem_mask(vd, i, 1);                     \
4287         }                                                     \
4288     }                                                         \
4289 }
4290 
4291 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4292 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4293 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4294 
4295 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4296 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4297                   CPURISCVState *env, uint32_t desc)                \
4298 {                                                                   \
4299     uint32_t vm = vext_vm(desc);                                    \
4300     uint32_t vl = env->vl;                                          \
4301     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4302     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4303     uint32_t vma = vext_vma(desc);                                  \
4304     uint32_t i;                                                     \
4305                                                                     \
4306     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4307                                                                     \
4308     for (i = env->vstart; i < vl; i++) {                            \
4309         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4310         if (!vm && !vext_elem_mask(v0, i)) {                        \
4311             /* set masked-off elements to 1s */                     \
4312             if (vma) {                                              \
4313                 vext_set_elem_mask(vd, i, 1);                       \
4314             }                                                       \
4315             continue;                                               \
4316         }                                                           \
4317         vext_set_elem_mask(vd, i,                                   \
4318                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4319     }                                                               \
4320     env->vstart = 0;                                                \
4321     /*
4322     /*                                                              \
4323      * mask destination registers are always tail-agnostic          \
4324      * set tail elements to 1s                                       \
4325      */                                                             \
4326         for (; i < total_elems; i++) {                              \
4327             vext_set_elem_mask(vd, i, 1);                           \
4328         }                                                           \
4329     }                                                               \
4330 }
4331 
4332 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4333 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4334 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4335 
4336 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4337 {
4338     FloatRelation compare = float16_compare_quiet(a, b, s);
4339     return compare != float_relation_equal;
4340 }
4341 
4342 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4343 {
4344     FloatRelation compare = float32_compare_quiet(a, b, s);
4345     return compare != float_relation_equal;
4346 }
4347 
4348 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4349 {
4350     FloatRelation compare = float64_compare_quiet(a, b, s);
4351     return compare != float_relation_equal;
4352 }
4353 
4354 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4355 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4356 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4357 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4358 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4359 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4360 
4361 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4362 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4363 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4364 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4365 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4366 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4367 
4368 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4369 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4370 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4371 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4372 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4373 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
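
/*
 * Note on NaN handling: vmfeq/vmfne use the quiet comparison predicates,
 * which only raise the invalid flag for signaling NaN inputs, while
 * vmflt/vmfle above and the vmfgt/vmfge helpers below use signaling
 * comparisons and raise invalid for any NaN operand, matching the scalar
 * FLT/FLE behaviour.
 */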
4374 
4375 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4376 {
4377     FloatRelation compare = float16_compare(a, b, s);
4378     return compare == float_relation_greater;
4379 }
4380 
4381 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4382 {
4383     FloatRelation compare = float32_compare(a, b, s);
4384     return compare == float_relation_greater;
4385 }
4386 
4387 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4388 {
4389     FloatRelation compare = float64_compare(a, b, s);
4390     return compare == float_relation_greater;
4391 }
4392 
4393 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4394 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4395 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4396 
4397 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4398 {
4399     FloatRelation compare = float16_compare(a, b, s);
4400     return compare == float_relation_greater ||
4401            compare == float_relation_equal;
4402 }
4403 
4404 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4405 {
4406     FloatRelation compare = float32_compare(a, b, s);
4407     return compare == float_relation_greater ||
4408            compare == float_relation_equal;
4409 }
4410 
4411 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4412 {
4413     FloatRelation compare = float64_compare(a, b, s);
4414     return compare == float_relation_greater ||
4415            compare == float_relation_equal;
4416 }
4417 
4418 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4419 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4420 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4421 
4422 /* Vector Floating-Point Classify Instruction */
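/*
 * The classification result is a 10-bit mask using the scalar fclass
 * encoding:
 *   bit 0: -infinity       bit 5: +subnormal
 *   bit 1: -normal         bit 6: +normal
 *   bit 2: -subnormal      bit 7: +infinity
 *   bit 3: -0              bit 8: signaling NaN
 *   bit 4: +0              bit 9: quiet NaN
 */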
4423 target_ulong fclass_h(uint64_t frs1)
4424 {
4425     float16 f = frs1;
4426     bool sign = float16_is_neg(f);
4427 
4428     if (float16_is_infinity(f)) {
4429         return sign ? 1 << 0 : 1 << 7;
4430     } else if (float16_is_zero(f)) {
4431         return sign ? 1 << 3 : 1 << 4;
4432     } else if (float16_is_zero_or_denormal(f)) {
4433         return sign ? 1 << 2 : 1 << 5;
4434     } else if (float16_is_any_nan(f)) {
4435         float_status s = { }; /* for snan_bit_is_one */
4436         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4437     } else {
4438         return sign ? 1 << 1 : 1 << 6;
4439     }
4440 }
4441 
4442 target_ulong fclass_s(uint64_t frs1)
4443 {
4444     float32 f = frs1;
4445     bool sign = float32_is_neg(f);
4446 
4447     if (float32_is_infinity(f)) {
4448         return sign ? 1 << 0 : 1 << 7;
4449     } else if (float32_is_zero(f)) {
4450         return sign ? 1 << 3 : 1 << 4;
4451     } else if (float32_is_zero_or_denormal(f)) {
4452         return sign ? 1 << 2 : 1 << 5;
4453     } else if (float32_is_any_nan(f)) {
4454         float_status s = { }; /* for snan_bit_is_one */
4455         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4456     } else {
4457         return sign ? 1 << 1 : 1 << 6;
4458     }
4459 }
4460 
4461 target_ulong fclass_d(uint64_t frs1)
4462 {
4463     float64 f = frs1;
4464     bool sign = float64_is_neg(f);
4465 
4466     if (float64_is_infinity(f)) {
4467         return sign ? 1 << 0 : 1 << 7;
4468     } else if (float64_is_zero(f)) {
4469         return sign ? 1 << 3 : 1 << 4;
4470     } else if (float64_is_zero_or_denormal(f)) {
4471         return sign ? 1 << 2 : 1 << 5;
4472     } else if (float64_is_any_nan(f)) {
4473         float_status s = { }; /* for snan_bit_is_one */
4474         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4475     } else {
4476         return sign ? 1 << 1 : 1 << 6;
4477     }
4478 }
4479 
4480 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4481 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4482 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4483 GEN_VEXT_V(vfclass_v_h, 2)
4484 GEN_VEXT_V(vfclass_v_w, 4)
4485 GEN_VEXT_V(vfclass_v_d, 8)
4486 
4487 /* Vector Floating-Point Merge Instruction */
4488 
4489 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4490 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4491                   CPURISCVState *env, uint32_t desc)          \
4492 {                                                             \
4493     uint32_t vm = vext_vm(desc);                              \
4494     uint32_t vl = env->vl;                                    \
4495     uint32_t esz = sizeof(ETYPE);                             \
4496     uint32_t total_elems =                                    \
4497         vext_get_total_elems(env, desc, esz);                 \
4498     uint32_t vta = vext_vta(desc);                            \
4499     uint32_t i;                                               \
4500                                                               \
4501     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4502                                                               \
4503     for (i = env->vstart; i < vl; i++) {                      \
4504         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4505         *((ETYPE *)vd + H(i)) =                               \
4506             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4507     }                                                         \
4508     env->vstart = 0;                                          \
4509     /* set tail elements to 1s */                             \
4510     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4511 }
4512 
4513 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4514 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4515 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
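
/*
 * Example: vfmerge.vfm vd, vs2, rs1, v0 with vl = 4 and v0 mask bits
 * {1, 0, 1, 0} writes {f[rs1], vs2[1], f[rs1], vs2[3]}.  With vm = 1
 * (the vfmv.v.f encoding) the condition above never selects s2, so every
 * element up to vl receives the scalar.
 */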
4516 
4517 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4518 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4519 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4520 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4521 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4522 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4523 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4524 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4525 
4526 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4527 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4528 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4529 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4530 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4531 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4532 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4533 
4534 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4535 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4536 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4537 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4538 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4539 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4540 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4541 
4542 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4543 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4544 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4545 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4546 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4547 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4548 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4549 
4550 /* Widening Floating-Point/Integer Type-Convert Instructions */
4551 /* (TD, T2, TX2) */
4552 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4553 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4554 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4555 /*
4556  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4557  */
4558 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4559 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4560 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4561 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4562 
4563 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4564 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4565 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4566 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4567 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4568 
4569 /*
4570  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4571  */
4572 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4573 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4574 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4575 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4576 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4577 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4578 
4579 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4580 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4581 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4582 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4583 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4584 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4585 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4586 
4587 /*
4588  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4589  */
4590 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4591 {
4592     return float16_to_float32(a, true, s);
4593 }
4594 
4595 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4596 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4597 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4598 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4599 
4600 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4601 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4602 
4603 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4604 /* (TD, T2, TX2) */
4605 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4606 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4607 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4608 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4609 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4610 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4611 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4612 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4613 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4614 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4615 
4616 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4617 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4618 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4619 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4620 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4621 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4622 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4623 
4624 /*
4625  * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4626  */
4627 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4628 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4629 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4630 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4631 
4632 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4633 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4634 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4635 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4636 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4637 
4638 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4639 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4640 {
4641     return float32_to_float16(a, true, s);
4642 }
4643 
4644 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4645 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4646 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4647 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4648 
4649 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4650 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4651 
4652 /*
4653  * Vector Reduction Operations
4654  */
4655 /* Vector Single-Width Integer Reduction Instructions */
4656 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4657 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4658                   void *vs2, CPURISCVState *env,          \
4659                   uint32_t desc)                          \
4660 {                                                         \
4661     uint32_t vm = vext_vm(desc);                          \
4662     uint32_t vl = env->vl;                                \
4663     uint32_t esz = sizeof(TD);                            \
4664     uint32_t vlenb = simd_maxsz(desc);                    \
4665     uint32_t vta = vext_vta(desc);                        \
4666     uint32_t i;                                           \
4667     TD s1 =  *((TD *)vs1 + HD(0));                        \
4668                                                           \
4669     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4670                                                           \
4671     for (i = env->vstart; i < vl; i++) {                  \
4672         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4673         if (!vm && !vext_elem_mask(v0, i)) {              \
4674             continue;                                     \
4675         }                                                 \
4676         s1 = OP(s1, (TD)s2);                              \
4677     }                                                     \
4678     if (vl > 0) {                                         \
4679         *((TD *)vd + HD(0)) = s1;                         \
4680     }                                                     \
4681     env->vstart = 0;                                      \
4682     /* set tail elements to 1s */                         \
4683     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4684 }
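
/*
 * Example: vredsum.vs with vl = 4, vs1[0] = 10 and active vs2 elements
 * {1, 2, 3, 4} leaves vd[0] = 10 + 1 + 2 + 3 + 4 = 20.  Masked-off
 * elements are skipped, and with vl == 0 the destination is left
 * untouched.
 */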
4685 
4686 /* vd[0] = sum(vs1[0], vs2[*]) */
4687 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4688 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4689 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4690 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4691 
4692 /* vd[0] = maxu(vs1[0], vs2[*]) */
4693 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4694 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4695 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4696 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4697 
4698 /* vd[0] = max(vs1[0], vs2[*]) */
4699 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4700 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4701 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4702 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4703 
4704 /* vd[0] = minu(vs1[0], vs2[*]) */
4705 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4706 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4707 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4708 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4709 
4710 /* vd[0] = min(vs1[0], vs2[*]) */
4711 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4712 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4713 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4714 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4715 
4716 /* vd[0] = and(vs1[0], vs2[*]) */
4717 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4718 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4719 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4720 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4721 
4722 /* vd[0] = or(vs1[0], vs2[*]) */
4723 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4724 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4725 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4726 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4727 
4728 /* vd[0] = xor(vs1[0], vs2[*]) */
4729 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4730 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4731 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4732 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4733 
4734 /* Vector Widening Integer Reduction Instructions */
4735 /* Signed sum reduction into double-width accumulator */
4736 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4737 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4738 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4739 
4740 /* Unsigned sum reduction into double-width accumulator */
4741 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4742 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4743 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4744 
4745 /* Vector Single-Width Floating-Point Reduction Instructions */
4746 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4747 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4748                   void *vs2, CPURISCVState *env,           \
4749                   uint32_t desc)                           \
4750 {                                                          \
4751     uint32_t vm = vext_vm(desc);                           \
4752     uint32_t vl = env->vl;                                 \
4753     uint32_t esz = sizeof(TD);                             \
4754     uint32_t vlenb = simd_maxsz(desc);                     \
4755     uint32_t vta = vext_vta(desc);                         \
4756     uint32_t i;                                            \
4757     TD s1 =  *((TD *)vs1 + HD(0));                         \
4758                                                            \
4759     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4760                                                            \
4761     for (i = env->vstart; i < vl; i++) {                   \
4762         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4763         if (!vm && !vext_elem_mask(v0, i)) {               \
4764             continue;                                      \
4765         }                                                  \
4766         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4767     }                                                      \
4768     if (vl > 0) {                                          \
4769         *((TD *)vd + HD(0)) = s1;                          \
4770     }                                                      \
4771     env->vstart = 0;                                       \
4772     /* set tail elements to 1s */                          \
4773     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4774 }
4775 
4776 /* Unordered sum */
4777 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4778 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4779 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4780 
4781 /* Ordered sum */
4782 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4783 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4784 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4785 
4786 /* Maximum value */
4787 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4788               float16_maximum_number)
4789 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4790               float32_maximum_number)
4791 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4792               float64_maximum_number)
4793 
4794 /* Minimum value */
4795 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4796               float16_minimum_number)
4797 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4798               float32_minimum_number)
4799 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4800               float64_minimum_number)
4801 
4802 /* Vector Widening Floating-Point Add Instructions */
4803 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4804 {
4805     return float32_add(a, float16_to_float32(b, true, s), s);
4806 }
4807 
4808 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4809 {
4810     return float64_add(a, float32_to_float64(b, s), s);
4811 }
4812 
4813 /* Vector Widening Floating-Point Reduction Instructions */
4814 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4815 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4816 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4817 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4818 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4819 
4820 /*
4821  * Vector Mask Operations
4822  */
4823 /* Vector Mask-Register Logical Instructions */
4824 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4825 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4826                   void *vs2, CPURISCVState *env,          \
4827                   uint32_t desc)                          \
4828 {                                                         \
4829     uint32_t vl = env->vl;                                \
4830     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4831     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4832     uint32_t i;                                           \
4833     int a, b;                                             \
4834                                                           \
4835     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4836                                                           \
4837     for (i = env->vstart; i < vl; i++) {                  \
4838         a = vext_elem_mask(vs1, i);                       \
4839         b = vext_elem_mask(vs2, i);                       \
4840         vext_set_elem_mask(vd, i, OP(b, a));              \
4841     }                                                     \
4842     env->vstart = 0;                                      \
4843     /*                                                    \
4844      * mask destination registers are always tail-agnostic \
4845      * set tail elements to 1s                              \
4846      */                                                   \
4847     if (vta_all_1s) {                                     \
4848         for (; i < total_elems; i++) {                    \
4849             vext_set_elem_mask(vd, i, 1);                 \
4850         }                                                 \
4851     }                                                     \
4852 }
4853 
4854 #define DO_NAND(N, M)  (!(N & M))
4855 #define DO_ANDNOT(N, M)  (N & !M)
4856 #define DO_NOR(N, M)  (!(N | M))
4857 #define DO_ORNOT(N, M)  (N | !M)
4858 #define DO_XNOR(N, M)  (!(N ^ M))
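
/*
 * The operands are single mask bits (0 or 1) as returned by
 * vext_elem_mask(), so logical negation is sufficient here, e.g.
 * DO_ANDNOT(1, 0) = 1, DO_ANDNOT(1, 1) = 0, DO_XNOR(0, 0) = 1.
 */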
4859 
4860 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4861 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4862 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4863 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4864 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4865 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4866 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4867 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4868 
4869 /* Vector count population in mask vcpop */
4870 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4871                              uint32_t desc)
4872 {
4873     target_ulong cnt = 0;
4874     uint32_t vm = vext_vm(desc);
4875     uint32_t vl = env->vl;
4876     int i;
4877 
4878     for (i = env->vstart; i < vl; i++) {
4879         if (vm || vext_elem_mask(v0, i)) {
4880             if (vext_elem_mask(vs2, i)) {
4881                 cnt++;
4882             }
4883         }
4884     }
4885     env->vstart = 0;
4886     return cnt;
4887 }
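
/*
 * Example: with vl = 8, vm = 1 (unmasked) and vs2 mask bits
 * 1,0,1,1,0,0,1,0 the helper returns 4.
 */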
4888 
4889 /* vfirst find-first-set mask bit */
4890 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4891                               uint32_t desc)
4892 {
4893     uint32_t vm = vext_vm(desc);
4894     uint32_t vl = env->vl;
4895     int i;
4896 
4897     for (i = env->vstart; i < vl; i++) {
4898         if (vm || vext_elem_mask(v0, i)) {
4899             if (vext_elem_mask(vs2, i)) {
4900                 return i;
4901             }
4902         }
4903     }
4904     env->vstart = 0;
4905     return -1LL;
4906 }
4907 
4908 enum set_mask_type {
4909     ONLY_FIRST = 1,
4910     INCLUDE_FIRST,
4911     BEFORE_FIRST,
4912 };
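
/*
 * Example for a source mask whose first set bit is element 2
 * (vs2 mask = ...00000100) with all elements active:
 *   vmsbf.m (BEFORE_FIRST)  -> ...00000011
 *   vmsif.m (INCLUDE_FIRST) -> ...00000111
 *   vmsof.m (ONLY_FIRST)    -> ...00000100
 */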
4913 
4914 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4915                    uint32_t desc, enum set_mask_type type)
4916 {
4917     uint32_t vm = vext_vm(desc);
4918     uint32_t vl = env->vl;
4919     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4920     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4921     uint32_t vma = vext_vma(desc);
4922     int i;
4923     bool first_mask_bit = false;
4924 
4925     VSTART_CHECK_EARLY_EXIT(env, vl);
4926 
4927     for (i = env->vstart; i < vl; i++) {
4928         if (!vm && !vext_elem_mask(v0, i)) {
4929             /* set masked-off elements to 1s */
4930             if (vma) {
4931                 vext_set_elem_mask(vd, i, 1);
4932             }
4933             continue;
4934         }
4935         /* write a zero to all following active elements */
4936         if (first_mask_bit) {
4937             vext_set_elem_mask(vd, i, 0);
4938             continue;
4939         }
4940         if (vext_elem_mask(vs2, i)) {
4941             first_mask_bit = true;
4942             if (type == BEFORE_FIRST) {
4943                 vext_set_elem_mask(vd, i, 0);
4944             } else {
4945                 vext_set_elem_mask(vd, i, 1);
4946             }
4947         } else {
4948             if (type == ONLY_FIRST) {
4949                 vext_set_elem_mask(vd, i, 0);
4950             } else {
4951                 vext_set_elem_mask(vd, i, 1);
4952             }
4953         }
4954     }
4955     env->vstart = 0;
4956     /*
4957      * mask destination registers are always tail-agnostic
4958      * set tail elements to 1s
4959      */
4960     if (vta_all_1s) {
4961         for (; i < total_elems; i++) {
4962             vext_set_elem_mask(vd, i, 1);
4963         }
4964     }
4965 }
4966 
4967 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4968                      uint32_t desc)
4969 {
4970     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4971 }
4972 
4973 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4974                      uint32_t desc)
4975 {
4976     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4977 }
4978 
4979 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4980                      uint32_t desc)
4981 {
4982     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4983 }
4984 
4985 /* Vector Iota Instruction */
4986 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4987 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4988                   uint32_t desc)                                          \
4989 {                                                                         \
4990     uint32_t vm = vext_vm(desc);                                          \
4991     uint32_t vl = env->vl;                                                \
4992     uint32_t esz = sizeof(ETYPE);                                         \
4993     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4994     uint32_t vta = vext_vta(desc);                                        \
4995     uint32_t vma = vext_vma(desc);                                        \
4996     uint32_t sum = 0;                                                     \
4997     int i;                                                                \
4998                                                                           \
4999     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5000                                                                           \
5001     for (i = env->vstart; i < vl; i++) {                                  \
5002         if (!vm && !vext_elem_mask(v0, i)) {                              \
5003             /* set masked-off elements to 1s */                           \
5004             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5005             continue;                                                     \
5006         }                                                                 \
5007         *((ETYPE *)vd + H(i)) = sum;                                      \
5008         if (vext_elem_mask(vs2, i)) {                                     \
5009             sum++;                                                        \
5010         }                                                                 \
5011     }                                                                     \
5012     env->vstart = 0;                                                      \
5013     /* set tail elements to 1s */                                         \
5014     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5015 }
5016 
5017 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5018 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5019 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5020 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
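
/*
 * Example: an unmasked viota.m with vl = 4 and source mask bits
 * {1, 0, 1, 1} (element 0 first) produces vd = {0, 1, 1, 2}: each element
 * receives the count of set mask bits strictly before it.
 */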
5021 
5022 /* Vector Element Index Instruction */
5023 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5024 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5025 {                                                                         \
5026     uint32_t vm = vext_vm(desc);                                          \
5027     uint32_t vl = env->vl;                                                \
5028     uint32_t esz = sizeof(ETYPE);                                         \
5029     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5030     uint32_t vta = vext_vta(desc);                                        \
5031     uint32_t vma = vext_vma(desc);                                        \
5032     int i;                                                                \
5033                                                                           \
5034     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5035                                                                           \
5036     for (i = env->vstart; i < vl; i++) {                                  \
5037         if (!vm && !vext_elem_mask(v0, i)) {                              \
5038             /* set masked-off elements to 1s */                           \
5039             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5040             continue;                                                     \
5041         }                                                                 \
5042         *((ETYPE *)vd + H(i)) = i;                                        \
5043     }                                                                     \
5044     env->vstart = 0;                                                      \
5045     /* set tail elements to 1s */                                         \
5046     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5047 }
5048 
5049 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5050 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5051 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5052 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5053 
5054 /*
5055  * Vector Permutation Instructions
5056  */
5057 
5058 /* Vector Slide Instructions */
5059 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5060 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5061                   CPURISCVState *env, uint32_t desc)                      \
5062 {                                                                         \
5063     uint32_t vm = vext_vm(desc);                                          \
5064     uint32_t vl = env->vl;                                                \
5065     uint32_t esz = sizeof(ETYPE);                                         \
5066     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5067     uint32_t vta = vext_vta(desc);                                        \
5068     uint32_t vma = vext_vma(desc);                                        \
5069     target_ulong offset = s1, i_min, i;                                   \
5070                                                                           \
5071     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5072                                                                           \
5073     i_min = MAX(env->vstart, offset);                                     \
5074     for (i = i_min; i < vl; i++) {                                        \
5075         if (!vm && !vext_elem_mask(v0, i)) {                              \
5076             /* set masked-off elements to 1s */                           \
5077             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5078             continue;                                                     \
5079         }                                                                 \
5080         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5081     }                                                                     \
5082     env->vstart = 0;                                                      \
5083     /* set tail elements to 1s */                                         \
5084     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5085 }
5086 
5087 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5088 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5089 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5090 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5091 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
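
/*
 * Example: an unmasked vslideup.vx with vl = 4 and rs1 = 2 writes
 * vd[2] = vs2[0] and vd[3] = vs2[1], while destination elements below the
 * offset (vd[0] and vd[1]) are left unchanged.
 */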
5092 
5093 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5094 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5095                   CPURISCVState *env, uint32_t desc)                      \
5096 {                                                                         \
5097     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5098     uint32_t vm = vext_vm(desc);                                          \
5099     uint32_t vl = env->vl;                                                \
5100     uint32_t esz = sizeof(ETYPE);                                         \
5101     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5102     uint32_t vta = vext_vta(desc);                                        \
5103     uint32_t vma = vext_vma(desc);                                        \
5104     target_ulong i_max, i_min, i;                                         \
5105                                                                           \
5106     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5107                                                                           \
5108     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5109     i_max = MAX(i_min, env->vstart);                                      \
5110     for (i = env->vstart; i < i_max; ++i) {                               \
5111         if (!vm && !vext_elem_mask(v0, i)) {                              \
5112             /* set masked-off elements to 1s */                           \
5113             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5114             continue;                                                     \
5115         }                                                                 \
5116         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5117     }                                                                     \
5118                                                                           \
5119     for (i = i_max; i < vl; ++i) {                                        \
5120         if (vm || vext_elem_mask(v0, i)) {                                \
5121             *((ETYPE *)vd + H(i)) = 0;                                    \
5122         }                                                                 \
5123     }                                                                     \
5124                                                                           \
5125     env->vstart = 0;                                                      \
5126     /* set tail elements to 1s */                                         \
5127     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5128 }
5129 
5130 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5131 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5132 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5133 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5134 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
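
/*
 * Example: an unmasked vslidedown.vx with vl = 4, vlmax = 8 and rs1 = 6
 * copies vs2[6] and vs2[7] into vd[0] and vd[1]; destination elements 2
 * and 3 would read past the source group (i + rs1 >= vlmax), so the
 * second loop above writes zero to them.  i_max marks that boundary.
 */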
5135 
5136 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5137 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5138                                  void *vs2, CPURISCVState *env,             \
5139                                  uint32_t desc)                             \
5140 {                                                                           \
5141     typedef uint##BITWIDTH##_t ETYPE;                                       \
5142     uint32_t vm = vext_vm(desc);                                            \
5143     uint32_t vl = env->vl;                                                  \
5144     uint32_t esz = sizeof(ETYPE);                                           \
5145     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5146     uint32_t vta = vext_vta(desc);                                          \
5147     uint32_t vma = vext_vma(desc);                                          \
5148     uint32_t i;                                                             \
5149                                                                             \
5150     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5151                                                                             \
5152     for (i = env->vstart; i < vl; i++) {                                    \
5153         if (!vm && !vext_elem_mask(v0, i)) {                                \
5154             /* set masked-off elements to 1s */                             \
5155             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5156             continue;                                                       \
5157         }                                                                   \
5158         if (i == 0) {                                                       \
5159             *((ETYPE *)vd + H(i)) = s1;                                     \
5160         } else {                                                            \
5161             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5162         }                                                                   \
5163     }                                                                       \
5164     env->vstart = 0;                                                        \
5165     /* set tail elements to 1s */                                           \
5166     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5167 }
5168 
5169 GEN_VEXT_VSLIDE1UP(8,  H1)
5170 GEN_VEXT_VSLIDE1UP(16, H2)
5171 GEN_VEXT_VSLIDE1UP(32, H4)
5172 GEN_VEXT_VSLIDE1UP(64, H8)
5173 
5174 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5175 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5176                   CPURISCVState *env, uint32_t desc)              \
5177 {                                                                 \
5178     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5179 }
5180 
5181 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5182 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5183 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5184 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5185 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
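
/*
 * Illustrative reference model (a sketch with a made-up name, not one of
 * the generated helpers): vslide1up.vx places the scalar x[rs1] in
 * element 0 and shifts the vs2 elements up by one position.  Masking,
 * vstart and tail handling are omitted here.
 */
static inline void vslide1up_ref_u32(uint32_t *vd, const uint32_t *vs2,
                                     uint32_t rs1, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = (i == 0) ? rs1 : vs2[i - 1];
    }
}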
5186 
5187 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5188 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5189                                    void *vs2, CPURISCVState *env,             \
5190                                    uint32_t desc)                             \
5191 {                                                                             \
5192     typedef uint##BITWIDTH##_t ETYPE;                                         \
5193     uint32_t vm = vext_vm(desc);                                              \
5194     uint32_t vl = env->vl;                                                    \
5195     uint32_t esz = sizeof(ETYPE);                                             \
5196     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5197     uint32_t vta = vext_vta(desc);                                            \
5198     uint32_t vma = vext_vma(desc);                                            \
5199     uint32_t i;                                                               \
5200                                                                               \
5201     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5202                                                                               \
5203     for (i = env->vstart; i < vl; i++) {                                      \
5204         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5205             /* set masked-off elements to 1s */                               \
5206             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5207             continue;                                                         \
5208         }                                                                     \
5209         if (i == vl - 1) {                                                    \
5210             *((ETYPE *)vd + H(i)) = s1;                                       \
5211         } else {                                                              \
5212             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5213         }                                                                     \
5214     }                                                                         \
5215     env->vstart = 0;                                                          \
5216     /* set tail elements to 1s */                                             \
5217     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5218 }
5219 
5220 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5221 GEN_VEXT_VSLIDE1DOWN(16, H2)
5222 GEN_VEXT_VSLIDE1DOWN(32, H4)
5223 GEN_VEXT_VSLIDE1DOWN(64, H8)
5224 
5225 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5226 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5227                   CPURISCVState *env, uint32_t desc)              \
5228 {                                                                 \
5229     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5230 }
5231 
5232 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5233 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5234 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5235 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5236 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
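
/*
 * Note: vslide1down above is the mirror image of vslide1up: the scalar
 * x[rs1] is written to the last active element (i == vl - 1) instead of
 * element 0, and the other elements are read from vs2[i + 1] rather
 * than vs2[i - 1].  The mask, vstart and tail handling is otherwise
 * identical.
 */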
5237 
5238 /* Vector Floating-Point Slide Instructions */
5239 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5240 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5241                   CPURISCVState *env, uint32_t desc)          \
5242 {                                                             \
5243     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5244 }
5245 
5246 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5247 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5248 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5249 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5250 
5251 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5252 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5253                   CPURISCVState *env, uint32_t desc)          \
5254 {                                                             \
5255     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5256 }
5257 
5258 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5259 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5260 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5261 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
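
/*
 * Note: the floating-point slide1 helpers above reuse the integer
 * vslide1up/vslide1down implementations.  The scalar operand arrives in
 * s1 as a raw uint64_t bit pattern, so the element is moved as bits with
 * no FP arithmetic involved (any NaN-boxing of narrower FP scalars is
 * assumed to have been done by the caller before the helper is invoked).
 */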
5262 
5263 /* Vector Register Gather Instructions */
5264 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5265 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5266                   CPURISCVState *env, uint32_t desc)                      \
5267 {                                                                         \
5268     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5269     uint32_t vm = vext_vm(desc);                                          \
5270     uint32_t vl = env->vl;                                                \
5271     uint32_t esz = sizeof(TS2);                                           \
5272     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5273     uint32_t vta = vext_vta(desc);                                        \
5274     uint32_t vma = vext_vma(desc);                                        \
5275     uint64_t index;                                                       \
5276     uint32_t i;                                                           \
5277                                                                           \
5278     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5279                                                                           \
5280     for (i = env->vstart; i < vl; i++) {                                  \
5281         if (!vm && !vext_elem_mask(v0, i)) {                              \
5282             /* set masked-off elements to 1s */                           \
5283             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5284             continue;                                                     \
5285         }                                                                 \
5286         index = *((TS1 *)vs1 + HS1(i));                                   \
5287         if (index >= vlmax) {                                             \
5288             *((TS2 *)vd + HS2(i)) = 0;                                    \
5289         } else {                                                          \
5290             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5291         }                                                                 \
5292     }                                                                     \
5293     env->vstart = 0;                                                      \
5294     /* set tail elements to 1s */                                         \
5295     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5296 }
5297 
5298 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5299 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5300 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5301 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5302 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5303 
5304 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5305 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5306 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5307 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
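
/*
 * Illustrative reference model (a sketch with a made-up name, not one of
 * the generated helpers): vrgatherei16 reads its indices from vs1 with a
 * fixed 16-bit element width while the data elements of vs2/vd use SEW,
 * which is why the macro takes separate TS1/HS1 and TS2/HS2 parameters.
 * Masking, vstart and tail handling are omitted here.
 */
static inline void vrgatherei16_ref_u32(uint32_t *vd, const uint16_t *vs1,
                                        const uint32_t *vs2, uint32_t vl,
                                        uint32_t vlmax)
{
    for (uint32_t i = 0; i < vl; i++) {
        uint64_t index = vs1[i];

        /* out-of-range indices produce 0 rather than trapping */
        vd[i] = (index >= vlmax) ? 0 : vs2[index];
    }
}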
5308 
5309 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5310 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5311                   CPURISCVState *env, uint32_t desc)                      \
5312 {                                                                         \
5313     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5314     uint32_t vm = vext_vm(desc);                                          \
5315     uint32_t vl = env->vl;                                                \
5316     uint32_t esz = sizeof(ETYPE);                                         \
5317     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5318     uint32_t vta = vext_vta(desc);                                        \
5319     uint32_t vma = vext_vma(desc);                                        \
5320     uint64_t index = s1;                                                  \
5321     uint32_t i;                                                           \
5322                                                                           \
5323     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5324                                                                           \
5325     for (i = env->vstart; i < vl; i++) {                                  \
5326         if (!vm && !vext_elem_mask(v0, i)) {                              \
5327             /* set masked-off elements to 1s */                           \
5328             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5329             continue;                                                     \
5330         }                                                                 \
5331         if (index >= vlmax) {                                             \
5332             *((ETYPE *)vd + H(i)) = 0;                                    \
5333         } else {                                                          \
5334             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5335         }                                                                 \
5336     }                                                                     \
5337     env->vstart = 0;                                                      \
5338     /* set tail elements to 1s */                                         \
5339     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5340 }
5341 
5342 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5343 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5344 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5345 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5346 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5347 
5348 /* Vector Compress Instruction */
5349 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5350 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5351                   CPURISCVState *env, uint32_t desc)                      \
5352 {                                                                         \
5353     uint32_t vl = env->vl;                                                \
5354     uint32_t esz = sizeof(ETYPE);                                         \
5355     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5356     uint32_t vta = vext_vta(desc);                                        \
5357     uint32_t num = 0, i;                                                  \
5358                                                                           \
5359     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5360                                                                           \
5361     for (i = env->vstart; i < vl; i++) {                                  \
5362         if (!vext_elem_mask(vs1, i)) {                                    \
5363             continue;                                                     \
5364         }                                                                 \
5365         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5366         num++;                                                            \
5367     }                                                                     \
5368     env->vstart = 0;                                                      \
5369     /* set tail elements to 1s */                                         \
5370     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5371 }
5372 
5373 /* Compress the vs2 elements selected by mask vs1 into contiguous vd elements */
5374 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5375 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5376 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5377 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
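
/*
 * Illustrative reference model (a sketch with a made-up name, not one of
 * the generated helpers): vcompress packs the vs2 elements whose vs1 mask
 * bit is set into the low-numbered elements of vd.  The real helper reads
 * the mask from a packed mask register via vext_elem_mask(); here it is
 * modelled as a plain byte array to keep the sketch self-contained.
 */
static inline uint32_t vcompress_ref_u32(uint32_t *vd, const uint8_t *mask,
                                         const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;

    for (uint32_t i = 0; i < vl; i++) {
        if (mask[i]) {
            vd[num++] = vs2[i];
        }
    }
    /* vd[num] onwards are tail elements, handled by the caller via vta */
    return num;
}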
5378 
5379 /* Vector Whole Register Move */
5380 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5381 {
5382     /* EEW = SEW */
5383     uint32_t maxsz = simd_maxsz(desc);
5384     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5385     uint32_t startb = env->vstart * sewb;
5386     uint32_t i = startb;
5387 
5388     if (startb >= maxsz) {
5389         env->vstart = 0;
5390         return;
5391     }
5392 
5393     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5394         uint32_t j = ROUND_UP(i, 8);
5395         memcpy((uint8_t *)vd + H1(j - 1),
5396                (uint8_t *)vs2 + H1(j - 1),
5397                j - i);
5398         i = j;
5399     }
5400 
5401     memcpy((uint8_t *)vd + H1(i),
5402            (uint8_t *)vs2 + H1(i),
5403            maxsz - i);
5404 
5405     env->vstart = 0;
5406 }
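
/*
 * Note on vmvr_v above: on a little-endian host H1() is the identity
 * mapping, so the big-endian branch is skipped and the whole-register
 * move reduces to a single memcpy of maxsz - startb bytes starting at
 * byte offset startb = vstart * SEW / 8.  For example (hypothetical
 * values), with SEW = 32 (vsew = 2) and vstart = 3, sewb = 4 and the
 * copy starts at byte 12 of the register group.
 */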
5407 
5408 /* Vector Integer Extension */
5409 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5410 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5411                   CPURISCVState *env, uint32_t desc)             \
5412 {                                                                \
5413     uint32_t vl = env->vl;                                       \
5414     uint32_t vm = vext_vm(desc);                                 \
5415     uint32_t esz = sizeof(ETYPE);                                \
5416     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5417     uint32_t vta = vext_vta(desc);                               \
5418     uint32_t vma = vext_vma(desc);                               \
5419     uint32_t i;                                                  \
5420                                                                  \
5421     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5422                                                                  \
5423     for (i = env->vstart; i < vl; i++) {                         \
5424         if (!vm && !vext_elem_mask(v0, i)) {                     \
5425             /* set masked-off elements to 1s */                  \
5426             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5427             continue;                                            \
5428         }                                                        \
5429         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5430     }                                                            \
5431     env->vstart = 0;                                             \
5432     /* set tail elements to 1s */                                \
5433     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5434 }
5435 
5436 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5437 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5438 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5439 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5440 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5441 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5442 
5443 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5444 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5445 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5446 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5447 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5448 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
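
/*
 * Illustrative reference model (a sketch with a made-up name, not one of
 * the generated helpers): the vf2/vf4/vf8 integer extensions differ only
 * in the source element type, so each element is widened by an ordinary
 * C integer conversion; the signedness of DTYPE selects between zero-
 * and sign-extension.  Masking, vstart and tail handling are omitted.
 */
static inline void vsext_vf2_ref_h(int16_t *vd, const int8_t *vs2, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        /* the int8_t -> int16_t conversion performs the sign-extension */
        vd[i] = vs2[i];
    }
}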
5449