xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 5814c08467937154745c6cb2b3400800b98ff897)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg-gvec-desc.h"
30 #include "internals.h"
31 #include "vector_internals.h"
32 #include <math.h>
33 
34 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
35                             target_ulong s2)
36 {
37     int vlmax, vl;
38     RISCVCPU *cpu = env_archcpu(env);
39     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
40     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
41     uint16_t sew = 8 << vsew;
42     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
43     int xlen = riscv_cpu_xlen(env);
44     bool vill = (s2 >> (xlen - 1)) & 0x1;
45     target_ulong reserved = s2 &
46                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
47                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
48     uint16_t vlen = cpu->cfg.vlenb << 3;
49     int8_t lmul;
50 
51     if (vlmul & 4) {
52         /*
53          * Fractional LMUL, check:
54          *
55          * VLEN * LMUL >= SEW
56          * VLEN >> (8 - lmul) >= sew
57          * (vlenb << 3) >> (8 - lmul) >= sew
58          */
59         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
60             vill = true;
61         }
62     }
63 
64     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65         /* only set vill bit. */
66         env->vill = 1;
67         env->vtype = 0;
68         env->vl = 0;
69         env->vstart = 0;
70         return 0;
71     }
72 
73     /* lmul encoded as in DisasContext::lmul */
74     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
76     if (s1 <= vlmax) {
77         vl = s1;
78     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
79         vl = (s1 + 1) >> 1;
80     } else {
81         vl = vlmax;
82     }
83     env->vl = vl;
84     env->vtype = s2;
85     env->vstart = 0;
86     env->vill = 0;
87     return vl;
88 }
89 
/*
 * Get the maximum number of elements that can be operated on (VLMAX).
 *
 * desc:     simd descriptor; its maxsz field holds vlenb.
 * log2_esz: log2 of element size in bytes.
 */
static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
{
    /*
     * simd_desc supports at most 2048 bytes, so the max vlen is 1024 bits
     * and vlen in bytes (vlenb) is what gets encoded as maxsz.
     */
    uint32_t vlenb = simd_maxsz(desc);
    /* VLMAX = vlenb * 2^(lmul - log2_esz); the exponent may be negative. */
    int scale = vext_lmul(desc) - log2_esz;

    if (scale < 0) {
        return vlenb >> -scale;
    }
    return vlenb << scale;
}
107 
108 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
109 {
110     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
111 }
112 
113 /*
114  * This function checks watchpoint before real load operation.
115  *
116  * In system mode, the TLB API probe_access is enough for watchpoint check.
117  * In user mode, there is no watchpoint support now.
118  *
119  * It will trigger an exception if there is no mapping in TLB
120  * and page table walk can't fill the TLB entry. Then the guest
121  * software can return here after process the exception or never return.
122  */
123 static void probe_pages(CPURISCVState *env, target_ulong addr,
124                         target_ulong len, uintptr_t ra,
125                         MMUAccessType access_type)
126 {
127     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
128     target_ulong curlen = MIN(pagelen, len);
129     int mmu_index = riscv_env_mmu_index(env, false);
130 
131     probe_access(env, adjust_addr(env, addr), curlen, access_type,
132                  mmu_index, ra);
133     if (len > curlen) {
134         addr += curlen;
135         curlen = len - curlen;
136         probe_access(env, adjust_addr(env, addr), curlen, access_type,
137                      mmu_index, ra);
138     }
139 }
140 
/*
 * Write a single mask bit: v0 is treated as an array of 64-bit words and
 * bit (index % 64) of word (index / 64) is replaced by value.
 */
static inline void vext_set_elem_mask(void *v0, int index,
                                      uint8_t value)
{
    uint64_t *word = (uint64_t *)v0 + index / 64;

    *word = deposit64(*word, index % 64, 1, value);
}
149 
/*
 * Element operations for load and store.
 *
 * vext_ldst_elem_fn_tlb:  transfer element idx of vd to/from guest address
 *                         addr; retaddr is forwarded to the guest memory
 *                         accessor.
 * vext_ldst_elem_fn_host: transfer element idx of vd to/from the host
 *                         pointer host.
 */
typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
                                   uint32_t idx, void *vd, uintptr_t retaddr);
typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
154 
155 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
156 static inline QEMU_ALWAYS_INLINE                            \
157 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
158                 uint32_t idx, void *vd, uintptr_t retaddr)  \
159 {                                                           \
160     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
161     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
162 }                                                           \
163                                                             \
164 static inline QEMU_ALWAYS_INLINE                            \
165 void NAME##_host(void *vd, uint32_t idx, void *host)        \
166 {                                                           \
167     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
168     *cur = (ETYPE)LDSUF##_p(host);                          \
169 }
170 
171 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
172 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
173 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
174 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
175 
176 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
177 static inline QEMU_ALWAYS_INLINE                            \
178 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
179                 uint32_t idx, void *vd, uintptr_t retaddr)  \
180 {                                                           \
181     ETYPE data = *((ETYPE *)vd + H(idx));                   \
182     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
183 }                                                           \
184                                                             \
185 static inline QEMU_ALWAYS_INLINE                            \
186 void NAME##_host(void *vd, uint32_t idx, void *host)        \
187 {                                                           \
188     ETYPE data = *((ETYPE *)vd + H(idx));                   \
189     STSUF##_p(host, data);                                  \
190 }
191 
192 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
193 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
194 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
195 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
196 
197 static inline QEMU_ALWAYS_INLINE void
198 vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
199                        void *vd, uint32_t evl, target_ulong addr,
200                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
201                        bool is_load)
202 {
203     uint32_t i;
204     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
205         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
206     }
207 }
208 
209 static inline QEMU_ALWAYS_INLINE void
210 vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
211                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
212                         uint32_t esz, bool is_load)
213 {
214 #if HOST_BIG_ENDIAN
215     for (; reg_start < evl; reg_start++, host += esz) {
216         ldst_host(vd, reg_start, host);
217     }
218 #else
219     if (esz == 1) {
220         uint32_t byte_offset = reg_start * esz;
221         uint32_t size = (evl - reg_start) * esz;
222 
223         if (is_load) {
224             memcpy(vd + byte_offset, host, size);
225         } else {
226             memcpy(host, vd + byte_offset, size);
227         }
228     } else {
229         for (; reg_start < evl; reg_start++, host += esz) {
230             ldst_host(vd, reg_start, host);
231         }
232     }
233 #endif
234 }
235 
236 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
237                                    uint32_t desc, uint32_t nf,
238                                    uint32_t esz, uint32_t max_elems)
239 {
240     uint32_t vta = vext_vta(desc);
241     int k;
242 
243     if (vta == 0) {
244         return;
245     }
246 
247     for (k = 0; k < nf; ++k) {
248         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
249                           (k * max_elems + max_elems) * esz);
250     }
251 }
252 
253 /*
254  * stride: access vector element from strided memory
255  */
256 static void
257 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
258                  CPURISCVState *env, uint32_t desc, uint32_t vm,
259                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
260                  uintptr_t ra)
261 {
262     uint32_t i, k;
263     uint32_t nf = vext_nf(desc);
264     uint32_t max_elems = vext_max_elems(desc, log2_esz);
265     uint32_t esz = 1 << log2_esz;
266     uint32_t vma = vext_vma(desc);
267 
268     VSTART_CHECK_EARLY_EXIT(env);
269 
270     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
271         k = 0;
272         while (k < nf) {
273             if (!vm && !vext_elem_mask(v0, i)) {
274                 /* set masked-off elements to 1s */
275                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
276                                   (i + k * max_elems + 1) * esz);
277                 k++;
278                 continue;
279             }
280             target_ulong addr = base + stride * i + (k << log2_esz);
281             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
282             k++;
283         }
284     }
285     env->vstart = 0;
286 
287     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
288 }
289 
290 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
291 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
292                   target_ulong stride, CPURISCVState *env,              \
293                   uint32_t desc)                                        \
294 {                                                                       \
295     uint32_t vm = vext_vm(desc);                                        \
296     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
297                      ctzl(sizeof(ETYPE)), GETPC());                     \
298 }
299 
300 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
301 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
302 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
303 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
304 
305 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
306 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
307                   target_ulong stride, CPURISCVState *env,              \
308                   uint32_t desc)                                        \
309 {                                                                       \
310     uint32_t vm = vext_vm(desc);                                        \
311     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
312                      ctzl(sizeof(ETYPE)), GETPC());                     \
313 }
314 
315 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
316 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
317 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
318 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
319 
320 /*
321  * unit-stride: access elements stored contiguously in memory
322  */
323 
/*
 * Unmasked unit-stride load and store operation for one run of elements.
 *
 * Transfers elems elements, each consisting of nf fields of
 * (1 << log2_esz) bytes, starting at element index env->vstart and guest
 * address addr.  Field k of element i lives in register slot
 * i + k * max_elems.
 *
 * The whole byte range is probed first.  When the probe reports no flags,
 * the returned host pointer is used directly and env->vstart is bumped by
 * elems once at the end.  Otherwise every field goes through ldst_tlb and
 * env->vstart is advanced after each completed element.
 *
 * NOTE(review): the callers in this file describe each run as lying within
 * a single page ("first page" / "second page") -- confirm before calling
 * with a wider range.
 */
static inline QEMU_ALWAYS_INLINE void
vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
                  uint32_t elems, uint32_t nf, uint32_t max_elems,
                  uint32_t log2_esz, bool is_load, int mmu_index,
                  vext_ldst_elem_fn_tlb *ldst_tlb,
                  vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
{
    void *host;
    int i, k, flags;
    uint32_t esz = 1 << log2_esz;
    /* Total number of bytes covered by this run. */
    uint32_t size = (elems * nf) << log2_esz;
    /* One past the last element index handled by this call. */
    uint32_t evl = env->vstart + elems;
    MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;

    /* Check page permission/pmp/watchpoint/etc. */
    flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
                               mmu_index, true, &host, ra);

    if (flags == 0) {
        /* No flags reported: access the memory through the host pointer. */
        if (nf == 1) {
            vext_continus_ldst_host(env, ldst_host, vd, evl, env->vstart, host,
                                    esz, is_load);
        } else {
            /* Segment access: the nf fields of one element are adjacent. */
            for (i = env->vstart; i < evl; ++i) {
                k = 0;
                while (k < nf) {
                    ldst_host(vd, i + k * max_elems, host);
                    host += esz;
                    k++;
                }
            }
        }
        env->vstart += elems;
    } else {
        /* Some flag is set: go through the per-element guest accessor. */
        if (nf == 1) {
            vext_continus_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
                                   ra, esz, is_load);
        } else {
            /* load/store each field via guest memory, element by element */
            for (i = env->vstart; i < evl; env->vstart = ++i) {
                k = 0;
                while (k < nf) {
                    ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
                             vd, ra);
                    addr += esz;
                    k++;
                }
            }
        }
    }
}
376 
/*
 * Unmasked unit-stride load/store of elements [env->vstart, evl).
 *
 * vd:        vector register data
 * base:      guest address of element 0
 * desc:      descriptor supplying nf and the LMUL used for max_elems
 * ldst_tlb:  per-element accessor going through guest memory
 * ldst_host: per-element accessor operating on a host pointer
 * log2_esz:  log2 of the element size in bytes
 * evl:       effective vector length (number of elements to transfer)
 * ra:        return address forwarded to the memory accessors
 * is_load:   true for loads, false for stores
 *
 * The transfer is split at the first page boundary after the start address:
 * the elements that fit entirely before it, then an element straddling it
 * (if any), then the remaining elements.  env->vstart is cleared on exit
 * and the tail is handled.
 */
static inline QEMU_ALWAYS_INLINE void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
             vext_ldst_elem_fn_tlb *ldst_tlb,
             vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
             uint32_t evl, uintptr_t ra, bool is_load)
{
    uint32_t k;
    target_ulong page_split, elems, addr;
    uint32_t nf = vext_nf(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    /* Bytes occupied in memory by one element (all nf fields). */
    uint32_t msize = nf * esz;
    int mmu_index = riscv_env_mmu_index(env, false);

    /* Nothing left to transfer: just reset vstart. */
    if (env->vstart >= evl) {
        env->vstart = 0;
        return;
    }

    /* Calculate the page range of first page */
    addr = base + ((env->vstart * nf) << log2_esz);
    /* Bytes from addr up to the end of its page. */
    page_split = -(addr | TARGET_PAGE_MASK);
    /* Get number of elements */
    elems = page_split / msize;
    if (unlikely(env->vstart + elems >= evl)) {
        elems = evl - env->vstart;
    }

    /* Load/store elements in the first page */
    if (likely(elems)) {
        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
    }

    /* Load/store elements in the second page */
    if (unlikely(env->vstart < evl)) {
        /* Cross page element */
        if (unlikely(page_split % msize)) {
            /* Transfer its fields one by one through the guest accessor. */
            for (k = 0; k < nf; k++) {
                addr = base + ((env->vstart * nf + k) << log2_esz);
                ldst_tlb(env, adjust_addr(env, addr),
                        env->vstart + k * max_elems, vd, ra);
            }
            env->vstart++;
        }

        addr = base + ((env->vstart * nf) << log2_esz);
        /* Get number of elements of second page */
        elems = evl - env->vstart;

        /* Load/store elements in the second page */
        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
    }

    env->vstart = 0;
    vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
}
435 
436 /*
437  * masked unit-stride load and store operation will be a special case of
438  * stride, stride = NF * sizeof (ETYPE)
439  */
440 
441 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
442 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
443                          CPURISCVState *env, uint32_t desc)         \
444 {                                                                   \
445     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
446     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
447                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
448 }                                                                   \
449                                                                     \
450 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
451                   CPURISCVState *env, uint32_t desc)                \
452 {                                                                   \
453     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
454                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
455 }
456 
457 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
458 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
459 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
460 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
461 
462 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
463 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
464                          CPURISCVState *env, uint32_t desc)              \
465 {                                                                        \
466     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
467     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
468                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
469 }                                                                        \
470                                                                          \
471 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
472                   CPURISCVState *env, uint32_t desc)                     \
473 {                                                                        \
474     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
475                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
476 }
477 
478 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
479 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
480 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
481 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
482 
483 /*
484  * unit stride mask load and store, EEW = 1
485  */
486 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
487                     CPURISCVState *env, uint32_t desc)
488 {
489     /* evl = ceil(vl/8) */
490     uint8_t evl = (env->vl + 7) >> 3;
491     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
492                  0, evl, GETPC(), true);
493 }
494 
495 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
496                     CPURISCVState *env, uint32_t desc)
497 {
498     /* evl = ceil(vl/8) */
499     uint8_t evl = (env->vl + 7) >> 3;
500     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
501                  0, evl, GETPC(), false);
502 }
503 
/*
 * index: access vector element from indexed memory
 *
 * vext_get_index_addr: returns the guest address for element idx, computed
 * from base and the index vector vs2.
 */
typedef target_ulong vext_get_index_addr(target_ulong base,
        uint32_t idx, void *vs2);
509 
510 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
511 static target_ulong NAME(target_ulong base,            \
512                          uint32_t idx, void *vs2)      \
513 {                                                      \
514     return (base + *((ETYPE *)vs2 + H(idx)));          \
515 }
516 
517 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
518 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
519 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
520 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
521 
522 static inline void
523 vext_ldst_index(void *vd, void *v0, target_ulong base,
524                 void *vs2, CPURISCVState *env, uint32_t desc,
525                 vext_get_index_addr get_index_addr,
526                 vext_ldst_elem_fn_tlb *ldst_elem,
527                 uint32_t log2_esz, uintptr_t ra)
528 {
529     uint32_t i, k;
530     uint32_t nf = vext_nf(desc);
531     uint32_t vm = vext_vm(desc);
532     uint32_t max_elems = vext_max_elems(desc, log2_esz);
533     uint32_t esz = 1 << log2_esz;
534     uint32_t vma = vext_vma(desc);
535 
536     VSTART_CHECK_EARLY_EXIT(env);
537 
538     /* load bytes from guest memory */
539     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
540         k = 0;
541         while (k < nf) {
542             if (!vm && !vext_elem_mask(v0, i)) {
543                 /* set masked-off elements to 1s */
544                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
545                                   (i + k * max_elems + 1) * esz);
546                 k++;
547                 continue;
548             }
549             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
550             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
551             k++;
552         }
553     }
554     env->vstart = 0;
555 
556     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
557 }
558 
559 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
560 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
561                   void *vs2, CPURISCVState *env, uint32_t desc)            \
562 {                                                                          \
563     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
564                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
565 }
566 
567 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
568 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
569 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
570 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
571 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
572 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
573 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
574 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
575 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
576 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
577 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
578 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
579 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
580 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
581 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
582 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
583 
584 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
585 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
586                   void *vs2, CPURISCVState *env, uint32_t desc)  \
587 {                                                                \
588     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
589                     STORE_FN, ctzl(sizeof(ETYPE)),               \
590                     GETPC());                                    \
591 }
592 
593 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
594 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
595 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
596 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
597 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
598 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
599 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
600 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
601 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
602 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
603 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
604 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
605 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
606 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
607 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
608 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
609 
/*
 * unit-stride fault-only-first load instructions
 *
 * The routine works in two phases:
 *
 *  1. Probe phase: every active element in [vstart, vl) is probed.  Element
 *     0 is probed with probe_pages() and the real return address, so it is
 *     allowed to fault.  Later elements are probed page by page with
 *     probe_access_flags() and a zero return address; the first element
 *     whose probe reports any flag other than TLB_WATCHPOINT ends the scan
 *     and its index becomes the new vl.
 *
 *  2. Load phase: elements [vstart, vl) are loaded.  The unmasked case
 *     reuses the page-split scheme of the unit-stride path; the masked case
 *     loads field by field and calls vext_set_elems_1s() for inactive
 *     elements.
 *
 * env->vstart is cleared on exit and the tail is handled using the
 * (possibly reduced) vl.
 */
static inline void
vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
          uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
          vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
{
    /* vl stays 0 unless the probe phase decides to shorten env->vl. */
    uint32_t i, k, vl = 0;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    /* Bytes occupied in memory by one element (all nf fields). */
    uint32_t msize = nf * esz;
    uint32_t vma = vext_vma(desc);
    target_ulong addr, offset, remain, page_split, elems;
    int mmu_index = riscv_env_mmu_index(env, false);

    VSTART_CHECK_EARLY_EXIT(env);

    /* probe every access */
    for (i = env->vstart; i < env->vl; i++) {
        /* Inactive elements are never accessed, so they are not probed. */
        if (!vm && !vext_elem_mask(v0, i)) {
            continue;
        }
        addr = adjust_addr(env, base + i * (nf << log2_esz));
        if (i == 0) {
            /* Allow fault on first element. */
            probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
        } else {
            /* Walk the element's bytes one page at a time. */
            remain = nf << log2_esz;
            while (remain > 0) {
                void *host;
                int flags;

                /* Bytes from addr up to the end of its page. */
                offset = -(addr | TARGET_PAGE_MASK);

                /* Probe nonfault on subsequent elements. */
                flags = probe_access_flags(env, addr, offset, MMU_DATA_LOAD,
                                           mmu_index, true, &host, 0);

                /*
                 * Stop if invalid (unmapped) or mmio (transaction may fail).
                 * Do not stop if watchpoint, as the spec says that
                 * first-fault should continue to access the same
                 * elements regardless of any watchpoint.
                 */
                if (flags & ~TLB_WATCHPOINT) {
                    vl = i;
                    goto ProbeSuccess;
                }
                /* The rest of the element fits in the page just probed. */
                if (remain <= offset) {
                    break;
                }
                remain -= offset;
                addr = adjust_addr(env, addr + offset);
            }
        }
    }
ProbeSuccess:
    /* load bytes from guest memory */
    if (vl != 0) {
        /* Trim vl to the elements that were probed successfully. */
        env->vl = vl;
    }

    if (env->vstart < env->vl) {
        if (vm) {
            /* Calculate the page range of first page */
            addr = base + ((env->vstart * nf) << log2_esz);
            page_split = -(addr | TARGET_PAGE_MASK);
            /* Get number of elements */
            elems = page_split / msize;
            if (unlikely(env->vstart + elems >= env->vl)) {
                elems = env->vl - env->vstart;
            }

            /* Load/store elements in the first page */
            if (likely(elems)) {
                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
                                  log2_esz, true, mmu_index, ldst_tlb,
                                  ldst_host, ra);
            }

            /* Load/store elements in the second page */
            if (unlikely(env->vstart < env->vl)) {
                /* Cross page element */
                if (unlikely(page_split % msize)) {
                    for (k = 0; k < nf; k++) {
                        addr = base + ((env->vstart * nf + k) << log2_esz);
                        ldst_tlb(env, adjust_addr(env, addr),
                                 env->vstart + k * max_elems, vd, ra);
                    }
                    env->vstart++;
                }

                addr = base + ((env->vstart * nf) << log2_esz);
                /* Get number of elements of second page */
                elems = env->vl - env->vstart;

                /* Load/store elements in the second page */
                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
                                  log2_esz, true, mmu_index, ldst_tlb,
                                  ldst_host, ra);
            }
        } else {
            /* Masked form: handle every field of every element in turn. */
            for (i = env->vstart; i < env->vl; i++) {
                k = 0;
                while (k < nf) {
                    if (!vext_elem_mask(v0, i)) {
                        /* set masked-off elements to 1s */
                        vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                          (i + k * max_elems + 1) * esz);
                        k++;
                        continue;
                    }
                    addr = base + ((i * nf + k) << log2_esz);
                    ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
                             vd, ra);
                    k++;
                }
            }
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
737 
738 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
739 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
740                   CPURISCVState *env, uint32_t desc)            \
741 {                                                               \
742     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
743               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
744 }
745 
746 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
747 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
748 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
749 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
750 
/*
 * Element-wise binary operation macros.
 *
 * Every argument is parenthesized in the expansion so that an argument
 * containing a lower-precedence operator (e.g. DO_AND(a | b, c)) is grouped
 * as written at the call site.
 *
 * DO_SWAP ignores N and yields M.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/*
 * Signed min/max.
 * Note: one of the two arguments is evaluated twice, so callers must not
 * pass expressions with side effects.
 */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
760 
761 /*
762  * load and store whole register instructions
763  */
764 static inline QEMU_ALWAYS_INLINE void
765 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
766                 vext_ldst_elem_fn_tlb *ldst_tlb,
767                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
768                 uintptr_t ra, bool is_load)
769 {
770     target_ulong page_split, elems, addr;
771     uint32_t nf = vext_nf(desc);
772     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
773     uint32_t max_elems = vlenb >> log2_esz;
774     uint32_t evl = nf * max_elems;
775     uint32_t esz = 1 << log2_esz;
776     int mmu_index = riscv_env_mmu_index(env, false);
777 
778     /* Calculate the page range of first page */
779     addr = base + (env->vstart << log2_esz);
780     page_split = -(addr | TARGET_PAGE_MASK);
781     /* Get number of elements */
782     elems = page_split / esz;
783     if (unlikely(env->vstart + elems >= evl)) {
784         elems = evl - env->vstart;
785     }
786 
787     /* Load/store elements in the first page */
788     if (likely(elems)) {
789         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
790                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
791     }
792 
793     /* Load/store elements in the second page */
794     if (unlikely(env->vstart < evl)) {
795         /* Cross page element */
796         if (unlikely(page_split % esz)) {
797             addr = base + (env->vstart << log2_esz);
798             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
799             env->vstart++;
800         }
801 
802         addr = base + (env->vstart << log2_esz);
803         /* Get number of elements of second page */
804         elems = evl - env->vstart;
805 
806         /* Load/store elements in the second page */
807         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
808                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
809     }
810 
811     env->vstart = 0;
812 }
813 
814 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
815 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
816                   uint32_t desc)                                    \
817 {                                                                   \
818     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
819                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
820 }
821 
822 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
823 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
824 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
825 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
826 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
827 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
828 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
829 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
830 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
831 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
832 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
833 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
834 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
835 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
836 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
837 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
838 
839 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
840 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
841                   uint32_t desc)                                        \
842 {                                                                       \
843     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
844                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
845 }
846 
847 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
848 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
849 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
850 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
851 
852 /*
853  * Vector Integer Arithmetic Instructions
854  */
855 
/*
 * Operand type tuples, listed in the order (TD, T1, T2, TX1, TX2).
 * NOTE(review): TD looks like the destination element type, T1/T2 the
 * source element types and TX1/TX2 the types the sources are converted to
 * before the operation -- confirm against the OPIVV2/OPIVX2 definitions,
 * which are outside this section.
 *
 * The _B/_H/_W/_D suffix names the narrowest type in the tuple
 * (8/16/32/64 bits).  In the WOP_* tuples TD is twice as wide as T1; in
 * the NOP_* tuples T2 is twice as wide as TD.
 */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
880 
/*
 * Subtraction (N - M) and reverse subtraction (M - N).  Arguments are
 * parenthesized so that compound expressions expand correctly.
 */
#define DO_SUB(N, M) ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
883 
/*
 * vadd_vv / vsub_vv, one variant per element width (b/h/w/d).
 * Each RVVCALL(OPIVV2, ...) line supplies the operand type tuple, three H
 * index macros and the DO_* operator; the GEN_VEXT_VV line of the same
 * name carries the element size in bytes.
 * NOTE(review): RVVCALL, OPIVV2 and GEN_VEXT_VV are defined outside this
 * section -- presumably the first pair emits the per-element routine and
 * GEN_VEXT_VV the helper around it; confirm there.
 */
RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)

GEN_VEXT_VV(vadd_vv_b, 1)
GEN_VEXT_VV(vadd_vv_h, 2)
GEN_VEXT_VV(vadd_vv_w, 4)
GEN_VEXT_VV(vadd_vv_d, 8)
GEN_VEXT_VV(vsub_vv_b, 1)
GEN_VEXT_VV(vsub_vv_h, 2)
GEN_VEXT_VV(vsub_vv_w, 4)
GEN_VEXT_VV(vsub_vv_d, 8)
901 
902 
/*
 * vadd_vx / vsub_vx / vrsub_vx, one variant per element width (b/h/w/d).
 * The OPIVX2 lines take two H index macros instead of three; vrsub uses
 * DO_RSUB, i.e. the operands of the subtraction are swapped.
 * NOTE(review): OPIVX2 and GEN_VEXT_VX are defined outside this section;
 * the second argument of GEN_VEXT_VX matches the element size in bytes.
 */
RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)

GEN_VEXT_VX(vadd_vx_b, 1)
GEN_VEXT_VX(vadd_vx_h, 2)
GEN_VEXT_VX(vadd_vx_w, 4)
GEN_VEXT_VX(vadd_vx_d, 8)
GEN_VEXT_VX(vsub_vx_b, 1)
GEN_VEXT_VX(vsub_vx_h, 2)
GEN_VEXT_VX(vsub_vx_w, 4)
GEN_VEXT_VX(vsub_vx_d, 8)
GEN_VEXT_VX(vrsub_vx_b, 1)
GEN_VEXT_VX(vrsub_vx_h, 2)
GEN_VEXT_VX(vrsub_vx_w, 4)
GEN_VEXT_VX(vrsub_vx_d, 8)
928 
929 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
930 {
931     intptr_t oprsz = simd_oprsz(desc);
932     intptr_t i;
933 
934     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
935         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
936     }
937 }
938 
939 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
940 {
941     intptr_t oprsz = simd_oprsz(desc);
942     intptr_t i;
943 
944     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
945         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
946     }
947 }
948 
949 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
950 {
951     intptr_t oprsz = simd_oprsz(desc);
952     intptr_t i;
953 
954     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
955         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
956     }
957 }
958 
959 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
960 {
961     intptr_t oprsz = simd_oprsz(desc);
962     intptr_t i;
963 
964     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
965         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
966     }
967 }
968 
/*
 * Vector Widening Integer Add/Subtract
 *
 * Type tuples in the order (TD, T1, T2, TX1, TX2).  In WOP_UUU_* and
 * WOP_SSS_* both T1 and T2 are half the width of TD; in WOP_WUUU_* and
 * WOP_WSSS_* only T1 is narrow while T2 already has the width of TD.
 * The WOP_SSS_* lines repeat, token for token, the definitions given
 * earlier in this file.
 */
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
981 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
982 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
983 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
984 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
985 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
986 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
987 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
988 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
989 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
990 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
991 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
992 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
993 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
994 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
995 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
996 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
997 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
998 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
999 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1000 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1001 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1002 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1003 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1004 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1005 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1006 GEN_VEXT_VV(vwaddu_vv_b, 2)
1007 GEN_VEXT_VV(vwaddu_vv_h, 4)
1008 GEN_VEXT_VV(vwaddu_vv_w, 8)
1009 GEN_VEXT_VV(vwsubu_vv_b, 2)
1010 GEN_VEXT_VV(vwsubu_vv_h, 4)
1011 GEN_VEXT_VV(vwsubu_vv_w, 8)
1012 GEN_VEXT_VV(vwadd_vv_b, 2)
1013 GEN_VEXT_VV(vwadd_vv_h, 4)
1014 GEN_VEXT_VV(vwadd_vv_w, 8)
1015 GEN_VEXT_VV(vwsub_vv_b, 2)
1016 GEN_VEXT_VV(vwsub_vv_h, 4)
1017 GEN_VEXT_VV(vwsub_vv_w, 8)
1018 GEN_VEXT_VV(vwaddu_wv_b, 2)
1019 GEN_VEXT_VV(vwaddu_wv_h, 4)
1020 GEN_VEXT_VV(vwaddu_wv_w, 8)
1021 GEN_VEXT_VV(vwsubu_wv_b, 2)
1022 GEN_VEXT_VV(vwsubu_wv_h, 4)
1023 GEN_VEXT_VV(vwsubu_wv_w, 8)
1024 GEN_VEXT_VV(vwadd_wv_b, 2)
1025 GEN_VEXT_VV(vwadd_wv_h, 4)
1026 GEN_VEXT_VV(vwadd_wv_w, 8)
1027 GEN_VEXT_VV(vwsub_wv_b, 2)
1028 GEN_VEXT_VV(vwsub_wv_h, 4)
1029 GEN_VEXT_VV(vwsub_wv_w, 8)
1030 
1031 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1032 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1033 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1034 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1035 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1036 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1037 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1038 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1039 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1040 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1041 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1042 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1043 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1044 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1045 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1046 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1047 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1048 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1049 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1050 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1051 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1052 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1053 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1054 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1055 GEN_VEXT_VX(vwaddu_vx_b, 2)
1056 GEN_VEXT_VX(vwaddu_vx_h, 4)
1057 GEN_VEXT_VX(vwaddu_vx_w, 8)
1058 GEN_VEXT_VX(vwsubu_vx_b, 2)
1059 GEN_VEXT_VX(vwsubu_vx_h, 4)
1060 GEN_VEXT_VX(vwsubu_vx_w, 8)
1061 GEN_VEXT_VX(vwadd_vx_b, 2)
1062 GEN_VEXT_VX(vwadd_vx_h, 4)
1063 GEN_VEXT_VX(vwadd_vx_w, 8)
1064 GEN_VEXT_VX(vwsub_vx_b, 2)
1065 GEN_VEXT_VX(vwsub_vx_h, 4)
1066 GEN_VEXT_VX(vwsub_vx_w, 8)
1067 GEN_VEXT_VX(vwaddu_wx_b, 2)
1068 GEN_VEXT_VX(vwaddu_wx_h, 4)
1069 GEN_VEXT_VX(vwaddu_wx_w, 8)
1070 GEN_VEXT_VX(vwsubu_wx_b, 2)
1071 GEN_VEXT_VX(vwsubu_wx_h, 4)
1072 GEN_VEXT_VX(vwsubu_wx_w, 8)
1073 GEN_VEXT_VX(vwadd_wx_b, 2)
1074 GEN_VEXT_VX(vwadd_wx_h, 4)
1075 GEN_VEXT_VX(vwadd_wx_w, 8)
1076 GEN_VEXT_VX(vwsub_wx_b, 2)
1077 GEN_VEXT_VX(vwsub_wx_h, 4)
1078 GEN_VEXT_VX(vwsub_wx_w, 8)
1079 
/*
 * Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
 *
 * DO_VADC yields N + M + C and DO_VSBC yields N - M - C, where C is the
 * carry/borrow input.  Arguments are parenthesized so that compound
 * expressions expand with the intended precedence.
 */
#define DO_VADC(N, M, C) ((N) + (M) + (C))
#define DO_VSBC(N, M, C) ((N) - (M) - (C))
1083 
1084 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1085 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1086                   CPURISCVState *env, uint32_t desc)          \
1087 {                                                             \
1088     uint32_t vl = env->vl;                                    \
1089     uint32_t esz = sizeof(ETYPE);                             \
1090     uint32_t total_elems =                                    \
1091         vext_get_total_elems(env, desc, esz);                 \
1092     uint32_t vta = vext_vta(desc);                            \
1093     uint32_t i;                                               \
1094                                                               \
1095     VSTART_CHECK_EARLY_EXIT(env);                             \
1096                                                               \
1097     for (i = env->vstart; i < vl; i++) {                      \
1098         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1099         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1100         ETYPE carry = vext_elem_mask(v0, i);                  \
1101                                                               \
1102         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1103     }                                                         \
1104     env->vstart = 0;                                          \
1105     /* set tail elements to 1s */                             \
1106     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1107 }
1108 
1109 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1110 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1111 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1112 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1113 
1114 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1115 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1116 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1117 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1118 
1119 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1120 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1121                   CPURISCVState *env, uint32_t desc)                     \
1122 {                                                                        \
1123     uint32_t vl = env->vl;                                               \
1124     uint32_t esz = sizeof(ETYPE);                                        \
1125     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1126     uint32_t vta = vext_vta(desc);                                       \
1127     uint32_t i;                                                          \
1128                                                                          \
1129     VSTART_CHECK_EARLY_EXIT(env);                                        \
1130                                                                          \
1131     for (i = env->vstart; i < vl; i++) {                                 \
1132         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1133         ETYPE carry = vext_elem_mask(v0, i);                             \
1134                                                                          \
1135         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1136     }                                                                    \
1137     env->vstart = 0;                                                     \
1138     /* set tail elements to 1s */                                        \
1139     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1140 }
1141 
1142 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1143 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1144 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1145 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1146 
1147 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1148 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1149 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1150 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1151 
/*
 * Carry-out / borrow-out predicates.
 *
 * DO_MADC: true when N + M (+ 1 if C) wraps around in the type of N.
 * DO_MSBC: true when N - M (- 1 if C) borrows, i.e. N < M, or N <= M with
 *          a borrow-in.
 * Arguments are parenthesized so that compound expressions expand with the
 * intended precedence; N is expanded more than once.
 */
#define DO_MADC(N, M, C) ((C) ? (__typeof(N))((N) + (M) + 1) <= (N) :   \
                                (__typeof(N))((N) + (M)) < (N))
#define DO_MSBC(N, M, C) ((C) ? (N) <= (M) : (N) < (M))
1155 
1156 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1157 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1158                   CPURISCVState *env, uint32_t desc)          \
1159 {                                                             \
1160     uint32_t vl = env->vl;                                    \
1161     uint32_t vm = vext_vm(desc);                              \
1162     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1163     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1164     uint32_t i;                                               \
1165                                                               \
1166     VSTART_CHECK_EARLY_EXIT(env);                             \
1167                                                               \
1168     for (i = env->vstart; i < vl; i++) {                      \
1169         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1170         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1171         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1172         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1173     }                                                         \
1174     env->vstart = 0;                                          \
1175     /*
1176      * mask destination register are always tail-agnostic
1177      * set tail elements to 1s
1178      */                                                       \
1179     if (vta_all_1s) {                                         \
1180         for (; i < total_elems; i++) {                        \
1181             vext_set_elem_mask(vd, i, 1);                     \
1182         }                                                     \
1183     }                                                         \
1184 }
1185 
1186 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1187 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1188 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1189 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1190 
1191 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1192 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1193 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1194 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1195 
1196 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1197 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1198                   void *vs2, CPURISCVState *env, uint32_t desc) \
1199 {                                                               \
1200     uint32_t vl = env->vl;                                      \
1201     uint32_t vm = vext_vm(desc);                                \
1202     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1203     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1204     uint32_t i;                                                 \
1205                                                                 \
1206     VSTART_CHECK_EARLY_EXIT(env);                               \
1207                                                                 \
1208     for (i = env->vstart; i < vl; i++) {                        \
1209         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1210         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1211         vext_set_elem_mask(vd, i,                               \
1212                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1213     }                                                           \
1214     env->vstart = 0;                                            \
1215     /*
1216      * mask destination register are always tail-agnostic
1217      * set tail elements to 1s
1218      */                                                         \
1219     if (vta_all_1s) {                                           \
1220         for (; i < total_elems; i++) {                          \
1221             vext_set_elem_mask(vd, i, 1);                       \
1222         }                                                       \
1223     }                                                           \
1224 }
1225 
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1227 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1228 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1229 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1230 
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1232 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1233 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1234 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1235 
/*
 * Vector Bitwise Logical Instructions
 *
 * Vector-vector forms of and/or/xor, one variant per element width
 * (b/h/w/d).  All use the signed same-width type tuples OP_SSS_* and the
 * DO_AND/DO_OR/DO_XOR operators; the second argument of GEN_VEXT_VV
 * matches the element size in bytes.
 */
RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
GEN_VEXT_VV(vand_vv_b, 1)
GEN_VEXT_VV(vand_vv_h, 2)
GEN_VEXT_VV(vand_vv_w, 4)
GEN_VEXT_VV(vand_vv_d, 8)
GEN_VEXT_VV(vor_vv_b, 1)
GEN_VEXT_VV(vor_vv_h, 2)
GEN_VEXT_VV(vor_vv_w, 4)
GEN_VEXT_VV(vor_vv_d, 8)
GEN_VEXT_VV(vxor_vv_b, 1)
GEN_VEXT_VV(vxor_vv_h, 2)
GEN_VEXT_VV(vxor_vv_w, 4)
GEN_VEXT_VV(vxor_vv_d, 8)
1261 
/*
 * Vector-scalar forms of and/or/xor, one variant per element width
 * (b/h/w/d), built from the same OP_SSS_* tuples and DO_* operators as
 * the vector-vector forms; the second argument of GEN_VEXT_VX matches the
 * element size in bytes.
 */
RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
GEN_VEXT_VX(vand_vx_b, 1)
GEN_VEXT_VX(vand_vx_h, 2)
GEN_VEXT_VX(vand_vx_w, 4)
GEN_VEXT_VX(vand_vx_d, 8)
GEN_VEXT_VX(vor_vx_b, 1)
GEN_VEXT_VX(vor_vx_h, 2)
GEN_VEXT_VX(vor_vx_w, 4)
GEN_VEXT_VX(vor_vx_d, 8)
GEN_VEXT_VX(vxor_vx_b, 1)
GEN_VEXT_VX(vxor_vx_h, 2)
GEN_VEXT_VX(vxor_vx_w, 4)
GEN_VEXT_VX(vxor_vx_d, 8)
1286 
/*
 * Vector Single-Width Bit Shift Instructions
 *
 * DO_SLL shifts N left and DO_SRL shifts N right by M bits.  Whether the
 * right shift is logical or arithmetic follows from the C type of N.
 * Both arguments are parenthesized so that compound expressions expand
 * with the intended precedence.
 */
#define DO_SLL(N, M)  ((N) << (M))
#define DO_SRL(N, M)  ((N) >> (M))
1290 
/*
 * Template for shift helpers whose shift amount comes from a vector.
 *
 * TS1/HS1 describe both the shift-amount elements (vs1) and the
 * destination elements (vd); TS2/HS2 describe the value being shifted
 * (vs2).  For every active element i in [vstart, vl) the result is
 * OP(vs2[i], vs1[i] & MASK).  Inactive elements and the tail are handed to
 * vext_set_elems_1s() with the vma and vta settings respectively.
 */
#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS1);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t i;                                                           \
                                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                                         \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (vm || vext_elem_mask(v0, i)) {                                \
            /* active element */                                          \
            TS1 shamt = *((TS1 *)vs1 + HS1(i));                           \
            TS2 src = *((TS2 *)vs2 + HS2(i));                             \
                                                                          \
            *((TS1 *)vd + HS1(i)) = OP(src, shamt & MASK);                \
        } else {                                                          \
            /* masked-off element */                                      \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* tail elements: bytes [vl * esz, total_elems * esz) */              \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
1320 
/*
 * Single-width shifts with a vector shift amount.  MASK limits the shift
 * amount to the element width minus one (0x7/0xf/0x1f/0x3f).
 */
/* logical left shift */
GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)

/* logical right shift: the shifted value is read as an unsigned type */
GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

/*
 * vsra: same DO_SRL operator, but the shifted value is read as a signed
 * type, so the ">>" acts on a signed operand (the result for negative
 * values is implementation-defined in C).
 */
GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1335 
/*
 * Template for shift helpers whose shift amount is the scalar s1.
 *
 * TD/HD describe the destination elements (vd) and TS2/HS2 the value
 * being shifted (vs2).  For every active element i in [vstart, vl) the
 * result is OP(vs2[i], s1 & MASK).  Inactive elements and the tail are
 * handed to vext_set_elems_1s() with the vma and vta settings
 * respectively.
 */
#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
                  void *vs2, CPURISCVState *env,            \
                  uint32_t desc)                            \
{                                                           \
    uint32_t vm = vext_vm(desc);                            \
    uint32_t vl = env->vl;                                  \
    uint32_t esz = sizeof(TD);                              \
    uint32_t total_elems =                                  \
        vext_get_total_elems(env, desc, esz);               \
    uint32_t vta = vext_vta(desc);                          \
    uint32_t vma = vext_vma(desc);                          \
    target_ulong shamt = s1 & MASK;                         \
    uint32_t i;                                             \
                                                            \
    VSTART_CHECK_EARLY_EXIT(env);                           \
                                                            \
    for (i = env->vstart; i < vl; i++) {                    \
        if (vm || vext_elem_mask(v0, i)) {                  \
            /* active element */                            \
            TS2 src = *((TS2 *)vs2 + HS2(i));               \
                                                            \
            *((TD *)vd + HD(i)) = OP(src, shamt);           \
        } else {                                            \
            /* masked-off element */                        \
            vext_set_elems_1s(vd, vma, i * esz,             \
                              (i + 1) * esz);               \
        }                                                   \
    }                                                       \
    env->vstart = 0;                                        \
    /* tail elements */                                     \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
}
1369 
1370 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1371 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1372 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1373 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1374 
1375 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1376 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1377 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1378 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1379 
1380 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1381 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1382 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1383 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1384 
1385 /* Vector Narrowing Integer Right Shift Instructions */
1386 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1387 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1388 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1389 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1390 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1391 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1392 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1393 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1394 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1395 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1396 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1397 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1398 
/*
 * Vector Integer Comparison Instructions
 *
 * Element-wise predicates; N is the vs2 element and M the vs1/scalar
 * operand.  Arguments are parenthesized so that compound expressions
 * (e.g. "a & b") compare as a whole instead of being split by operator
 * precedence.
 */
#define DO_MSEQ(N, M) ((N) == (M))
#define DO_MSNE(N, M) ((N) != (M))
#define DO_MSLT(N, M) ((N) < (M))
#define DO_MSLE(N, M) ((N) <= (M))
#define DO_MSGT(N, M) ((N) > (M))
1405 
/*
 * Generate a vector-vector integer compare helper.
 *
 * For each element i in [vstart, vl), mask bit i of vd receives
 * DO_OP(vs2[i], vs1[i]).  Masked-off elements are set to 1 only when vma
 * is set; tail bits up to vlenb * 8 are set to 1 when vta_all_1s is set.
 */
#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    VSTART_CHECK_EARLY_EXIT(env);                             \
                                                              \
    i = env->vstart;                                          \
    while (i < vl) {                                          \
        if (vm || vext_elem_mask(v0, i)) {                    \
            /* active element: store the comparison result */ \
            ETYPE s1 = *((ETYPE *)vs1 + H(i));                \
            ETYPE s2 = *((ETYPE *)vs2 + H(i));                \
            vext_set_elem_mask(vd, i, DO_OP(s2, s1));         \
        } else if (vma) {                                     \
            /* masked-off element: fill with 1 */             \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
        i++;                                                  \
    }                                                         \
    env->vstart = 0;                                          \
    /* a mask destination is always tail-agnostic: */         \
    /* set the tail elements to 1s when requested     */      \
    if (vta_all_1s) {                                         \
        while (i < total_elems) {                             \
            vext_set_elem_mask(vd, i, 1);                     \
            i++;                                              \
        }                                                     \
    }                                                         \
}
1442 
1443 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1444 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1445 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1446 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1447 
1448 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1449 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1450 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1451 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1452 
1453 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1454 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1455 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1456 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1457 
1458 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1459 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1460 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1461 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1462 
1463 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1464 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1465 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1466 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1467 
1468 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1469 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1470 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1471 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1472 
/*
 * Generate a vector-scalar integer compare helper.
 *
 * For each element i in [vstart, vl), mask bit i of vd receives
 * DO_OP(vs2[i], (ETYPE)(target_long)s1).  Masked-off elements are set to
 * 1 only when vma is set; tail bits up to vlenb * 8 are set to 1 when
 * vta_all_1s is set.
 */
#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    /* scalar operand converted to the element type once */         \
    const ETYPE rhs = (ETYPE)(target_long)s1;                       \
    uint32_t i;                                                     \
                                                                    \
    VSTART_CHECK_EARLY_EXIT(env);                                   \
                                                                    \
    i = env->vstart;                                                \
    while (i < vl) {                                                \
        if (vm || vext_elem_mask(v0, i)) {                          \
            /* active element: store the comparison result */       \
            ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
            vext_set_elem_mask(vd, i, DO_OP(s2, rhs));              \
        } else if (vma) {                                           \
            /* masked-off element: fill with 1 */                   \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
        i++;                                                        \
    }                                                               \
    env->vstart = 0;                                                \
    /* a mask destination is always tail-agnostic: */               \
    /* set the tail elements to 1s when requested     */            \
    if (vta_all_1s) {                                               \
        while (i < total_elems) {                                   \
            vext_set_elem_mask(vd, i, 1);                           \
            i++;                                                    \
        }                                                           \
    }                                                               \
}
1509 
1510 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1511 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1512 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1513 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1514 
1515 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1516 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1517 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1518 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1519 
1520 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1521 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1522 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1523 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1524 
1525 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1526 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1527 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1528 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1529 
1530 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1531 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1532 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1533 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1534 
1535 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1536 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1537 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1538 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1539 
1540 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1541 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1542 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1543 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1544 
1545 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1546 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1547 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1548 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1549 
1550 /* Vector Integer Min/Max Instructions */
1551 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1552 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1553 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1554 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1555 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1556 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1557 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1558 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1559 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1560 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1561 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1562 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1563 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1564 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1565 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1566 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1567 GEN_VEXT_VV(vminu_vv_b, 1)
1568 GEN_VEXT_VV(vminu_vv_h, 2)
1569 GEN_VEXT_VV(vminu_vv_w, 4)
1570 GEN_VEXT_VV(vminu_vv_d, 8)
1571 GEN_VEXT_VV(vmin_vv_b, 1)
1572 GEN_VEXT_VV(vmin_vv_h, 2)
1573 GEN_VEXT_VV(vmin_vv_w, 4)
1574 GEN_VEXT_VV(vmin_vv_d, 8)
1575 GEN_VEXT_VV(vmaxu_vv_b, 1)
1576 GEN_VEXT_VV(vmaxu_vv_h, 2)
1577 GEN_VEXT_VV(vmaxu_vv_w, 4)
1578 GEN_VEXT_VV(vmaxu_vv_d, 8)
1579 GEN_VEXT_VV(vmax_vv_b, 1)
1580 GEN_VEXT_VV(vmax_vv_h, 2)
1581 GEN_VEXT_VV(vmax_vv_w, 4)
1582 GEN_VEXT_VV(vmax_vv_d, 8)
1583 
1584 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1585 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1586 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1587 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1588 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1589 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1590 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1591 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1592 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1593 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1594 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1595 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1596 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1597 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1598 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1599 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1600 GEN_VEXT_VX(vminu_vx_b, 1)
1601 GEN_VEXT_VX(vminu_vx_h, 2)
1602 GEN_VEXT_VX(vminu_vx_w, 4)
1603 GEN_VEXT_VX(vminu_vx_d, 8)
1604 GEN_VEXT_VX(vmin_vx_b, 1)
1605 GEN_VEXT_VX(vmin_vx_h, 2)
1606 GEN_VEXT_VX(vmin_vx_w, 4)
1607 GEN_VEXT_VX(vmin_vx_d, 8)
1608 GEN_VEXT_VX(vmaxu_vx_b, 1)
1609 GEN_VEXT_VX(vmaxu_vx_h, 2)
1610 GEN_VEXT_VX(vmaxu_vx_w, 4)
1611 GEN_VEXT_VX(vmaxu_vx_d, 8)
1612 GEN_VEXT_VX(vmax_vx_b, 1)
1613 GEN_VEXT_VX(vmax_vx_h, 2)
1614 GEN_VEXT_VX(vmax_vx_w, 4)
1615 GEN_VEXT_VX(vmax_vx_d, 8)
1616 
/* Vector Single-Width Integer Multiply Instructions */
/*
 * Product of the two operands.  Arguments are parenthesized so that
 * compound expressions are multiplied as a whole.
 */
#define DO_MUL(N, M) ((N) * (M))
1619 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1620 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1621 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1622 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1623 GEN_VEXT_VV(vmul_vv_b, 1)
1624 GEN_VEXT_VV(vmul_vv_h, 2)
1625 GEN_VEXT_VV(vmul_vv_w, 4)
1626 GEN_VEXT_VV(vmul_vv_d, 8)
1627 
/* Upper 8 bits of the 16-bit signed product s2 * s1. */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    int16_t product = (int16_t)s2 * (int16_t)s1;

    return product >> 8;
}
1632 
/* Upper 16 bits of the 32-bit signed product s2 * s1. */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t product = (int32_t)s2 * (int32_t)s1;

    return product >> 16;
}
1637 
/* Upper 32 bits of the 64-bit signed product s2 * s1. */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t product = (int64_t)s2 * (int64_t)s1;

    return product >> 32;
}
1642 
/* Upper 64 bits of the signed product, as produced by muls64(). */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t low, high;

    muls64(&low, &high, s1, s2);
    return high;
}
1650 
/* Upper 8 bits of the 16-bit unsigned product s2 * s1. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    uint16_t product = (uint16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1655 
/* Upper 16 bits of the 32-bit unsigned product s2 * s1. */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1660 
/* Upper 32 bits of the 64-bit unsigned product s2 * s1. */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1665 
/* Upper 64 bits of the unsigned product, as produced by mulu64(). */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);
    return high;
}
1673 
/* Upper 8 bits of the 16-bit product of signed s2 and unsigned s1. */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* both operands promote to int; the product always fits in 16 bits */
    int16_t product = (int16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1678 
/* Upper 16 bits of the 32-bit product of signed s2 and unsigned s1. */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    /* the multiplication is carried out modulo 2^32 in unsigned arithmetic */
    uint32_t product = (int32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1683 
/* Upper 32 bits of the 64-bit product of signed s2 and unsigned s1. */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    /* the multiplication is carried out modulo 2^64 in unsigned arithmetic */
    uint64_t product = (int64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1688 
/*
 * Upper 64 bits of the product of a signed (s2) and an unsigned (s1)
 * 64-bit operand.
 *
 * Let  A = signed operand,
 *      B = unsigned operand,
 *      P = mulu64(A, B), the product with A's bit pattern read as unsigned.
 *
 * IF A < 0, the bit pattern of A read as unsigned is A + 2 ** 64, so
 *      P  = (A + 2 ** 64) * B
 *      SP = A * B              (the signed product we want)
 *         = P - 2 ** 64 * B
 * ELSE
 *      SP = P
 *
 * Subtracting 2 ** 64 * B only changes the upper half, hence
 *      HI_P -= (A < 0 ? B : 0)
 */

static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);

    if (s2 < 0) {
        high -= s1;
    }
    return high;
}
1717 
1718 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1719 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1720 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1721 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1722 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1723 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1724 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1725 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1726 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1727 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1728 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1729 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1730 GEN_VEXT_VV(vmulh_vv_b, 1)
1731 GEN_VEXT_VV(vmulh_vv_h, 2)
1732 GEN_VEXT_VV(vmulh_vv_w, 4)
1733 GEN_VEXT_VV(vmulh_vv_d, 8)
1734 GEN_VEXT_VV(vmulhu_vv_b, 1)
1735 GEN_VEXT_VV(vmulhu_vv_h, 2)
1736 GEN_VEXT_VV(vmulhu_vv_w, 4)
1737 GEN_VEXT_VV(vmulhu_vv_d, 8)
1738 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1739 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1740 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1741 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1742 
1743 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1744 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1745 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1746 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1747 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1748 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1749 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1750 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1751 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1752 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1753 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1754 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1755 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1756 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1757 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1758 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1759 GEN_VEXT_VX(vmul_vx_b, 1)
1760 GEN_VEXT_VX(vmul_vx_h, 2)
1761 GEN_VEXT_VX(vmul_vx_w, 4)
1762 GEN_VEXT_VX(vmul_vx_d, 8)
1763 GEN_VEXT_VX(vmulh_vx_b, 1)
1764 GEN_VEXT_VX(vmulh_vx_h, 2)
1765 GEN_VEXT_VX(vmulh_vx_w, 4)
1766 GEN_VEXT_VX(vmulh_vx_d, 8)
1767 GEN_VEXT_VX(vmulhu_vx_b, 1)
1768 GEN_VEXT_VX(vmulhu_vx_h, 2)
1769 GEN_VEXT_VX(vmulhu_vx_w, 4)
1770 GEN_VEXT_VX(vmulhu_vx_d, 8)
1771 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1772 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1773 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1774 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1775 
/*
 * Vector Integer Divide Instructions
 *
 * N is the dividend and M the divisor.
 *  - Division by zero yields all ones (DO_DIVU/DO_DIV) or the dividend
 *    (DO_REMU/DO_REM).
 *  - For the signed forms, a dividend equal to its own negation divided
 *    by -1 yields the dividend (DO_DIV) or 0 (DO_REM) instead of
 *    performing the division.
 *
 * Arguments are parenthesized so compound expressions are evaluated as a
 * whole.  Note that N and M are still evaluated more than once.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? (__typeof(N))(-1) : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? (N) : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? (__typeof(N))(-1) : \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? \
        (N) : (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? (N) : \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? \
        0 : (N) % (M))
1783 
1784 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1785 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1786 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1787 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1788 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1789 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1790 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1791 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1792 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1793 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1794 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1795 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1796 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1797 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1798 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1799 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1800 GEN_VEXT_VV(vdivu_vv_b, 1)
1801 GEN_VEXT_VV(vdivu_vv_h, 2)
1802 GEN_VEXT_VV(vdivu_vv_w, 4)
1803 GEN_VEXT_VV(vdivu_vv_d, 8)
1804 GEN_VEXT_VV(vdiv_vv_b, 1)
1805 GEN_VEXT_VV(vdiv_vv_h, 2)
1806 GEN_VEXT_VV(vdiv_vv_w, 4)
1807 GEN_VEXT_VV(vdiv_vv_d, 8)
1808 GEN_VEXT_VV(vremu_vv_b, 1)
1809 GEN_VEXT_VV(vremu_vv_h, 2)
1810 GEN_VEXT_VV(vremu_vv_w, 4)
1811 GEN_VEXT_VV(vremu_vv_d, 8)
1812 GEN_VEXT_VV(vrem_vv_b, 1)
1813 GEN_VEXT_VV(vrem_vv_h, 2)
1814 GEN_VEXT_VV(vrem_vv_w, 4)
1815 GEN_VEXT_VV(vrem_vv_d, 8)
1816 
1817 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1818 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1819 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1820 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1821 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1822 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1823 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1824 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1825 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1826 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1827 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1828 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1829 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1830 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1831 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1832 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1833 GEN_VEXT_VX(vdivu_vx_b, 1)
1834 GEN_VEXT_VX(vdivu_vx_h, 2)
1835 GEN_VEXT_VX(vdivu_vx_w, 4)
1836 GEN_VEXT_VX(vdivu_vx_d, 8)
1837 GEN_VEXT_VX(vdiv_vx_b, 1)
1838 GEN_VEXT_VX(vdiv_vx_h, 2)
1839 GEN_VEXT_VX(vdiv_vx_w, 4)
1840 GEN_VEXT_VX(vdiv_vx_d, 8)
1841 GEN_VEXT_VX(vremu_vx_b, 1)
1842 GEN_VEXT_VX(vremu_vx_h, 2)
1843 GEN_VEXT_VX(vremu_vx_w, 4)
1844 GEN_VEXT_VX(vremu_vx_d, 8)
1845 GEN_VEXT_VX(vrem_vx_b, 1)
1846 GEN_VEXT_VX(vrem_vx_h, 2)
1847 GEN_VEXT_VX(vrem_vx_w, 4)
1848 GEN_VEXT_VX(vrem_vx_d, 8)
1849 
1850 /* Vector Widening Integer Multiply Instructions */
1851 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1852 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1853 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1854 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1855 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1856 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1857 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1858 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1859 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1860 GEN_VEXT_VV(vwmul_vv_b, 2)
1861 GEN_VEXT_VV(vwmul_vv_h, 4)
1862 GEN_VEXT_VV(vwmul_vv_w, 8)
1863 GEN_VEXT_VV(vwmulu_vv_b, 2)
1864 GEN_VEXT_VV(vwmulu_vv_h, 4)
1865 GEN_VEXT_VV(vwmulu_vv_w, 8)
1866 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1867 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1868 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1869 
1870 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1871 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1872 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1873 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1874 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1875 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1876 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1877 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1878 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1879 GEN_VEXT_VX(vwmul_vx_b, 2)
1880 GEN_VEXT_VX(vwmul_vx_h, 4)
1881 GEN_VEXT_VX(vwmul_vx_w, 8)
1882 GEN_VEXT_VX(vwmulu_vx_b, 2)
1883 GEN_VEXT_VX(vwmulu_vx_h, 4)
1884 GEN_VEXT_VX(vwmulu_vx_w, 8)
1885 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1886 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1887 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1888 
/* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * Define do_<NAME>(), the per-element kernel of a three-operand
 * vector-vector operation: element i of vd is replaced by
 * OP(vs2[i], vs1[i], vd[i]).  The vs1/vs2 elements are loaded as T1/T2
 * and converted to TX1/TX2 before OP is applied.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TD *dest = (TD *)vd + HD(i);                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD acc = *dest;                                                \
                                                                   \
    *dest = OP(s2, s1, acc);                                       \
}
1898 
/*
 * Multiply-add element operations.  N is the vs2 element, M the vs1 or
 * scalar operand and D the current destination element.  Arguments are
 * parenthesized so compound expressions are evaluated as a whole.
 */
#define DO_MACC(N, M, D) ((M) * (N) + (D))
#define DO_NMSAC(N, M, D) (-((M) * (N)) + (D))
#define DO_MADD(N, M, D) ((M) * (D) + (N))
#define DO_NMSUB(N, M, D) (-((M) * (D)) + (N))
1903 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1904 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1905 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1906 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1907 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1908 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1909 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1910 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1911 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1912 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1913 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1914 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1915 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1916 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1917 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1918 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1919 GEN_VEXT_VV(vmacc_vv_b, 1)
1920 GEN_VEXT_VV(vmacc_vv_h, 2)
1921 GEN_VEXT_VV(vmacc_vv_w, 4)
1922 GEN_VEXT_VV(vmacc_vv_d, 8)
1923 GEN_VEXT_VV(vnmsac_vv_b, 1)
1924 GEN_VEXT_VV(vnmsac_vv_h, 2)
1925 GEN_VEXT_VV(vnmsac_vv_w, 4)
1926 GEN_VEXT_VV(vnmsac_vv_d, 8)
1927 GEN_VEXT_VV(vmadd_vv_b, 1)
1928 GEN_VEXT_VV(vmadd_vv_h, 2)
1929 GEN_VEXT_VV(vmadd_vv_w, 4)
1930 GEN_VEXT_VV(vmadd_vv_d, 8)
1931 GEN_VEXT_VV(vnmsub_vv_b, 1)
1932 GEN_VEXT_VV(vnmsub_vv_h, 2)
1933 GEN_VEXT_VV(vnmsub_vv_w, 4)
1934 GEN_VEXT_VV(vnmsub_vv_d, 8)
1935 
/*
 * Define do_<NAME>(), the per-element kernel of a three-operand
 * vector-scalar operation: element i of vd is replaced by
 * OP(vs2[i], scalar, vd[i]), where the scalar s1 is first truncated to T1
 * and then converted to TX1.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
    TX1 scalar = (TX1)(T1)s1;                                       \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    TD acc = *dest;                                                 \
                                                                    \
    *dest = OP(s2, scalar, acc);                                    \
}
1943 
1944 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1945 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1946 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1947 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1948 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1949 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1950 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1951 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1952 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1953 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1954 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1955 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1956 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1957 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1958 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1959 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1960 GEN_VEXT_VX(vmacc_vx_b, 1)
1961 GEN_VEXT_VX(vmacc_vx_h, 2)
1962 GEN_VEXT_VX(vmacc_vx_w, 4)
1963 GEN_VEXT_VX(vmacc_vx_d, 8)
1964 GEN_VEXT_VX(vnmsac_vx_b, 1)
1965 GEN_VEXT_VX(vnmsac_vx_h, 2)
1966 GEN_VEXT_VX(vnmsac_vx_w, 4)
1967 GEN_VEXT_VX(vnmsac_vx_d, 8)
1968 GEN_VEXT_VX(vmadd_vx_b, 1)
1969 GEN_VEXT_VX(vmadd_vx_h, 2)
1970 GEN_VEXT_VX(vmadd_vx_w, 4)
1971 GEN_VEXT_VX(vmadd_vx_d, 8)
1972 GEN_VEXT_VX(vnmsub_vx_b, 1)
1973 GEN_VEXT_VX(vnmsub_vx_h, 2)
1974 GEN_VEXT_VX(vnmsub_vx_w, 4)
1975 GEN_VEXT_VX(vnmsub_vx_d, 8)
1976 
1977 /* Vector Widening Integer Multiply-Add Instructions */
1978 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1979 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1980 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1981 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1982 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1983 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1984 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1985 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1986 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1987 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1988 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1989 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1990 GEN_VEXT_VV(vwmacc_vv_b, 2)
1991 GEN_VEXT_VV(vwmacc_vv_h, 4)
1992 GEN_VEXT_VV(vwmacc_vv_w, 8)
1993 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1994 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1995 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1996 
1997 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1998 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1999 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2000 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2001 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2002 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2003 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2004 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2005 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2006 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2007 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2008 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2009 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2010 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2011 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2012 GEN_VEXT_VX(vwmacc_vx_b, 2)
2013 GEN_VEXT_VX(vwmacc_vx_h, 4)
2014 GEN_VEXT_VX(vwmacc_vx_w, 8)
2015 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2016 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2017 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2018 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2019 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2020 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2021 
2022 /* Vector Integer Merge and Move Instructions */
2023 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2024 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2025                   uint32_t desc)                                     \
2026 {                                                                    \
2027     uint32_t vl = env->vl;                                           \
2028     uint32_t esz = sizeof(ETYPE);                                    \
2029     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2030     uint32_t vta = vext_vta(desc);                                   \
2031     uint32_t i;                                                      \
2032                                                                      \
2033     VSTART_CHECK_EARLY_EXIT(env);                                    \
2034                                                                      \
2035     for (i = env->vstart; i < vl; i++) {                             \
2036         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2037         *((ETYPE *)vd + H(i)) = s1;                                  \
2038     }                                                                \
2039     env->vstart = 0;                                                 \
2040     /* set tail elements to 1s */                                    \
2041     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2042 }
2043 
2044 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2045 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2046 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2047 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2048 
2049 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2050 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2051                   uint32_t desc)                                     \
2052 {                                                                    \
2053     uint32_t vl = env->vl;                                           \
2054     uint32_t esz = sizeof(ETYPE);                                    \
2055     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2056     uint32_t vta = vext_vta(desc);                                   \
2057     uint32_t i;                                                      \
2058                                                                      \
2059     VSTART_CHECK_EARLY_EXIT(env);                                    \
2060                                                                      \
2061     for (i = env->vstart; i < vl; i++) {                             \
2062         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2063     }                                                                \
2064     env->vstart = 0;                                                 \
2065     /* set tail elements to 1s */                                    \
2066     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2067 }
2068 
2069 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2070 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2071 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2072 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2073 
2074 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2075 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2076                   CPURISCVState *env, uint32_t desc)                 \
2077 {                                                                    \
2078     uint32_t vl = env->vl;                                           \
2079     uint32_t esz = sizeof(ETYPE);                                    \
2080     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2081     uint32_t vta = vext_vta(desc);                                   \
2082     uint32_t i;                                                      \
2083                                                                      \
2084     VSTART_CHECK_EARLY_EXIT(env);                                    \
2085                                                                      \
2086     for (i = env->vstart; i < vl; i++) {                             \
2087         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2088         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2089     }                                                                \
2090     env->vstart = 0;                                                 \
2091     /* set tail elements to 1s */                                    \
2092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2093 }
2094 
2095 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2096 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2097 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2098 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2099 
2100 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2101 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2102                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2103 {                                                                    \
2104     uint32_t vl = env->vl;                                           \
2105     uint32_t esz = sizeof(ETYPE);                                    \
2106     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2107     uint32_t vta = vext_vta(desc);                                   \
2108     uint32_t i;                                                      \
2109                                                                      \
2110     VSTART_CHECK_EARLY_EXIT(env);                                    \
2111                                                                      \
2112     for (i = env->vstart; i < vl; i++) {                             \
2113         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2114         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2115                    (ETYPE)(target_long)s1);                          \
2116         *((ETYPE *)vd + H(i)) = d;                                   \
2117     }                                                                \
2118     env->vstart = 0;                                                 \
2119     /* set tail elements to 1s */                                    \
2120     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2121 }
2122 
2123 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2124 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2125 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2126 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2127 
2128 /*
2129  * Vector Fixed-Point Arithmetic Instructions
2130  */
2131 
2132 /* Vector Single-Width Saturating Add and Subtract */
2133 
/*
 * Fixed-point instructions generally take a rounding mode and may
 * saturate, so the common macros for them are defined here.
 */
2138 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2139                           CPURISCVState *env, int vxrm);
2140 
2141 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2142 static inline void                                                  \
2143 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2144           CPURISCVState *env, int vxrm)                             \
2145 {                                                                   \
2146     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2147     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2148     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2149 }
2150 
2151 static inline void
2152 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2153              CPURISCVState *env,
2154              uint32_t vl, uint32_t vm, int vxrm,
2155              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2156 {
2157     VSTART_CHECK_EARLY_EXIT(env);
2158 
2159     for (uint32_t i = env->vstart; i < vl; i++) {
2160         if (!vm && !vext_elem_mask(v0, i)) {
2161             /* set masked-off elements to 1s */
2162             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2163             continue;
2164         }
2165         fn(vd, vs1, vs2, i, env, vxrm);
2166     }
2167     env->vstart = 0;
2168 }
2169 
2170 static inline void
2171 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2172              CPURISCVState *env,
2173              uint32_t desc,
2174              opivv2_rm_fn *fn, uint32_t esz)
2175 {
2176     uint32_t vm = vext_vm(desc);
2177     uint32_t vl = env->vl;
2178     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2179     uint32_t vta = vext_vta(desc);
2180     uint32_t vma = vext_vma(desc);
2181 
2182     switch (env->vxrm) {
2183     case 0: /* rnu */
2184         vext_vv_rm_1(vd, v0, vs1, vs2,
2185                      env, vl, vm, 0, fn, vma, esz);
2186         break;
2187     case 1: /* rne */
2188         vext_vv_rm_1(vd, v0, vs1, vs2,
2189                      env, vl, vm, 1, fn, vma, esz);
2190         break;
2191     case 2: /* rdn */
2192         vext_vv_rm_1(vd, v0, vs1, vs2,
2193                      env, vl, vm, 2, fn, vma, esz);
2194         break;
2195     default: /* rod */
2196         vext_vv_rm_1(vd, v0, vs1, vs2,
2197                      env, vl, vm, 3, fn, vma, esz);
2198         break;
2199     }
2200     /* set tail elements to 1s */
2201     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2202 }
2203 
/*
 * Emit the HELPER(NAME) entry point of a fixed-point instruction in OPIVV
 * format.  The helper forwards to vext_vv_rm_2() with do_<NAME> as the
 * per-element worker and ESZ as the element size.
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_##NAME, ESZ);  \
}
2212 
2213 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2214                              uint8_t b)
2215 {
2216     uint8_t res = a + b;
2217     if (res < a) {
2218         res = UINT8_MAX;
2219         env->vxsat = 0x1;
2220     }
2221     return res;
2222 }
2223 
2224 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2225                                uint16_t b)
2226 {
2227     uint16_t res = a + b;
2228     if (res < a) {
2229         res = UINT16_MAX;
2230         env->vxsat = 0x1;
2231     }
2232     return res;
2233 }
2234 
2235 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2236                                uint32_t b)
2237 {
2238     uint32_t res = a + b;
2239     if (res < a) {
2240         res = UINT32_MAX;
2241         env->vxsat = 0x1;
2242     }
2243     return res;
2244 }
2245 
2246 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2247                                uint64_t b)
2248 {
2249     uint64_t res = a + b;
2250     if (res < a) {
2251         res = UINT64_MAX;
2252         env->vxsat = 0x1;
2253     }
2254     return res;
2255 }
2256 
2257 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2258 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2259 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2260 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2261 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2262 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2263 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2264 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2265 
2266 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2267                           CPURISCVState *env, int vxrm);
2268 
2269 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2270 static inline void                                                  \
2271 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2272           CPURISCVState *env, int vxrm)                             \
2273 {                                                                   \
2274     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2275     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2276 }
2277 
2278 static inline void
2279 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2280              CPURISCVState *env,
2281              uint32_t vl, uint32_t vm, int vxrm,
2282              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2283 {
2284     VSTART_CHECK_EARLY_EXIT(env);
2285 
2286     for (uint32_t i = env->vstart; i < vl; i++) {
2287         if (!vm && !vext_elem_mask(v0, i)) {
2288             /* set masked-off elements to 1s */
2289             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2290             continue;
2291         }
2292         fn(vd, s1, vs2, i, env, vxrm);
2293     }
2294     env->vstart = 0;
2295 }
2296 
2297 static inline void
2298 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2299              CPURISCVState *env,
2300              uint32_t desc,
2301              opivx2_rm_fn *fn, uint32_t esz)
2302 {
2303     uint32_t vm = vext_vm(desc);
2304     uint32_t vl = env->vl;
2305     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2306     uint32_t vta = vext_vta(desc);
2307     uint32_t vma = vext_vma(desc);
2308 
2309     switch (env->vxrm) {
2310     case 0: /* rnu */
2311         vext_vx_rm_1(vd, v0, s1, vs2,
2312                      env, vl, vm, 0, fn, vma, esz);
2313         break;
2314     case 1: /* rne */
2315         vext_vx_rm_1(vd, v0, s1, vs2,
2316                      env, vl, vm, 1, fn, vma, esz);
2317         break;
2318     case 2: /* rdn */
2319         vext_vx_rm_1(vd, v0, s1, vs2,
2320                      env, vl, vm, 2, fn, vma, esz);
2321         break;
2322     default: /* rod */
2323         vext_vx_rm_1(vd, v0, s1, vs2,
2324                      env, vl, vm, 3, fn, vma, esz);
2325         break;
2326     }
2327     /* set tail elements to 1s */
2328     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2329 }
2330 
/*
 * Emit the HELPER(NAME) entry point of a fixed-point instruction in OPIVX
 * format.  The helper forwards to vext_vx_rm_2() with do_<NAME> as the
 * per-element worker and ESZ as the element size; the target_ulong scalar
 * is converted to the target_long parameter of vext_vx_rm_2() on the call.
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
                  void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                \
{                                                               \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc, do_##NAME, ESZ);   \
}
2340 
2341 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2342 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2343 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2344 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2345 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2346 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2347 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2348 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2349 
2350 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2351 {
2352     int8_t res = a + b;
2353     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2354         res = a > 0 ? INT8_MAX : INT8_MIN;
2355         env->vxsat = 0x1;
2356     }
2357     return res;
2358 }
2359 
2360 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2361                              int16_t b)
2362 {
2363     int16_t res = a + b;
2364     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2365         res = a > 0 ? INT16_MAX : INT16_MIN;
2366         env->vxsat = 0x1;
2367     }
2368     return res;
2369 }
2370 
2371 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2372                              int32_t b)
2373 {
2374     int32_t res = a + b;
2375     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2376         res = a > 0 ? INT32_MAX : INT32_MIN;
2377         env->vxsat = 0x1;
2378     }
2379     return res;
2380 }
2381 
2382 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2383                              int64_t b)
2384 {
2385     int64_t res = a + b;
2386     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2387         res = a > 0 ? INT64_MAX : INT64_MIN;
2388         env->vxsat = 0x1;
2389     }
2390     return res;
2391 }
2392 
2393 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2394 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2395 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2396 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2397 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2398 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2399 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2400 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2401 
2402 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2403 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2404 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2405 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2406 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2407 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2408 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2409 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2410 
2411 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2412                              uint8_t b)
2413 {
2414     uint8_t res = a - b;
2415     if (res > a) {
2416         res = 0;
2417         env->vxsat = 0x1;
2418     }
2419     return res;
2420 }
2421 
2422 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2423                                uint16_t b)
2424 {
2425     uint16_t res = a - b;
2426     if (res > a) {
2427         res = 0;
2428         env->vxsat = 0x1;
2429     }
2430     return res;
2431 }
2432 
2433 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2434                                uint32_t b)
2435 {
2436     uint32_t res = a - b;
2437     if (res > a) {
2438         res = 0;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2445                                uint64_t b)
2446 {
2447     uint64_t res = a - b;
2448     if (res > a) {
2449         res = 0;
2450         env->vxsat = 0x1;
2451     }
2452     return res;
2453 }
2454 
2455 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2456 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2457 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2458 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2459 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2460 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2461 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2462 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2463 
2464 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2465 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2466 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2467 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2468 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2469 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2470 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2471 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2472 
2473 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2474 {
2475     int8_t res = a - b;
2476     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2477         res = a >= 0 ? INT8_MAX : INT8_MIN;
2478         env->vxsat = 0x1;
2479     }
2480     return res;
2481 }
2482 
2483 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2484                              int16_t b)
2485 {
2486     int16_t res = a - b;
2487     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2488         res = a >= 0 ? INT16_MAX : INT16_MIN;
2489         env->vxsat = 0x1;
2490     }
2491     return res;
2492 }
2493 
2494 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2495                              int32_t b)
2496 {
2497     int32_t res = a - b;
2498     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2499         res = a >= 0 ? INT32_MAX : INT32_MIN;
2500         env->vxsat = 0x1;
2501     }
2502     return res;
2503 }
2504 
2505 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2506                              int64_t b)
2507 {
2508     int64_t res = a - b;
2509     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2510         res = a >= 0 ? INT64_MAX : INT64_MIN;
2511         env->vxsat = 0x1;
2512     }
2513     return res;
2514 }
2515 
2516 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2517 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2518 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2519 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2520 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2521 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2522 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2523 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2524 
2525 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2526 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2527 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2528 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2529 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2530 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2531 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2532 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2533 
2534 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Return the rounding increment (0 or 1) to add to (v >> shift) for the
 * fixed-point rounding mode vxrm:
 *   0  round-to-nearest-up
 *   1  round-to-nearest-even
 *   3  round-to-odd
 *   any other value (2, round-down) truncates, i.e. the increment is 0.
 *
 * A shift of 0 discards no bits and a shift above 64 is out of range; both
 * yield 0.  The range check is done before any bit is extracted, because
 * extract64() asserts that the requested field lies inside the 64-bit
 * value.  For shift == 64 the result's least-significant bit would be the
 * non-existent bit 64, which is taken as 0.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d, d1;
    uint64_t D1, D2;

    if (shift == 0 || shift > 64) {
        return 0;
    }

    /* d: LSB of the shifted result; d1: most-significant discarded bit */
    d = shift < 64 ? extract64(v, shift, 1) : 0;
    d1 = extract64(v, shift - 1, 1);
    /* D1: all discarded bits */
    D1 = extract64(v, 0, shift);
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            /* D2: discarded bits below d1; nonzero means above the midpoint */
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2561 
2562 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2563                              int32_t b)
2564 {
2565     int64_t res = (int64_t)a + b;
2566     uint8_t round = get_round(vxrm, res, 1);
2567 
2568     return (res >> 1) + round;
2569 }
2570 
2571 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2572                              int64_t b)
2573 {
2574     int64_t res = a + b;
2575     uint8_t round = get_round(vxrm, res, 1);
2576     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2577 
2578     /* With signed overflow, bit 64 is inverse of bit 63. */
2579     return ((res >> 1) ^ over) + round;
2580 }
2581 
2582 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2583 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2584 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2585 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2586 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2587 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2588 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2589 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2590 
2591 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2592 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2593 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2594 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2595 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2596 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2597 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2598 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2599 
2600 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2601                                uint32_t a, uint32_t b)
2602 {
2603     uint64_t res = (uint64_t)a + b;
2604     uint8_t round = get_round(vxrm, res, 1);
2605 
2606     return (res >> 1) + round;
2607 }
2608 
2609 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2610                                uint64_t a, uint64_t b)
2611 {
2612     uint64_t res = a + b;
2613     uint8_t round = get_round(vxrm, res, 1);
2614     uint64_t over = (uint64_t)(res < a) << 63;
2615 
2616     return ((res >> 1) | over) + round;
2617 }
2618 
2619 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2620 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2621 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2622 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2623 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2624 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2625 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2626 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2627 
2628 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2629 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2630 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2631 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2632 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2633 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2634 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2635 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2636 
2637 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2638                              int32_t b)
2639 {
2640     int64_t res = (int64_t)a - b;
2641     uint8_t round = get_round(vxrm, res, 1);
2642 
2643     return (res >> 1) + round;
2644 }
2645 
2646 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2647                              int64_t b)
2648 {
2649     int64_t res = (int64_t)a - b;
2650     uint8_t round = get_round(vxrm, res, 1);
2651     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2652 
2653     /* With signed overflow, bit 64 is inverse of bit 63. */
2654     return ((res >> 1) ^ over) + round;
2655 }
2656 
2657 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2658 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2659 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2660 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2661 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2662 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2663 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2664 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2665 
2666 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2667 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2668 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2669 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2670 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2671 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2672 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2673 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2674 
2675 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2676                                uint32_t a, uint32_t b)
2677 {
2678     int64_t res = (int64_t)a - b;
2679     uint8_t round = get_round(vxrm, res, 1);
2680 
2681     return (res >> 1) + round;
2682 }
2683 
2684 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2685                                uint64_t a, uint64_t b)
2686 {
2687     uint64_t res = (uint64_t)a - b;
2688     uint8_t round = get_round(vxrm, res, 1);
2689     uint64_t over = (uint64_t)(res > a) << 63;
2690 
2691     return ((res >> 1) | over) + round;
2692 }
2693 
2694 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2695 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2696 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2697 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2698 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2699 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2700 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2701 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2702 
2703 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2704 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2705 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2706 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2707 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2708 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2709 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2710 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2711 
2712 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2713 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2714 {
2715     uint8_t round;
2716     int16_t res;
2717 
2718     res = (int16_t)a * (int16_t)b;
2719     round = get_round(vxrm, res, 7);
2720     res = (res >> 7) + round;
2721 
2722     if (res > INT8_MAX) {
2723         env->vxsat = 0x1;
2724         return INT8_MAX;
2725     } else if (res < INT8_MIN) {
2726         env->vxsat = 0x1;
2727         return INT8_MIN;
2728     } else {
2729         return res;
2730     }
2731 }
2732 
2733 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2734 {
2735     uint8_t round;
2736     int32_t res;
2737 
2738     res = (int32_t)a * (int32_t)b;
2739     round = get_round(vxrm, res, 15);
2740     res = (res >> 15) + round;
2741 
2742     if (res > INT16_MAX) {
2743         env->vxsat = 0x1;
2744         return INT16_MAX;
2745     } else if (res < INT16_MIN) {
2746         env->vxsat = 0x1;
2747         return INT16_MIN;
2748     } else {
2749         return res;
2750     }
2751 }
2752 
2753 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2754 {
2755     uint8_t round;
2756     int64_t res;
2757 
2758     res = (int64_t)a * (int64_t)b;
2759     round = get_round(vxrm, res, 31);
2760     res = (res >> 31) + round;
2761 
2762     if (res > INT32_MAX) {
2763         env->vxsat = 0x1;
2764         return INT32_MAX;
2765     } else if (res < INT32_MIN) {
2766         env->vxsat = 0x1;
2767         return INT32_MIN;
2768     } else {
2769         return res;
2770     }
2771 }
2772 
2773 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2774 {
2775     uint8_t round;
2776     uint64_t hi_64, lo_64;
2777     int64_t res;
2778 
2779     if (a == INT64_MIN && b == INT64_MIN) {
2780         env->vxsat = 1;
2781         return INT64_MAX;
2782     }
2783 
2784     muls64(&lo_64, &hi_64, a, b);
2785     round = get_round(vxrm, lo_64, 63);
2786     /*
2787      * Cannot overflow, as there are always
2788      * 2 sign bits after multiply.
2789      */
2790     res = (hi_64 << 1) | (lo_64 >> 63);
2791     if (round) {
2792         if (res == INT64_MAX) {
2793             env->vxsat = 1;
2794         } else {
2795             res += 1;
2796         }
2797     }
2798     return res;
2799 }
2800 
2801 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2802 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2803 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2804 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2805 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2806 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2807 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2808 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2809 
2810 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2811 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2812 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2813 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2814 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2815 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2816 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2817 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2818 
2819 /* Vector Single-Width Scaling Shift Instructions */
2820 static inline uint8_t
2821 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2822 {
2823     uint8_t round, shift = b & 0x7;
2824     uint8_t res;
2825 
2826     round = get_round(vxrm, a, shift);
2827     res = (a >> shift) + round;
2828     return res;
2829 }
2830 static inline uint16_t
2831 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2832 {
2833     uint8_t round, shift = b & 0xf;
2834 
2835     round = get_round(vxrm, a, shift);
2836     return (a >> shift) + round;
2837 }
2838 static inline uint32_t
2839 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2840 {
2841     uint8_t round, shift = b & 0x1f;
2842 
2843     round = get_round(vxrm, a, shift);
2844     return (a >> shift) + round;
2845 }
2846 static inline uint64_t
2847 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2848 {
2849     uint8_t round, shift = b & 0x3f;
2850 
2851     round = get_round(vxrm, a, shift);
2852     return (a >> shift) + round;
2853 }
2854 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2855 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2856 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2857 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2858 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2859 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2860 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2861 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2862 
2863 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2864 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2865 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2866 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2867 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2868 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2869 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2870 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2871 
2872 static inline int8_t
2873 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2874 {
2875     uint8_t round, shift = b & 0x7;
2876 
2877     round = get_round(vxrm, a, shift);
2878     return (a >> shift) + round;
2879 }
2880 static inline int16_t
2881 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2882 {
2883     uint8_t round, shift = b & 0xf;
2884 
2885     round = get_round(vxrm, a, shift);
2886     return (a >> shift) + round;
2887 }
2888 static inline int32_t
2889 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2890 {
2891     uint8_t round, shift = b & 0x1f;
2892 
2893     round = get_round(vxrm, a, shift);
2894     return (a >> shift) + round;
2895 }
2896 static inline int64_t
2897 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2898 {
2899     uint8_t round, shift = b & 0x3f;
2900 
2901     round = get_round(vxrm, a, shift);
2902     return (a >> shift) + round;
2903 }
2904 
2905 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2906 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2907 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2908 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2909 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2910 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2911 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2912 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2913 
2914 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2915 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2916 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2917 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2918 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2919 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2920 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2921 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2922 
2923 /* Vector Narrowing Fixed-Point Clip Instructions */
2924 static inline int8_t
2925 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2926 {
2927     uint8_t round, shift = b & 0xf;
2928     int16_t res;
2929 
2930     round = get_round(vxrm, a, shift);
2931     res = (a >> shift) + round;
2932     if (res > INT8_MAX) {
2933         env->vxsat = 0x1;
2934         return INT8_MAX;
2935     } else if (res < INT8_MIN) {
2936         env->vxsat = 0x1;
2937         return INT8_MIN;
2938     } else {
2939         return res;
2940     }
2941 }
2942 
2943 static inline int16_t
2944 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2945 {
2946     uint8_t round, shift = b & 0x1f;
2947     int32_t res;
2948 
2949     round = get_round(vxrm, a, shift);
2950     res = (a >> shift) + round;
2951     if (res > INT16_MAX) {
2952         env->vxsat = 0x1;
2953         return INT16_MAX;
2954     } else if (res < INT16_MIN) {
2955         env->vxsat = 0x1;
2956         return INT16_MIN;
2957     } else {
2958         return res;
2959     }
2960 }
2961 
2962 static inline int32_t
2963 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2964 {
2965     uint8_t round, shift = b & 0x3f;
2966     int64_t res;
2967 
2968     round = get_round(vxrm, a, shift);
2969     res = (a >> shift) + round;
2970     if (res > INT32_MAX) {
2971         env->vxsat = 0x1;
2972         return INT32_MAX;
2973     } else if (res < INT32_MIN) {
2974         env->vxsat = 0x1;
2975         return INT32_MIN;
2976     } else {
2977         return res;
2978     }
2979 }
2980 
2981 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2982 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2983 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2984 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2985 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2986 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2987 
2988 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2989 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2990 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2991 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2992 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2993 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2994 
2995 static inline uint8_t
2996 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2997 {
2998     uint8_t round, shift = b & 0xf;
2999     uint16_t res;
3000 
3001     round = get_round(vxrm, a, shift);
3002     res = (a >> shift) + round;
3003     if (res > UINT8_MAX) {
3004         env->vxsat = 0x1;
3005         return UINT8_MAX;
3006     } else {
3007         return res;
3008     }
3009 }
3010 
3011 static inline uint16_t
3012 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3013 {
3014     uint8_t round, shift = b & 0x1f;
3015     uint32_t res;
3016 
3017     round = get_round(vxrm, a, shift);
3018     res = (a >> shift) + round;
3019     if (res > UINT16_MAX) {
3020         env->vxsat = 0x1;
3021         return UINT16_MAX;
3022     } else {
3023         return res;
3024     }
3025 }
3026 
3027 static inline uint32_t
3028 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3029 {
3030     uint8_t round, shift = b & 0x3f;
3031     uint64_t res;
3032 
3033     round = get_round(vxrm, a, shift);
3034     res = (a >> shift) + round;
3035     if (res > UINT32_MAX) {
3036         env->vxsat = 0x1;
3037         return UINT32_MAX;
3038     } else {
3039         return res;
3040     }
3041 }
3042 
3043 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3044 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3045 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3046 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3047 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3048 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3049 
3050 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3051 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3052 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3053 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3054 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3055 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3056 
/*
 * Vector Floating-Point Arithmetic Instructions
 */
3060 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * OPFVV2 - define do_<NAME>(), the per-element body of a two-operand
 * vector-vector floating-point operation.
 *
 * Element i of vs1 (type T1, index mapped by HS1) and of vs2 (type T2,
 * index mapped by HS2) are loaded as TX1/TX2, and OP(s2, s1, fp_status)
 * is stored into element HD(i) of vd (type TD).  Note the operand order:
 * the vs2 element is OP's first argument.
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    TD *dest = (TD *)vd + HD(i);                               \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
                                                               \
    *dest = OP(s2, s1, &env->fp_status);                       \
}
3069 
/*
 * GEN_VEXT_VV_ENV - define HELPER(NAME), the vector-vector helper that
 * drives do_<NAME>() over the elements [env->vstart, env->vl).
 *
 * ESZ is the destination element size in bytes.  For each element the
 * helper either calls do_<NAME>() (when vm is set or the v0 mask bit for
 * the element is set) or hands the element's byte range to
 * vext_set_elems_1s() together with vma.  Afterwards vstart is cleared
 * and the tail [vl, total_elems) is handed to vext_set_elems_1s()
 * together with vta.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                         \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (vm || vext_elem_mask(v0, i)) {                \
            /* active element: compute it */              \
            do_##NAME(vd, vs1, vs2, i, env);              \
        } else {                                          \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
        }                                                 \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3099 
3100 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3101 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3102 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3103 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3104 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3105 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3106 
/*
 * OPFVF2 - define do_<NAME>(), the per-element body of a two-operand
 * vector-scalar floating-point operation.
 *
 * The 64-bit scalar s1 is first narrowed to T1 and then converted to
 * TX1.  Element i of vs2 (type T2, index mapped by HS2) is loaded as
 * TX2, and OP(s2, scalar, fp_status) is stored into element HD(i) of vd
 * (type TD).
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TD *dest = (TD *)vd + HD(i);                               \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    TX1 scalar = (TX1)(T1)s1;                                  \
                                                               \
    *dest = OP(s2, scalar, &env->fp_status);                   \
}
3114 
/*
 * GEN_VEXT_VF - define HELPER(NAME), the vector-scalar helper that
 * drives do_<NAME>() over the elements [env->vstart, env->vl).
 *
 * ESZ is the destination element size in bytes and s1 is the scalar
 * operand passed through to do_<NAME>().  For each element the helper
 * either calls do_<NAME>() (when vm is set or the v0 mask bit for the
 * element is set) or hands the element's byte range to
 * vext_set_elems_1s() together with vma.  Afterwards vstart is cleared
 * and the tail [vl, total_elems) is handed to vext_set_elems_1s()
 * together with vta.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                         \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (vm || vext_elem_mask(v0, i)) {                \
            /* active element: compute it */              \
            do_##NAME(vd, s1, vs2, i, env);               \
        } else {                                          \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
        }                                                 \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3144 
3145 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3146 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3147 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3148 GEN_VEXT_VF(vfadd_vf_h, 2)
3149 GEN_VEXT_VF(vfadd_vf_w, 4)
3150 GEN_VEXT_VF(vfadd_vf_d, 8)
3151 
3152 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3153 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3154 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3155 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3156 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3157 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3158 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3159 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3160 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3161 GEN_VEXT_VF(vfsub_vf_h, 2)
3162 GEN_VEXT_VF(vfsub_vf_w, 4)
3163 GEN_VEXT_VF(vfsub_vf_d, 8)
3164 
3165 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3166 {
3167     return float16_sub(b, a, s);
3168 }
3169 
3170 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3171 {
3172     return float32_sub(b, a, s);
3173 }
3174 
3175 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3176 {
3177     return float64_sub(b, a, s);
3178 }
3179 
3180 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3181 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3182 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3183 GEN_VEXT_VF(vfrsub_vf_h, 2)
3184 GEN_VEXT_VF(vfrsub_vf_w, 4)
3185 GEN_VEXT_VF(vfrsub_vf_d, 8)
3186 
3187 /* Vector Widening Floating-Point Add/Subtract Instructions */
3188 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3189 {
3190     return float32_add(float16_to_float32(a, true, s),
3191                        float16_to_float32(b, true, s), s);
3192 }
3193 
3194 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3195 {
3196     return float64_add(float32_to_float64(a, s),
3197                        float32_to_float64(b, s), s);
3198 
3199 }
3200 
3201 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3202 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3203 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3204 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3205 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3206 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3207 GEN_VEXT_VF(vfwadd_vf_h, 4)
3208 GEN_VEXT_VF(vfwadd_vf_w, 8)
3209 
3210 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3211 {
3212     return float32_sub(float16_to_float32(a, true, s),
3213                        float16_to_float32(b, true, s), s);
3214 }
3215 
3216 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3217 {
3218     return float64_sub(float32_to_float64(a, s),
3219                        float32_to_float64(b, s), s);
3220 
3221 }
3222 
3223 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3224 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3225 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3226 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3227 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3228 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3229 GEN_VEXT_VF(vfwsub_vf_h, 4)
3230 GEN_VEXT_VF(vfwsub_vf_w, 8)
3231 
3232 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3233 {
3234     return float32_add(a, float16_to_float32(b, true, s), s);
3235 }
3236 
3237 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3238 {
3239     return float64_add(a, float32_to_float64(b, s), s);
3240 }
3241 
3242 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3243 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3244 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3245 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3246 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3247 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3248 GEN_VEXT_VF(vfwadd_wf_h, 4)
3249 GEN_VEXT_VF(vfwadd_wf_w, 8)
3250 
3251 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3252 {
3253     return float32_sub(a, float16_to_float32(b, true, s), s);
3254 }
3255 
3256 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3257 {
3258     return float64_sub(a, float32_to_float64(b, s), s);
3259 }
3260 
3261 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3262 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3263 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3264 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3265 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3266 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3267 GEN_VEXT_VF(vfwsub_wf_h, 4)
3268 GEN_VEXT_VF(vfwsub_wf_w, 8)
3269 
3270 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3271 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3272 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3273 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3274 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3275 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3276 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3277 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3278 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3279 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3280 GEN_VEXT_VF(vfmul_vf_h, 2)
3281 GEN_VEXT_VF(vfmul_vf_w, 4)
3282 GEN_VEXT_VF(vfmul_vf_d, 8)
3283 
3284 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3285 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3286 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3287 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3288 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3289 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3290 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3291 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3292 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3293 GEN_VEXT_VF(vfdiv_vf_h, 2)
3294 GEN_VEXT_VF(vfdiv_vf_w, 4)
3295 GEN_VEXT_VF(vfdiv_vf_d, 8)
3296 
3297 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3298 {
3299     return float16_div(b, a, s);
3300 }
3301 
3302 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3303 {
3304     return float32_div(b, a, s);
3305 }
3306 
3307 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3308 {
3309     return float64_div(b, a, s);
3310 }
3311 
3312 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3313 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3314 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3315 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3316 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3317 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3318 
3319 /* Vector Widening Floating-Point Multiply */
3320 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3321 {
3322     return float32_mul(float16_to_float32(a, true, s),
3323                        float16_to_float32(b, true, s), s);
3324 }
3325 
3326 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3327 {
3328     return float64_mul(float32_to_float64(a, s),
3329                        float32_to_float64(b, s), s);
3330 
3331 }
3332 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3333 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3334 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3335 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3336 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3337 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3338 GEN_VEXT_VF(vfwmul_vf_h, 4)
3339 GEN_VEXT_VF(vfwmul_vf_w, 8)
3340 
3341 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-vector floating-point operation.
 *
 * In addition to the vs1 and vs2 elements, the current value of the
 * destination element is read and passed to OP as its third argument:
 * vd[HD(i)] = OP(s2, s1, d, fp_status).
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
                      CPURISCVState *env)                          \
{                                                                  \
    TD *dest = (TD *)vd + HD(i);                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *dest;                                                  \
                                                                   \
    *dest = OP(s2, s1, d, &env->fp_status);                        \
}
3351 
3352 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3353 {
3354     return float16_muladd(a, b, d, 0, s);
3355 }
3356 
3357 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3358 {
3359     return float32_muladd(a, b, d, 0, s);
3360 }
3361 
3362 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3363 {
3364     return float64_muladd(a, b, d, 0, s);
3365 }
3366 
3367 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3368 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3369 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3370 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3371 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3372 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3373 
/*
 * OPFVF3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-scalar floating-point operation.
 *
 * The 64-bit scalar s1 is narrowed to T1 and converted to TX1.  The
 * current value of the destination element is read and passed to OP as
 * its third argument: vd[HD(i)] = OP(s2, scalar, d, fp_status).
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
                      CPURISCVState *env)                         \
{                                                                 \
    TD *dest = (TD *)vd + HD(i);                                  \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
    TD d = *dest;                                                 \
    TX1 scalar = (TX1)(T1)s1;                                     \
                                                                  \
    *dest = OP(s2, scalar, d, &env->fp_status);                   \
}
3382 
3383 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3384 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3385 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3386 GEN_VEXT_VF(vfmacc_vf_h, 2)
3387 GEN_VEXT_VF(vfmacc_vf_w, 4)
3388 GEN_VEXT_VF(vfmacc_vf_d, 8)
3389 
3390 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3391 {
3392     return float16_muladd(a, b, d, float_muladd_negate_c |
3393                                    float_muladd_negate_product, s);
3394 }
3395 
3396 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3397 {
3398     return float32_muladd(a, b, d, float_muladd_negate_c |
3399                                    float_muladd_negate_product, s);
3400 }
3401 
3402 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3403 {
3404     return float64_muladd(a, b, d, float_muladd_negate_c |
3405                                    float_muladd_negate_product, s);
3406 }
3407 
3408 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3409 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3410 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3411 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3412 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3413 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3414 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3415 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3416 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3417 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3418 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3419 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3420 
3421 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3422 {
3423     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3424 }
3425 
3426 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3427 {
3428     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3429 }
3430 
3431 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3432 {
3433     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3434 }
3435 
3436 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3437 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3438 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3439 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3440 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3441 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3442 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3443 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3444 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3445 GEN_VEXT_VF(vfmsac_vf_h, 2)
3446 GEN_VEXT_VF(vfmsac_vf_w, 4)
3447 GEN_VEXT_VF(vfmsac_vf_d, 8)
3448 
3449 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3450 {
3451     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3452 }
3453 
3454 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3455 {
3456     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3457 }
3458 
3459 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3460 {
3461     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3462 }
3463 
3464 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3465 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3466 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3467 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3468 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3469 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3470 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3471 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3472 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3473 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3474 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3475 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3476 
3477 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3478 {
3479     return float16_muladd(d, b, a, 0, s);
3480 }
3481 
3482 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3483 {
3484     return float32_muladd(d, b, a, 0, s);
3485 }
3486 
3487 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3488 {
3489     return float64_muladd(d, b, a, 0, s);
3490 }
3491 
3492 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3493 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3494 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3495 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3496 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3497 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3498 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3499 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3500 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3501 GEN_VEXT_VF(vfmadd_vf_h, 2)
3502 GEN_VEXT_VF(vfmadd_vf_w, 4)
3503 GEN_VEXT_VF(vfmadd_vf_d, 8)
3504 
3505 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3506 {
3507     return float16_muladd(d, b, a, float_muladd_negate_c |
3508                                    float_muladd_negate_product, s);
3509 }
3510 
3511 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3512 {
3513     return float32_muladd(d, b, a, float_muladd_negate_c |
3514                                    float_muladd_negate_product, s);
3515 }
3516 
3517 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3518 {
3519     return float64_muladd(d, b, a, float_muladd_negate_c |
3520                                    float_muladd_negate_product, s);
3521 }
3522 
3523 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3524 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3525 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3526 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3527 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3528 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3529 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3530 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3531 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3532 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3533 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3534 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3535 
3536 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3537 {
3538     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3539 }
3540 
3541 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3542 {
3543     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3544 }
3545 
3546 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3547 {
3548     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3549 }
3550 
3551 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3552 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3553 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3554 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3555 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3556 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3557 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3558 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3559 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3560 GEN_VEXT_VF(vfmsub_vf_h, 2)
3561 GEN_VEXT_VF(vfmsub_vf_w, 4)
3562 GEN_VEXT_VF(vfmsub_vf_d, 8)
3563 
3564 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3565 {
3566     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3567 }
3568 
3569 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3570 {
3571     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3572 }
3573 
3574 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3575 {
3576     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3577 }
3578 
3579 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3580 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3581 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3582 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3583 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3584 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3585 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3586 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3587 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3588 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3589 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3590 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3591 
3592 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3593 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3594 {
3595     return float32_muladd(float16_to_float32(a, true, s),
3596                           float16_to_float32(b, true, s), d, 0, s);
3597 }
3598 
3599 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3600 {
3601     return float64_muladd(float32_to_float64(a, s),
3602                           float32_to_float64(b, s), d, 0, s);
3603 }
3604 
3605 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3606 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3607 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3608 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3609 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3610 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3611 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3612 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3613 
3614 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3615 {
3616     return float32_muladd(bfloat16_to_float32(a, s),
3617                           bfloat16_to_float32(b, s), d, 0, s);
3618 }
3619 
3620 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3621 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3622 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3623 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3624 
3625 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3626 {
3627     return float32_muladd(float16_to_float32(a, true, s),
3628                           float16_to_float32(b, true, s), d,
3629                           float_muladd_negate_c | float_muladd_negate_product,
3630                           s);
3631 }
3632 
3633 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3634 {
3635     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3636                           d, float_muladd_negate_c |
3637                              float_muladd_negate_product, s);
3638 }
3639 
3640 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3641 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3642 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3643 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3644 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3645 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3646 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3647 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3648 
3649 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3650 {
3651     return float32_muladd(float16_to_float32(a, true, s),
3652                           float16_to_float32(b, true, s), d,
3653                           float_muladd_negate_c, s);
3654 }
3655 
3656 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3657 {
3658     return float64_muladd(float32_to_float64(a, s),
3659                           float32_to_float64(b, s), d,
3660                           float_muladd_negate_c, s);
3661 }
3662 
3663 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3664 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3665 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3666 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3667 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3668 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3669 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3670 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3671 
3672 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3673 {
3674     return float32_muladd(float16_to_float32(a, true, s),
3675                           float16_to_float32(b, true, s), d,
3676                           float_muladd_negate_product, s);
3677 }
3678 
3679 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3680 {
3681     return float64_muladd(float32_to_float64(a, s),
3682                           float32_to_float64(b, s), d,
3683                           float_muladd_negate_product, s);
3684 }
3685 
/*
 * Instantiate vfwnmsac in its vector-vector (_vv) and vector-scalar (_vf)
 * forms, with fwnmsac16/fwnmsac32 as the element operation.
 * NOTE(review): 4 and 8 are presumably the destination element sizes in
 * bytes -- confirm against GEN_VEXT_VV_ENV/GEN_VEXT_VF.
 */
3686 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3687 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3688 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3689 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3690 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3691 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3692 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3693 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3694 
3695 /* Vector Floating-Point Square-Root Instruction */
/*
 * OPFVV1 emits do_<NAME>(), the per-element body of a unary floating-point
 * operation: element i of vs2 is read as T2 (index adjusted by HS2) into a
 * TX2, OP is applied to it together with &env->fp_status, and the result
 * is stored as TD into element i of vd (index adjusted by HD).
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i,      \
                      CPURISCVState *env)              \
{                                                      \
    TX2 src = *((T2 *)vs2 + HS2(i));                   \
    TD *dst = (TD *)vd + HD(i);                        \
                                                       \
    *dst = OP(src, &env->fp_status);                   \
}
3703 
/*
 * GEN_VEXT_V_ENV emits HELPER(NAME), the entry point of a unary
 * element-wise operation whose element body is do_<NAME>().
 *
 * NAME: helper name; do_<NAME>() must already be defined.
 * ESZ:  element size in bytes, used to turn element indexes into byte
 *       offsets within vd.
 *
 * Elements in [vstart, vl) are processed.  An element that is masked off
 * (vm clear and its bit in v0 clear) is passed to vext_set_elems_1s()
 * together with vma; every other element goes through do_<NAME>().
 * Afterwards vstart is reset and the tail [vl, total_elems) is passed to
 * vext_set_elems_1s() together with vta.
 *
 * The local names are kept as-is because VSTART_CHECK_EARLY_EXIT is defined
 * elsewhere and might refer to them.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)   \
{                                                      \
    uint32_t vl = env->vl;                             \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vma = vext_vma(desc);                     \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t i;                                        \
                                                       \
    VSTART_CHECK_EARLY_EXIT(env);                      \
                                                       \
    if (!vl) {                                         \
        return;                                        \
    }                                                  \
    for (i = env->vstart; i < vl; i++) {               \
        if (vm || vext_elem_mask(v0, i)) {             \
            /* active element */                       \
            do_##NAME(vd, vs2, i, env);                \
        } else {                                       \
            /* masked-off element: 1s if agnostic */   \
            vext_set_elems_1s(vd, vma, i * ESZ,        \
                              (i + 1) * ESZ);          \
        }                                              \
    }                                                  \
    env->vstart = 0;                                   \
    /* tail elements */                                \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
3734 
/*
 * vfsqrt.v for 16-, 32- and 64-bit elements: the element op is the
 * softfloat float{16,32,64}_sqrt routine, and GEN_VEXT_V_ENV supplies the
 * helper with the element size in bytes.
 * NOTE(review): RVVCALL presumably expands OP_UU_* into the TD/T2/TX2 type
 * arguments of OPFVV1 -- confirm against its definition.
 */
3735 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3736 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3737 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3738 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3739 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3740 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3741 
3742 /*
3743  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3744  *
3745  * Adapted from riscv-v-spec recip.c:
3746  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3747  */
/*
 * Core of the 7-bit reciprocal square-root estimate, shared by all widths.
 *
 * f:         raw bits of the operand, laid out (from the top) as one sign
 *            bit, exp_size exponent bits and frac_size fraction bits
 * exp_size:  width of the exponent field in bits
 * frac_size: width of the fraction field in bits
 *
 * Returns the raw bits of the estimate in the same layout, with the sign
 * bit copied from the input.  Zeros, infinities, NaNs and negative operands
 * get no special treatment here; frsqrt7_h/_s/_d filter them out first.
 */
static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
{
    /*
     * Indexed by the low exponent bit followed by the six most significant
     * fraction bits.  Static so the table is not rebuilt on every call.
     */
    static const uint8_t lookup_table[] = {
        52, 51, 50, 48, 47, 46, 44, 43,
        42, 41, 40, 39, 38, 36, 35, 34,
        33, 32, 31, 30, 30, 29, 28, 27,
        26, 25, 24, 23, 23, 22, 21, 20,
        19, 19, 18, 17, 16, 16, 15, 14,
        14, 13, 12, 12, 11, 10, 10, 9,
        9, 8, 7, 7, 6, 6, 5, 4,
        4, 3, 3, 2, 2, 1, 1, 0,
        127, 125, 123, 121, 119, 118, 116, 114,
        113, 111, 109, 108, 106, 105, 103, 102,
        100, 99, 97, 96, 95, 93, 92, 91,
        90, 88, 87, 86, 85, 84, 83, 82,
        80, 79, 78, 77, 76, 75, 74, 73,
        72, 71, 70, 70, 69, 68, 67, 66,
        65, 64, 63, 63, 62, 61, 60, 59,
        59, 58, 57, 56, 56, 55, 54, 53
    };
    const int precision = 7;

    uint64_t sign = extract64(f, frac_size + exp_size, 1);
    uint64_t exp = extract64(f, frac_size, exp_size);
    uint64_t frac = extract64(f, 0, frac_size);

    if (exp == 0 && frac != 0) { /* subnormal */
        /*
         * Normalize the subnormal: shift until the top fraction bit is set.
         * exp is unsigned, so the decrements wrap around; only (exp & 1)
         * and ~exp are consumed below.
         */
        while (extract64(frac, frac_size - 1, 1) == 0) {
            exp--;
            frac <<= 1;
        }

        /* Shift the leading one out of the fraction field. */
        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
    }

    int idx = ((exp & 1) << (precision - 1)) |
              (frac >> (frac_size - precision + 1));
    /* The 7-bit table entry becomes the top bits of the result fraction. */
    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
                        (frac_size - precision);
    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;

    uint64_t val = 0;
    val = deposit64(val, 0, frac_size, out_frac);
    val = deposit64(val, frac_size, exp_size, out_exp);
    val = deposit64(val, frac_size + exp_size, 1, sign);
    return val;
}
3796 
3797 static float16 frsqrt7_h(float16 f, float_status *s)
3798 {
3799     int exp_size = 5, frac_size = 10;
3800     bool sign = float16_is_neg(f);
3801 
3802     /*
3803      * frsqrt7(sNaN) = canonical NaN
3804      * frsqrt7(-inf) = canonical NaN
3805      * frsqrt7(-normal) = canonical NaN
3806      * frsqrt7(-subnormal) = canonical NaN
3807      */
3808     if (float16_is_signaling_nan(f, s) ||
3809         (float16_is_infinity(f) && sign) ||
3810         (float16_is_normal(f) && sign) ||
3811         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3812         s->float_exception_flags |= float_flag_invalid;
3813         return float16_default_nan(s);
3814     }
3815 
3816     /* frsqrt7(qNaN) = canonical NaN */
3817     if (float16_is_quiet_nan(f, s)) {
3818         return float16_default_nan(s);
3819     }
3820 
3821     /* frsqrt7(+-0) = +-inf */
3822     if (float16_is_zero(f)) {
3823         s->float_exception_flags |= float_flag_divbyzero;
3824         return float16_set_sign(float16_infinity, sign);
3825     }
3826 
3827     /* frsqrt7(+inf) = +0 */
3828     if (float16_is_infinity(f) && !sign) {
3829         return float16_set_sign(float16_zero, sign);
3830     }
3831 
3832     /* +normal, +subnormal */
3833     uint64_t val = frsqrt7(f, exp_size, frac_size);
3834     return make_float16(val);
3835 }
3836 
3837 static float32 frsqrt7_s(float32 f, float_status *s)
3838 {
3839     int exp_size = 8, frac_size = 23;
3840     bool sign = float32_is_neg(f);
3841 
3842     /*
3843      * frsqrt7(sNaN) = canonical NaN
3844      * frsqrt7(-inf) = canonical NaN
3845      * frsqrt7(-normal) = canonical NaN
3846      * frsqrt7(-subnormal) = canonical NaN
3847      */
3848     if (float32_is_signaling_nan(f, s) ||
3849         (float32_is_infinity(f) && sign) ||
3850         (float32_is_normal(f) && sign) ||
3851         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3852         s->float_exception_flags |= float_flag_invalid;
3853         return float32_default_nan(s);
3854     }
3855 
3856     /* frsqrt7(qNaN) = canonical NaN */
3857     if (float32_is_quiet_nan(f, s)) {
3858         return float32_default_nan(s);
3859     }
3860 
3861     /* frsqrt7(+-0) = +-inf */
3862     if (float32_is_zero(f)) {
3863         s->float_exception_flags |= float_flag_divbyzero;
3864         return float32_set_sign(float32_infinity, sign);
3865     }
3866 
3867     /* frsqrt7(+inf) = +0 */
3868     if (float32_is_infinity(f) && !sign) {
3869         return float32_set_sign(float32_zero, sign);
3870     }
3871 
3872     /* +normal, +subnormal */
3873     uint64_t val = frsqrt7(f, exp_size, frac_size);
3874     return make_float32(val);
3875 }
3876 
3877 static float64 frsqrt7_d(float64 f, float_status *s)
3878 {
3879     int exp_size = 11, frac_size = 52;
3880     bool sign = float64_is_neg(f);
3881 
3882     /*
3883      * frsqrt7(sNaN) = canonical NaN
3884      * frsqrt7(-inf) = canonical NaN
3885      * frsqrt7(-normal) = canonical NaN
3886      * frsqrt7(-subnormal) = canonical NaN
3887      */
3888     if (float64_is_signaling_nan(f, s) ||
3889         (float64_is_infinity(f) && sign) ||
3890         (float64_is_normal(f) && sign) ||
3891         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3892         s->float_exception_flags |= float_flag_invalid;
3893         return float64_default_nan(s);
3894     }
3895 
3896     /* frsqrt7(qNaN) = canonical NaN */
3897     if (float64_is_quiet_nan(f, s)) {
3898         return float64_default_nan(s);
3899     }
3900 
3901     /* frsqrt7(+-0) = +-inf */
3902     if (float64_is_zero(f)) {
3903         s->float_exception_flags |= float_flag_divbyzero;
3904         return float64_set_sign(float64_infinity, sign);
3905     }
3906 
3907     /* frsqrt7(+inf) = +0 */
3908     if (float64_is_infinity(f) && !sign) {
3909         return float64_set_sign(float64_zero, sign);
3910     }
3911 
3912     /* +normal, +subnormal */
3913     uint64_t val = frsqrt7(f, exp_size, frac_size);
3914     return make_float64(val);
3915 }
3916 
/*
 * vfrsqrt7.v for 16-, 32- and 64-bit elements, built from the
 * frsqrt7_h/_s/_d element ops; GEN_VEXT_V_ENV receives the element size in
 * bytes.
 */
3917 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3918 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3919 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3920 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3921 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3922 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3923 
3924 /*
3925  * Vector Floating-Point Reciprocal Estimate Instruction
3926  *
3927  * Adapted from riscv-v-spec recip.c:
3928  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3929  */
3930 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3931                       float_status *s)
3932 {
3933     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3934     uint64_t exp = extract64(f, frac_size, exp_size);
3935     uint64_t frac = extract64(f, 0, frac_size);
3936 
3937     const uint8_t lookup_table[] = {
3938         127, 125, 123, 121, 119, 117, 116, 114,
3939         112, 110, 109, 107, 105, 104, 102, 100,
3940         99, 97, 96, 94, 93, 91, 90, 88,
3941         87, 85, 84, 83, 81, 80, 79, 77,
3942         76, 75, 74, 72, 71, 70, 69, 68,
3943         66, 65, 64, 63, 62, 61, 60, 59,
3944         58, 57, 56, 55, 54, 53, 52, 51,
3945         50, 49, 48, 47, 46, 45, 44, 43,
3946         42, 41, 40, 40, 39, 38, 37, 36,
3947         35, 35, 34, 33, 32, 31, 31, 30,
3948         29, 28, 28, 27, 26, 25, 25, 24,
3949         23, 23, 22, 21, 21, 20, 19, 19,
3950         18, 17, 17, 16, 15, 15, 14, 14,
3951         13, 12, 12, 11, 11, 10, 9, 9,
3952         8, 8, 7, 7, 6, 5, 5, 4,
3953         4, 3, 3, 2, 2, 1, 1, 0
3954     };
3955     const int precision = 7;
3956 
3957     if (exp == 0 && frac != 0) { /* subnormal */
3958         /* Normalize the subnormal. */
3959         while (extract64(frac, frac_size - 1, 1) == 0) {
3960             exp--;
3961             frac <<= 1;
3962         }
3963 
3964         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3965 
3966         if (exp != 0 && exp != UINT64_MAX) {
3967             /*
3968              * Overflow to inf or max value of same sign,
3969              * depending on sign and rounding mode.
3970              */
3971             s->float_exception_flags |= (float_flag_inexact |
3972                                          float_flag_overflow);
3973 
3974             if ((s->float_rounding_mode == float_round_to_zero) ||
3975                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3976                 ((s->float_rounding_mode == float_round_up) && sign)) {
3977                 /* Return greatest/negative finite value. */
3978                 return (sign << (exp_size + frac_size)) |
3979                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3980             } else {
3981                 /* Return +-inf. */
3982                 return (sign << (exp_size + frac_size)) |
3983                        MAKE_64BIT_MASK(frac_size, exp_size);
3984             }
3985         }
3986     }
3987 
3988     int idx = frac >> (frac_size - precision);
3989     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3990                         (frac_size - precision);
3991     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3992 
3993     if (out_exp == 0 || out_exp == UINT64_MAX) {
3994         /*
3995          * The result is subnormal, but don't raise the underflow exception,
3996          * because there's no additional loss of precision.
3997          */
3998         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3999         if (out_exp == UINT64_MAX) {
4000             out_frac >>= 1;
4001             out_exp = 0;
4002         }
4003     }
4004 
4005     uint64_t val = 0;
4006     val = deposit64(val, 0, frac_size, out_frac);
4007     val = deposit64(val, frac_size, exp_size, out_exp);
4008     val = deposit64(val, frac_size + exp_size, 1, sign);
4009     return val;
4010 }
4011 
4012 static float16 frec7_h(float16 f, float_status *s)
4013 {
4014     int exp_size = 5, frac_size = 10;
4015     bool sign = float16_is_neg(f);
4016 
4017     /* frec7(+-inf) = +-0 */
4018     if (float16_is_infinity(f)) {
4019         return float16_set_sign(float16_zero, sign);
4020     }
4021 
4022     /* frec7(+-0) = +-inf */
4023     if (float16_is_zero(f)) {
4024         s->float_exception_flags |= float_flag_divbyzero;
4025         return float16_set_sign(float16_infinity, sign);
4026     }
4027 
4028     /* frec7(sNaN) = canonical NaN */
4029     if (float16_is_signaling_nan(f, s)) {
4030         s->float_exception_flags |= float_flag_invalid;
4031         return float16_default_nan(s);
4032     }
4033 
4034     /* frec7(qNaN) = canonical NaN */
4035     if (float16_is_quiet_nan(f, s)) {
4036         return float16_default_nan(s);
4037     }
4038 
4039     /* +-normal, +-subnormal */
4040     uint64_t val = frec7(f, exp_size, frac_size, s);
4041     return make_float16(val);
4042 }
4043 
4044 static float32 frec7_s(float32 f, float_status *s)
4045 {
4046     int exp_size = 8, frac_size = 23;
4047     bool sign = float32_is_neg(f);
4048 
4049     /* frec7(+-inf) = +-0 */
4050     if (float32_is_infinity(f)) {
4051         return float32_set_sign(float32_zero, sign);
4052     }
4053 
4054     /* frec7(+-0) = +-inf */
4055     if (float32_is_zero(f)) {
4056         s->float_exception_flags |= float_flag_divbyzero;
4057         return float32_set_sign(float32_infinity, sign);
4058     }
4059 
4060     /* frec7(sNaN) = canonical NaN */
4061     if (float32_is_signaling_nan(f, s)) {
4062         s->float_exception_flags |= float_flag_invalid;
4063         return float32_default_nan(s);
4064     }
4065 
4066     /* frec7(qNaN) = canonical NaN */
4067     if (float32_is_quiet_nan(f, s)) {
4068         return float32_default_nan(s);
4069     }
4070 
4071     /* +-normal, +-subnormal */
4072     uint64_t val = frec7(f, exp_size, frac_size, s);
4073     return make_float32(val);
4074 }
4075 
4076 static float64 frec7_d(float64 f, float_status *s)
4077 {
4078     int exp_size = 11, frac_size = 52;
4079     bool sign = float64_is_neg(f);
4080 
4081     /* frec7(+-inf) = +-0 */
4082     if (float64_is_infinity(f)) {
4083         return float64_set_sign(float64_zero, sign);
4084     }
4085 
4086     /* frec7(+-0) = +-inf */
4087     if (float64_is_zero(f)) {
4088         s->float_exception_flags |= float_flag_divbyzero;
4089         return float64_set_sign(float64_infinity, sign);
4090     }
4091 
4092     /* frec7(sNaN) = canonical NaN */
4093     if (float64_is_signaling_nan(f, s)) {
4094         s->float_exception_flags |= float_flag_invalid;
4095         return float64_default_nan(s);
4096     }
4097 
4098     /* frec7(qNaN) = canonical NaN */
4099     if (float64_is_quiet_nan(f, s)) {
4100         return float64_default_nan(s);
4101     }
4102 
4103     /* +-normal, +-subnormal */
4104     uint64_t val = frec7(f, exp_size, frac_size, s);
4105     return make_float64(val);
4106 }
4107 
/*
 * vfrec7.v for 16-, 32- and 64-bit elements, built from the
 * frec7_h/_s/_d element ops; GEN_VEXT_V_ENV receives the element size in
 * bytes.
 */
4108 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4109 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4110 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4111 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4112 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4113 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4114 
4115 /* Vector Floating-Point MIN/MAX Instructions */
/*
 * vfmin in vector-vector (_vv) and vector-scalar (_vf) forms for 16-, 32-
 * and 64-bit elements; the element op is softfloat's
 * float{16,32,64}_minimum_number.
 */
4116 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4117 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4118 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4119 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4120 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4121 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4122 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4123 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4124 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4125 GEN_VEXT_VF(vfmin_vf_h, 2)
4126 GEN_VEXT_VF(vfmin_vf_w, 4)
4127 GEN_VEXT_VF(vfmin_vf_d, 8)
4128 
/*
 * vfmax in vector-vector (_vv) and vector-scalar (_vf) forms for 16-, 32-
 * and 64-bit elements; the element op is softfloat's
 * float{16,32,64}_maximum_number.
 */
4129 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4130 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4131 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4132 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4133 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4134 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4135 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4136 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4137 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4138 GEN_VEXT_VF(vfmax_vf_h, 2)
4139 GEN_VEXT_VF(vfmax_vf_w, 4)
4140 GEN_VEXT_VF(vfmax_vf_d, 8)
4141 
4142 /* Vector Floating-Point Sign-Injection Instructions */
4143 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4144 {
4145     return deposit64(b, 0, 15, a);
4146 }
4147 
4148 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4149 {
4150     return deposit64(b, 0, 31, a);
4151 }
4152 
4153 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4154 {
4155     return deposit64(b, 0, 63, a);
4156 }
4157 
/*
 * vfsgnj in vector-vector (_vv) and vector-scalar (_vf) forms for 16-, 32-
 * and 64-bit elements, using fsgnj16/32/64 as the element op.
 */
4158 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4159 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4160 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4161 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4162 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4163 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4164 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4165 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4166 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4167 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4168 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4169 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4170 
4171 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4172 {
4173     return deposit64(~b, 0, 15, a);
4174 }
4175 
4176 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4177 {
4178     return deposit64(~b, 0, 31, a);
4179 }
4180 
4181 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4182 {
4183     return deposit64(~b, 0, 63, a);
4184 }
4185 
/*
 * vfsgnjn in vector-vector (_vv) and vector-scalar (_vf) forms for 16-,
 * 32- and 64-bit elements, using fsgnjn16/32/64 as the element op.
 */
4186 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4187 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4188 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4189 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4190 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4191 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4192 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4193 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4194 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4195 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4196 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4197 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4198 
4199 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4200 {
4201     return deposit64(b ^ a, 0, 15, a);
4202 }
4203 
4204 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4205 {
4206     return deposit64(b ^ a, 0, 31, a);
4207 }
4208 
4209 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4210 {
4211     return deposit64(b ^ a, 0, 63, a);
4212 }
4213 
/*
 * vfsgnjx in vector-vector (_vv) and vector-scalar (_vf) forms for 16-,
 * 32- and 64-bit elements, using fsgnjx16/32/64 as the element op.
 */
4214 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4215 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4216 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4217 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4218 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4219 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4220 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4221 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4222 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4223 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4224 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4225 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4226 
4227 /* Vector Floating-Point Compare Instructions */
/*
 * GEN_VEXT_CMP_VV_ENV emits HELPER(NAME) for a vector-vector
 * floating-point compare that writes a mask register.
 *
 * NAME:  helper name
 * ETYPE: element type used to read vs1 and vs2
 * H:     element index adjustment applied to both sources
 * DO_OP: predicate called as DO_OP(vs2[i], vs1[i], &env->fp_status)
 *
 * For each i in [vstart, vl) the predicate result becomes mask bit i of
 * vd.  A masked-off element (vm clear and its bit in v0 clear) has its bit
 * set to 1 when vma is set and is left alone otherwise.  When
 * vext_vta_all_1s(desc) is non-zero, the bits from the end of the body up
 * to vlenb * 8 are set to 1.
 *
 * The local names are kept as-is because VSTART_CHECK_EARLY_EXIT is defined
 * elsewhere and might refer to them.
 */
#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
    uint32_t i;                                               \
                                                              \
    VSTART_CHECK_EARLY_EXIT(env);                             \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
                                                              \
        if (vm || vext_elem_mask(v0, i)) {                    \
            /* active element: store the compare result */    \
            vext_set_elem_mask(vd, i,                         \
                               DO_OP(s2, s1, &env->fp_status)); \
        } else if (vma) {                                     \
            /* masked-off element, mask-agnostic: set to 1 */ \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
    env->vstart = 0;                                          \
    /* Mask destination registers are always tail-agnostic. */ \
    if (vta_all_1s) {                                         \
        /* i continues from the end of the body loop */       \
        while (i < total_elems) {                             \
            vext_set_elem_mask(vd, i, 1);                     \
            i++;                                              \
        }                                                     \
    }                                                         \
}
4265 
/* vmfeq.vv: float{16,32,64}_eq_quiet is used directly as the predicate. */
4266 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4267 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4268 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4269 
/*
 * GEN_VEXT_CMP_VF emits HELPER(NAME) for a vector-scalar floating-point
 * compare that writes a mask register.
 *
 * NAME:  helper name
 * ETYPE: element type used to read vs2; the 64-bit scalar s1 is cast to it
 * H:     element index adjustment applied to vs2
 * DO_OP: predicate called as DO_OP(vs2[i], (ETYPE)s1, &env->fp_status)
 *
 * For each i in [vstart, vl) the predicate result becomes mask bit i of
 * vd.  A masked-off element (vm clear and its bit in v0 clear) has its bit
 * set to 1 when vma is set and is left alone otherwise.  When
 * vext_vta_all_1s(desc) is non-zero, the bits from the end of the body up
 * to vlenb * 8 are set to 1.
 *
 * The local names are kept as-is because VSTART_CHECK_EARLY_EXIT is defined
 * elsewhere and might refer to them.
 */
#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vl = env->vl;                                          \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
    uint32_t i;                                                     \
                                                                    \
    VSTART_CHECK_EARLY_EXIT(env);                                   \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
                                                                    \
        if (vm || vext_elem_mask(v0, i)) {                          \
            /* active element: store the compare result */          \
            vext_set_elem_mask(vd, i,                               \
                               DO_OP(s2, (ETYPE)s1,                 \
                                     &env->fp_status));             \
        } else if (vma) {                                           \
            /* masked-off element, mask-agnostic: set to 1 */       \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
    env->vstart = 0;                                                \
    /* Mask destination registers are always tail-agnostic. */      \
    if (vta_all_1s) {                                               \
        /* i continues from the end of the body loop */             \
        while (i < total_elems) {                                   \
            vext_set_elem_mask(vd, i, 1);                           \
            i++;                                                    \
        }                                                           \
    }                                                               \
}
4306 
/* vmfeq.vf: float{16,32,64}_eq_quiet is used directly as the predicate. */
4307 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4308 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4309 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4310 
4311 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4312 {
4313     FloatRelation compare = float16_compare_quiet(a, b, s);
4314     return compare != float_relation_equal;
4315 }
4316 
4317 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4318 {
4319     FloatRelation compare = float32_compare_quiet(a, b, s);
4320     return compare != float_relation_equal;
4321 }
4322 
4323 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4324 {
4325     FloatRelation compare = float64_compare_quiet(a, b, s);
4326     return compare != float_relation_equal;
4327 }
4328 
/* vmfne in .vv and .vf forms, with vmfne16/32/64 as the predicate. */
4329 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4330 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4331 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4332 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4333 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4334 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4335 
/*
 * vmflt in .vv and .vf forms; float{16,32,64}_lt is used directly as the
 * predicate and is called with the vs2 element as its first operand.
 */
4336 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4337 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4338 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4339 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4340 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4341 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4342 
/*
 * vmfle in .vv and .vf forms; float{16,32,64}_le is used directly as the
 * predicate and is called with the vs2 element as its first operand.
 */
4343 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4344 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4345 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4346 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4347 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4348 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4349 
4350 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4351 {
4352     FloatRelation compare = float16_compare(a, b, s);
4353     return compare == float_relation_greater;
4354 }
4355 
4356 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4357 {
4358     FloatRelation compare = float32_compare(a, b, s);
4359     return compare == float_relation_greater;
4360 }
4361 
4362 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4363 {
4364     FloatRelation compare = float64_compare(a, b, s);
4365     return compare == float_relation_greater;
4366 }
4367 
/* vmfgt.vf with vmfgt16/32/64 as the predicate; no .vv helper is generated here. */
4368 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4369 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4370 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4371 
4372 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4373 {
4374     FloatRelation compare = float16_compare(a, b, s);
4375     return compare == float_relation_greater ||
4376            compare == float_relation_equal;
4377 }
4378 
4379 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4380 {
4381     FloatRelation compare = float32_compare(a, b, s);
4382     return compare == float_relation_greater ||
4383            compare == float_relation_equal;
4384 }
4385 
4386 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4387 {
4388     FloatRelation compare = float64_compare(a, b, s);
4389     return compare == float_relation_greater ||
4390            compare == float_relation_equal;
4391 }
4392 
/* vmfge.vf with vmfge16/32/64 as the predicate; no .vv helper is generated here. */
4393 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4394 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4395 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4396 
4397 /* Vector Floating-Point Classify Instruction */
4398 target_ulong fclass_h(uint64_t frs1)
4399 {
4400     float16 f = frs1;
4401     bool sign = float16_is_neg(f);
4402 
4403     if (float16_is_infinity(f)) {
4404         return sign ? 1 << 0 : 1 << 7;
4405     } else if (float16_is_zero(f)) {
4406         return sign ? 1 << 3 : 1 << 4;
4407     } else if (float16_is_zero_or_denormal(f)) {
4408         return sign ? 1 << 2 : 1 << 5;
4409     } else if (float16_is_any_nan(f)) {
4410         float_status s = { }; /* for snan_bit_is_one */
4411         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4412     } else {
4413         return sign ? 1 << 1 : 1 << 6;
4414     }
4415 }
4416 
4417 target_ulong fclass_s(uint64_t frs1)
4418 {
4419     float32 f = frs1;
4420     bool sign = float32_is_neg(f);
4421 
4422     if (float32_is_infinity(f)) {
4423         return sign ? 1 << 0 : 1 << 7;
4424     } else if (float32_is_zero(f)) {
4425         return sign ? 1 << 3 : 1 << 4;
4426     } else if (float32_is_zero_or_denormal(f)) {
4427         return sign ? 1 << 2 : 1 << 5;
4428     } else if (float32_is_any_nan(f)) {
4429         float_status s = { }; /* for snan_bit_is_one */
4430         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4431     } else {
4432         return sign ? 1 << 1 : 1 << 6;
4433     }
4434 }
4435 
4436 target_ulong fclass_d(uint64_t frs1)
4437 {
4438     float64 f = frs1;
4439     bool sign = float64_is_neg(f);
4440 
4441     if (float64_is_infinity(f)) {
4442         return sign ? 1 << 0 : 1 << 7;
4443     } else if (float64_is_zero(f)) {
4444         return sign ? 1 << 3 : 1 << 4;
4445     } else if (float64_is_zero_or_denormal(f)) {
4446         return sign ? 1 << 2 : 1 << 5;
4447     } else if (float64_is_any_nan(f)) {
4448         float_status s = { }; /* for snan_bit_is_one */
4449         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4450     } else {
4451         return sign ? 1 << 1 : 1 << 6;
4452     }
4453 }
4454 
/*
 * vfclass.v for 16-, 32- and 64-bit elements, using fclass_h/_s/_d.  These
 * take no float_status, so the element op goes through OPIVV1 and the
 * helper through GEN_VEXT_V rather than the OPFVV1/GEN_VEXT_V_ENV pair.
 * NOTE(review): OPIVV1 and GEN_VEXT_V are defined outside this part of the
 * file -- confirm their exact contracts there.
 */
4455 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4456 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4457 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4458 GEN_VEXT_V(vfclass_v_h, 2)
4459 GEN_VEXT_V(vfclass_v_w, 4)
4460 GEN_VEXT_V(vfclass_v_d, 8)
4461 
4462 /* Vector Floating-Point Merge Instruction */
4463 
/*
 * GEN_VFMERGE_VF emits HELPER(NAME) for the floating-point merge.
 *
 * NAME:  helper name
 * ETYPE: element type; sizeof(ETYPE) is the element size in bytes
 * H:     element index adjustment applied to vs2 and vd
 *
 * For each i in [vstart, vl), vd[i] receives the vs2 element when the
 * element is masked off (vm clear and its bit in v0 clear) and the scalar
 * s1, converted to ETYPE, otherwise.  Afterwards vstart is reset and the
 * tail [vl, total_elems) is passed to vext_set_elems_1s() together with
 * vta.
 *
 * The local names are kept as-is because VSTART_CHECK_EARLY_EXIT is defined
 * elsewhere and might refer to them.
 */
#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t vl = env->vl;                                    \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t i;                                               \
                                                              \
    VSTART_CHECK_EARLY_EXIT(env);                             \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE *dst = (ETYPE *)vd + H(i);                      \
                                                              \
        if (vm || vext_elem_mask(v0, i)) {                    \
            *dst = s1;                                        \
        } else {                                              \
            *dst = s2;                                        \
        }                                                     \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}
4487 
4488 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4489 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4490 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4491 
4492 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4493 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4494 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4495 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4496 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4497 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4498 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4499 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4500 
4501 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4502 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4503 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4504 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4505 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4506 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4507 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4508 
4509 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4510 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4511 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4512 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4513 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4514 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4515 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4516 
4517 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4518 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4519 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4520 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4521 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4522 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4523 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4524 
4525 /* Widening Floating-Point/Integer Type-Convert Instructions */
4526 /* (TD, T2, TX2) */
4527 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4528 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4529 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4530 /*
4531  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4532  */
4533 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4534 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4535 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4536 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4537 
4538 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4539 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4540 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4541 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4542 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4543 
4544 /*
4545  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4546  */
4547 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4548 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4549 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4550 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4551 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4552 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4553 
4554 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4555 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4556 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4557 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4558 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4559 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4560 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4561 
4562 /*
4563  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4564  */
4565 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4566 {
4567     return float16_to_float32(a, true, s);
4568 }
4569 
4570 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4571 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4572 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4573 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4574 
4575 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4576 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4577 
4578 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4579 /* (TD, T2, TX2) */
4580 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4581 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4582 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4583 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4584 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4585 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4586 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4587 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4588 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4589 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4590 
4591 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4592 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4593 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4594 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4595 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4596 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4597 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4598 
4599 /*
4600  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4601  */
4602 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4603 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4604 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4605 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4606 
4607 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4608 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4609 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4610 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4611 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4612 
4613 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4614 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4615 {
4616     return float32_to_float16(a, true, s);
4617 }
4618 
4619 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4620 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4621 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4622 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4623 
4624 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4625 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4626 
4627 /*
4628  * Vector Reduction Operations
4629  */
4630 /* Vector Single-Width Integer Reduction Instructions */
4631 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4632 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4633                   void *vs2, CPURISCVState *env,          \
4634                   uint32_t desc)                          \
4635 {                                                         \
4636     uint32_t vm = vext_vm(desc);                          \
4637     uint32_t vl = env->vl;                                \
4638     uint32_t esz = sizeof(TD);                            \
4639     uint32_t vlenb = simd_maxsz(desc);                    \
4640     uint32_t vta = vext_vta(desc);                        \
4641     uint32_t i;                                           \
4642     TD s1 =  *((TD *)vs1 + HD(0));                        \
4643                                                           \
4644     for (i = env->vstart; i < vl; i++) {                  \
4645         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4646         if (!vm && !vext_elem_mask(v0, i)) {              \
4647             continue;                                     \
4648         }                                                 \
4649         s1 = OP(s1, (TD)s2);                              \
4650     }                                                     \
4651     *((TD *)vd + HD(0)) = s1;                             \
4652     env->vstart = 0;                                      \
4653     /* set tail elements to 1s */                         \
4654     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4655 }
4656 
4657 /* vd[0] = sum(vs1[0], vs2[*]) */
4658 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4659 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4660 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4661 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4662 
4663 /* vd[0] = maxu(vs1[0], vs2[*]) */
4664 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4665 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4666 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4667 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4668 
4669 /* vd[0] = max(vs1[0], vs2[*]) */
4670 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4671 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4672 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4673 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4674 
4675 /* vd[0] = minu(vs1[0], vs2[*]) */
4676 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4677 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4678 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4679 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4680 
4681 /* vd[0] = min(vs1[0], vs2[*]) */
4682 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4683 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4684 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4685 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4686 
4687 /* vd[0] = and(vs1[0], vs2[*]) */
4688 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4689 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4690 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4691 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4692 
4693 /* vd[0] = or(vs1[0], vs2[*]) */
4694 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4695 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4696 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4697 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4698 
4699 /* vd[0] = xor(vs1[0], vs2[*]) */
4700 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4701 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4702 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4703 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4704 
4705 /* Vector Widening Integer Reduction Instructions */
4706 /* signed sum reduction into double-width accumulator */
4707 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4708 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4709 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4710 
4711 /* Unsigned sum reduction into double-width accumulator */
4712 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4713 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4714 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4715 
4716 /* Vector Single-Width Floating-Point Reduction Instructions */
4717 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4718 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4719                   void *vs2, CPURISCVState *env,           \
4720                   uint32_t desc)                           \
4721 {                                                          \
4722     uint32_t vm = vext_vm(desc);                           \
4723     uint32_t vl = env->vl;                                 \
4724     uint32_t esz = sizeof(TD);                             \
4725     uint32_t vlenb = simd_maxsz(desc);                     \
4726     uint32_t vta = vext_vta(desc);                         \
4727     uint32_t i;                                            \
4728     TD s1 =  *((TD *)vs1 + HD(0));                         \
4729                                                            \
4730     for (i = env->vstart; i < vl; i++) {                   \
4731         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4732         if (!vm && !vext_elem_mask(v0, i)) {               \
4733             continue;                                      \
4734         }                                                  \
4735         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4736     }                                                      \
4737     *((TD *)vd + HD(0)) = s1;                              \
4738     env->vstart = 0;                                       \
4739     /* set tail elements to 1s */                          \
4740     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4741 }
4742 
4743 /* Unordered sum */
4744 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4745 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4746 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4747 
4748 /* Ordered sum */
4749 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4750 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4751 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4752 
4753 /* Maximum value */
4754 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4755               float16_maximum_number)
4756 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4757               float32_maximum_number)
4758 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4759               float64_maximum_number)
4760 
4761 /* Minimum value */
4762 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4763               float16_minimum_number)
4764 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4765               float32_minimum_number)
4766 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4767               float64_minimum_number)
4768 
4769 /* Vector Widening Floating-Point Add Instructions */
4770 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4771 {
4772     return float32_add(a, float16_to_float32(b, true, s), s);
4773 }
4774 
4775 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4776 {
4777     return float64_add(a, float32_to_float64(b, s), s);
4778 }
4779 
4780 /* Vector Widening Floating-Point Reduction Instructions */
4781 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4782 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4783 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4784 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4785 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4786 
4787 /*
4788  * Vector Mask Operations
4789  */
/* Vector Mask-Register Logical Instructions */

/*
 * GEN_VEXT_MASK_VV - define the mask-logical helper NAME.
 *
 * For every i in [vstart, vl), mask bit i of vd becomes
 * OP(<bit i of vs2>, <bit i of vs1>).  v0 is part of the helper signature
 * but is not read.  When vext_vta_all_1s(desc) is set, the bits from the
 * end of the loop up to vlenb * 8 are written as 1.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                              \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                \
                  void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                \
{                                                               \
    const uint32_t vl = env->vl;                                \
    const uint32_t mask_bits = riscv_cpu_cfg(env)->vlenb << 3;  \
    const uint32_t tail_1s = vext_vta_all_1s(desc);             \
    uint32_t i;                                                 \
                                                                \
    VSTART_CHECK_EARLY_EXIT(env);                               \
                                                                \
    for (i = env->vstart; i < vl; i++) {                        \
        int bit1 = vext_elem_mask(vs1, i);                      \
        int bit2 = vext_elem_mask(vs2, i);                      \
        vext_set_elem_mask(vd, i, OP(bit2, bit1));              \
    }                                                           \
    env->vstart = 0;                                            \
    /*                                                          \
     * mask destination registers are always tail-agnostic:     \
     * set tail elements to 1s                                  \
     */                                                         \
    while (tail_1s && i < mask_bits) {                          \
        vext_set_elem_mask(vd, i, 1);                           \
        i++;                                                    \
    }                                                           \
}
4820 
/*
 * Single-bit logical operators for the mask-register instructions.
 *
 * Operands are expected to be 0 or 1; the negated forms use logical '!'
 * so that the result is again 0 or 1.  Every parameter is parenthesized
 * so that compound argument expressions keep their intended grouping.
 */
#define DO_NAND(N, M)  (!((N) & (M)))
#define DO_ANDNOT(N, M)  ((N) & !(M))
#define DO_NOR(N, M)  (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)  (!((N) ^ (M)))
4826 
4827 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4828 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4829 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4830 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4831 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4832 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4833 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4834 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4835 
4836 /* Vector count population in mask vcpop */
4837 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4838                              uint32_t desc)
4839 {
4840     target_ulong cnt = 0;
4841     uint32_t vm = vext_vm(desc);
4842     uint32_t vl = env->vl;
4843     int i;
4844 
4845     for (i = env->vstart; i < vl; i++) {
4846         if (vm || vext_elem_mask(v0, i)) {
4847             if (vext_elem_mask(vs2, i)) {
4848                 cnt++;
4849             }
4850         }
4851     }
4852     env->vstart = 0;
4853     return cnt;
4854 }
4855 
4856 /* vfirst find-first-set mask bit */
4857 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4858                               uint32_t desc)
4859 {
4860     uint32_t vm = vext_vm(desc);
4861     uint32_t vl = env->vl;
4862     int i;
4863 
4864     for (i = env->vstart; i < vl; i++) {
4865         if (vm || vext_elem_mask(v0, i)) {
4866             if (vext_elem_mask(vs2, i)) {
4867                 return i;
4868             }
4869         }
4870     }
4871     env->vstart = 0;
4872     return -1LL;
4873 }
4874 
/*
 * Selects which active destination bits vmsetm() sets, relative to the
 * first set bit it finds in the source mask.
 */
enum set_mask_type {
    ONLY_FIRST = 1,     /* only the position of the first set bit */
    INCLUDE_FIRST = 2,  /* every position up to and including it */
    BEFORE_FIRST = 3,   /* every position strictly before it */
};
4880 
4881 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4882                    uint32_t desc, enum set_mask_type type)
4883 {
4884     uint32_t vm = vext_vm(desc);
4885     uint32_t vl = env->vl;
4886     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4887     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4888     uint32_t vma = vext_vma(desc);
4889     int i;
4890     bool first_mask_bit = false;
4891 
4892     for (i = env->vstart; i < vl; i++) {
4893         if (!vm && !vext_elem_mask(v0, i)) {
4894             /* set masked-off elements to 1s */
4895             if (vma) {
4896                 vext_set_elem_mask(vd, i, 1);
4897             }
4898             continue;
4899         }
4900         /* write a zero to all following active elements */
4901         if (first_mask_bit) {
4902             vext_set_elem_mask(vd, i, 0);
4903             continue;
4904         }
4905         if (vext_elem_mask(vs2, i)) {
4906             first_mask_bit = true;
4907             if (type == BEFORE_FIRST) {
4908                 vext_set_elem_mask(vd, i, 0);
4909             } else {
4910                 vext_set_elem_mask(vd, i, 1);
4911             }
4912         } else {
4913             if (type == ONLY_FIRST) {
4914                 vext_set_elem_mask(vd, i, 0);
4915             } else {
4916                 vext_set_elem_mask(vd, i, 1);
4917             }
4918         }
4919     }
4920     env->vstart = 0;
4921     /*
4922      * mask destination register are always tail-agnostic
4923      * set tail elements to 1s
4924      */
4925     if (vta_all_1s) {
4926         for (; i < total_elems; i++) {
4927             vext_set_elem_mask(vd, i, 1);
4928         }
4929     }
4930 }
4931 
4932 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4933                      uint32_t desc)
4934 {
4935     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4936 }
4937 
4938 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4939                      uint32_t desc)
4940 {
4941     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4942 }
4943 
4944 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4945                      uint32_t desc)
4946 {
4947     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4948 }
4949 
4950 /* Vector Iota Instruction */
4951 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4952 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4953                   uint32_t desc)                                          \
4954 {                                                                         \
4955     uint32_t vm = vext_vm(desc);                                          \
4956     uint32_t vl = env->vl;                                                \
4957     uint32_t esz = sizeof(ETYPE);                                         \
4958     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4959     uint32_t vta = vext_vta(desc);                                        \
4960     uint32_t vma = vext_vma(desc);                                        \
4961     uint32_t sum = 0;                                                     \
4962     int i;                                                                \
4963                                                                           \
4964     for (i = env->vstart; i < vl; i++) {                                  \
4965         if (!vm && !vext_elem_mask(v0, i)) {                              \
4966             /* set masked-off elements to 1s */                           \
4967             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4968             continue;                                                     \
4969         }                                                                 \
4970         *((ETYPE *)vd + H(i)) = sum;                                      \
4971         if (vext_elem_mask(vs2, i)) {                                     \
4972             sum++;                                                        \
4973         }                                                                 \
4974     }                                                                     \
4975     env->vstart = 0;                                                      \
4976     /* set tail elements to 1s */                                         \
4977     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4978 }
4979 
4980 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4981 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4982 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4983 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4984 
4985 /* Vector Element Index Instruction */
4986 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4987 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4988 {                                                                         \
4989     uint32_t vm = vext_vm(desc);                                          \
4990     uint32_t vl = env->vl;                                                \
4991     uint32_t esz = sizeof(ETYPE);                                         \
4992     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4993     uint32_t vta = vext_vta(desc);                                        \
4994     uint32_t vma = vext_vma(desc);                                        \
4995     int i;                                                                \
4996                                                                           \
4997     VSTART_CHECK_EARLY_EXIT(env);                                         \
4998                                                                           \
4999     for (i = env->vstart; i < vl; i++) {                                  \
5000         if (!vm && !vext_elem_mask(v0, i)) {                              \
5001             /* set masked-off elements to 1s */                           \
5002             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5003             continue;                                                     \
5004         }                                                                 \
5005         *((ETYPE *)vd + H(i)) = i;                                        \
5006     }                                                                     \
5007     env->vstart = 0;                                                      \
5008     /* set tail elements to 1s */                                         \
5009     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5010 }
5011 
5012 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5013 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5014 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5015 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5016 
5017 /*
5018  * Vector Permutation Instructions
5019  */
5020 
5021 /* Vector Slide Instructions */
5022 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5023 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5024                   CPURISCVState *env, uint32_t desc)                      \
5025 {                                                                         \
5026     uint32_t vm = vext_vm(desc);                                          \
5027     uint32_t vl = env->vl;                                                \
5028     uint32_t esz = sizeof(ETYPE);                                         \
5029     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5030     uint32_t vta = vext_vta(desc);                                        \
5031     uint32_t vma = vext_vma(desc);                                        \
5032     target_ulong offset = s1, i_min, i;                                   \
5033                                                                           \
5034     VSTART_CHECK_EARLY_EXIT(env);                                         \
5035                                                                           \
5036     i_min = MAX(env->vstart, offset);                                     \
5037     for (i = i_min; i < vl; i++) {                                        \
5038         if (!vm && !vext_elem_mask(v0, i)) {                              \
5039             /* set masked-off elements to 1s */                           \
5040             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5041             continue;                                                     \
5042         }                                                                 \
5043         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5044     }                                                                     \
5045     env->vstart = 0;                                                      \
5046     /* set tail elements to 1s */                                         \
5047     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5048 }
5049 
5050 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5051 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5052 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5053 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5054 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5055 
5056 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5057 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5058                   CPURISCVState *env, uint32_t desc)                      \
5059 {                                                                         \
5060     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5061     uint32_t vm = vext_vm(desc);                                          \
5062     uint32_t vl = env->vl;                                                \
5063     uint32_t esz = sizeof(ETYPE);                                         \
5064     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5065     uint32_t vta = vext_vta(desc);                                        \
5066     uint32_t vma = vext_vma(desc);                                        \
5067     target_ulong i_max, i_min, i;                                         \
5068                                                                           \
5069     VSTART_CHECK_EARLY_EXIT(env);                                         \
5070                                                                           \
5071     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5072     i_max = MAX(i_min, env->vstart);                                      \
5073     for (i = env->vstart; i < i_max; ++i) {                               \
5074         if (!vm && !vext_elem_mask(v0, i)) {                              \
5075             /* set masked-off elements to 1s */                           \
5076             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5077             continue;                                                     \
5078         }                                                                 \
5079         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5080     }                                                                     \
5081                                                                           \
5082     for (i = i_max; i < vl; ++i) {                                        \
5083         if (vm || vext_elem_mask(v0, i)) {                                \
5084             *((ETYPE *)vd + H(i)) = 0;                                    \
5085         }                                                                 \
5086     }                                                                     \
5087                                                                           \
5088     env->vstart = 0;                                                      \
5089     /* set tail elements to 1s */                                         \
5090     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5091 }
5092 
5093 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5094 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5095 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5096 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5097 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5098 
5099 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5100 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5101                                  void *vs2, CPURISCVState *env,             \
5102                                  uint32_t desc)                             \
5103 {                                                                           \
5104     typedef uint##BITWIDTH##_t ETYPE;                                       \
5105     uint32_t vm = vext_vm(desc);                                            \
5106     uint32_t vl = env->vl;                                                  \
5107     uint32_t esz = sizeof(ETYPE);                                           \
5108     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5109     uint32_t vta = vext_vta(desc);                                          \
5110     uint32_t vma = vext_vma(desc);                                          \
5111     uint32_t i;                                                             \
5112                                                                             \
5113     VSTART_CHECK_EARLY_EXIT(env);                                           \
5114                                                                             \
5115     for (i = env->vstart; i < vl; i++) {                                    \
5116         if (!vm && !vext_elem_mask(v0, i)) {                                \
5117             /* set masked-off elements to 1s */                             \
5118             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5119             continue;                                                       \
5120         }                                                                   \
5121         if (i == 0) {                                                       \
5122             *((ETYPE *)vd + H(i)) = s1;                                     \
5123         } else {                                                            \
5124             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5125         }                                                                   \
5126     }                                                                       \
5127     env->vstart = 0;                                                        \
5128     /* set tail elements to 1s */                                           \
5129     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5130 }
5131 
5132 GEN_VEXT_VSLIE1UP(8,  H1)
5133 GEN_VEXT_VSLIE1UP(16, H2)
5134 GEN_VEXT_VSLIE1UP(32, H4)
5135 GEN_VEXT_VSLIE1UP(64, H8)
5136 
5137 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5138 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5139                   CPURISCVState *env, uint32_t desc)              \
5140 {                                                                 \
5141     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5142 }
5143 
5144 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5145 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5146 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5147 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5148 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5149 
5150 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5151 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5152                                    void *vs2, CPURISCVState *env,             \
5153                                    uint32_t desc)                             \
5154 {                                                                             \
5155     typedef uint##BITWIDTH##_t ETYPE;                                         \
5156     uint32_t vm = vext_vm(desc);                                              \
5157     uint32_t vl = env->vl;                                                    \
5158     uint32_t esz = sizeof(ETYPE);                                             \
5159     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5160     uint32_t vta = vext_vta(desc);                                            \
5161     uint32_t vma = vext_vma(desc);                                            \
5162     uint32_t i;                                                               \
5163                                                                               \
5164     VSTART_CHECK_EARLY_EXIT(env);                                             \
5165                                                                               \
5166     for (i = env->vstart; i < vl; i++) {                                      \
5167         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5168             /* set masked-off elements to 1s */                               \
5169             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5170             continue;                                                         \
5171         }                                                                     \
5172         if (i == vl - 1) {                                                    \
5173             *((ETYPE *)vd + H(i)) = s1;                                       \
5174         } else {                                                              \
5175             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5176         }                                                                     \
5177     }                                                                         \
5178     env->vstart = 0;                                                          \
5179     /* set tail elements to 1s */                                             \
5180     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5181 }
5182 
5183 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5184 GEN_VEXT_VSLIDE1DOWN(16, H2)
5185 GEN_VEXT_VSLIDE1DOWN(32, H4)
5186 GEN_VEXT_VSLIDE1DOWN(64, H8)
5187 
5188 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5189 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5190                   CPURISCVState *env, uint32_t desc)              \
5191 {                                                                 \
5192     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5193 }
5194 
5195 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5196 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5197 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5198 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5199 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5200 
5201 /* Vector Floating-Point Slide Instructions */
5202 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5203 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5204                   CPURISCVState *env, uint32_t desc)          \
5205 {                                                             \
5206     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5207 }
5208 
5209 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5210 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5211 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5212 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5213 
5214 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5215 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5216                   CPURISCVState *env, uint32_t desc)          \
5217 {                                                             \
5218     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5219 }
5220 
5221 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5222 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5223 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5224 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5225 
5226 /* Vector Register Gather Instruction */
5227 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5228 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5229                   CPURISCVState *env, uint32_t desc)                      \
5230 {                                                                         \
5231     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5232     uint32_t vm = vext_vm(desc);                                          \
5233     uint32_t vl = env->vl;                                                \
5234     uint32_t esz = sizeof(TS2);                                           \
5235     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5236     uint32_t vta = vext_vta(desc);                                        \
5237     uint32_t vma = vext_vma(desc);                                        \
5238     uint64_t index;                                                       \
5239     uint32_t i;                                                           \
5240                                                                           \
5241     VSTART_CHECK_EARLY_EXIT(env);                                         \
5242                                                                           \
5243     for (i = env->vstart; i < vl; i++) {                                  \
5244         if (!vm && !vext_elem_mask(v0, i)) {                              \
5245             /* set masked-off elements to 1s */                           \
5246             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5247             continue;                                                     \
5248         }                                                                 \
5249         index = *((TS1 *)vs1 + HS1(i));                                   \
5250         if (index >= vlmax) {                                             \
5251             *((TS2 *)vd + HS2(i)) = 0;                                    \
5252         } else {                                                          \
5253             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5254         }                                                                 \
5255     }                                                                     \
5256     env->vstart = 0;                                                      \
5257     /* set tail elements to 1s */                                         \
5258     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5259 }
5260 
5261 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5262 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5263 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5264 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5265 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5266 
5267 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5268 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5269 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5270 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5271 
5272 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5273 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5274                   CPURISCVState *env, uint32_t desc)                      \
5275 {                                                                         \
5276     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5277     uint32_t vm = vext_vm(desc);                                          \
5278     uint32_t vl = env->vl;                                                \
5279     uint32_t esz = sizeof(ETYPE);                                         \
5280     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5281     uint32_t vta = vext_vta(desc);                                        \
5282     uint32_t vma = vext_vma(desc);                                        \
5283     uint64_t index = s1;                                                  \
5284     uint32_t i;                                                           \
5285                                                                           \
5286     VSTART_CHECK_EARLY_EXIT(env);                                         \
5287                                                                           \
5288     for (i = env->vstart; i < vl; i++) {                                  \
5289         if (!vm && !vext_elem_mask(v0, i)) {                              \
5290             /* set masked-off elements to 1s */                           \
5291             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5292             continue;                                                     \
5293         }                                                                 \
5294         if (index >= vlmax) {                                             \
5295             *((ETYPE *)vd + H(i)) = 0;                                    \
5296         } else {                                                          \
5297             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5298         }                                                                 \
5299     }                                                                     \
5300     env->vstart = 0;                                                      \
5301     /* set tail elements to 1s */                                         \
5302     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5303 }
5304 
5305 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5306 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5307 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5308 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5309 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5310 
5311 /* Vector Compress Instruction */
5312 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5313 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5314                   CPURISCVState *env, uint32_t desc)                      \
5315 {                                                                         \
5316     uint32_t vl = env->vl;                                                \
5317     uint32_t esz = sizeof(ETYPE);                                         \
5318     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5319     uint32_t vta = vext_vta(desc);                                        \
5320     uint32_t num = 0, i;                                                  \
5321                                                                           \
5322     for (i = env->vstart; i < vl; i++) {                                  \
5323         if (!vext_elem_mask(vs1, i)) {                                    \
5324             continue;                                                     \
5325         }                                                                 \
5326         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5327         num++;                                                            \
5328     }                                                                     \
5329     env->vstart = 0;                                                      \
5330     /* set tail elements to 1s */                                         \
5331     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5332 }
5333 
5334 /* Compress into vd elements of vs2 where vs1 is enabled */
5335 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5336 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5337 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5338 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5339 
5340 /* Vector Whole Register Move */
5341 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5342 {
5343     /* EEW = SEW */
5344     uint32_t maxsz = simd_maxsz(desc);
5345     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5346     uint32_t startb = env->vstart * sewb;
5347     uint32_t i = startb;
5348 
5349     if (startb >= maxsz) {
5350         env->vstart = 0;
5351         return;
5352     }
5353 
5354     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5355         uint32_t j = ROUND_UP(i, 8);
5356         memcpy((uint8_t *)vd + H1(j - 1),
5357                (uint8_t *)vs2 + H1(j - 1),
5358                j - i);
5359         i = j;
5360     }
5361 
5362     memcpy((uint8_t *)vd + H1(i),
5363            (uint8_t *)vs2 + H1(i),
5364            maxsz - i);
5365 
5366     env->vstart = 0;
5367 }
5368 
5369 /* Vector Integer Extension */
5370 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5371 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5372                   CPURISCVState *env, uint32_t desc)             \
5373 {                                                                \
5374     uint32_t vl = env->vl;                                       \
5375     uint32_t vm = vext_vm(desc);                                 \
5376     uint32_t esz = sizeof(ETYPE);                                \
5377     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5378     uint32_t vta = vext_vta(desc);                               \
5379     uint32_t vma = vext_vma(desc);                               \
5380     uint32_t i;                                                  \
5381                                                                  \
5382     VSTART_CHECK_EARLY_EXIT(env);                                \
5383                                                                  \
5384     for (i = env->vstart; i < vl; i++) {                         \
5385         if (!vm && !vext_elem_mask(v0, i)) {                     \
5386             /* set masked-off elements to 1s */                  \
5387             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5388             continue;                                            \
5389         }                                                        \
5390         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5391     }                                                            \
5392     env->vstart = 0;                                             \
5393     /* set tail elements to 1s */                                \
5394     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5395 }
5396 
5397 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5398 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5399 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5400 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5401 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5402 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5403 
5404 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5405 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5406 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5407 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5408 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5409 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5410