xref: /openbmc/qemu/target/riscv/vector_helper.c (revision dbdf841b)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
54         /* only set vill bit. */
55         env->vill = 1;
56         env->vtype = 0;
57         env->vl = 0;
58         env->vstart = 0;
59         return 0;
60     }
61 
62     vlmax = vext_get_vlmax(cpu, s2);
63     if (s1 <= vlmax) {
64         vl = s1;
65     } else {
66         vl = vlmax;
67     }
68     env->vl = vl;
69     env->vtype = s2;
70     env->vstart = 0;
71     env->vill = 0;
72     return vl;
73 }
74 
75 /*
76  * Note that vector data is stored in host-endian 64-bit chunks,
77  * so addressing units smaller than that needs a host-endian fixup.
78  */
79 #if HOST_BIG_ENDIAN
80 #define H1(x)   ((x) ^ 7)
81 #define H1_2(x) ((x) ^ 6)
82 #define H1_4(x) ((x) ^ 4)
83 #define H2(x)   ((x) ^ 3)
84 #define H4(x)   ((x) ^ 1)
85 #define H8(x)   ((x))
86 #else
87 #define H1(x)   (x)
88 #define H1_2(x) (x)
89 #define H1_4(x) (x)
90 #define H2(x)   (x)
91 #define H4(x)   (x)
92 #define H8(x)   (x)
93 #endif
94 
95 static inline uint32_t vext_nf(uint32_t desc)
96 {
97     return FIELD_EX32(simd_data(desc), VDATA, NF);
98 }
99 
100 static inline uint32_t vext_vm(uint32_t desc)
101 {
102     return FIELD_EX32(simd_data(desc), VDATA, VM);
103 }
104 
105 /*
106  * Encode LMUL to lmul as following:
107  *     LMUL    vlmul    lmul
108  *      1       000       0
109  *      2       001       1
110  *      4       010       2
111  *      8       011       3
112  *      -       100       -
113  *     1/8      101      -3
114  *     1/4      110      -2
115  *     1/2      111      -1
116  */
117 static inline int32_t vext_lmul(uint32_t desc)
118 {
119     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
120 }
121 
122 static inline uint32_t vext_vta(uint32_t desc)
123 {
124     return FIELD_EX32(simd_data(desc), VDATA, VTA);
125 }
126 
127 static inline uint32_t vext_vma(uint32_t desc)
128 {
129     return FIELD_EX32(simd_data(desc), VDATA, VMA);
130 }
131 
132 static inline uint32_t vext_vta_all_1s(uint32_t desc)
133 {
134     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
135 }
136 
137 /*
138  * Get the maximum number of elements can be operated.
139  *
140  * log2_esz: log2 of element size in bytes.
141  */
142 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
143 {
144     /*
145      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
146      * so vlen in bytes (vlenb) is encoded as maxsz.
147      */
148     uint32_t vlenb = simd_maxsz(desc);
149 
150     /* Return VLMAX */
151     int scale = vext_lmul(desc) - log2_esz;
152     return scale < 0 ? vlenb >> -scale : vlenb << scale;
153 }
154 
155 /*
156  * Get number of total elements, including prestart, body and tail elements.
157  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
158  * are held in the same vector register.
159  */
160 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
161                                             uint32_t esz)
162 {
163     uint32_t vlenb = simd_maxsz(desc);
164     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
165     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
166                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
167     return (vlenb << emul) / esz;
168 }
169 
170 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
171 {
172     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
173 }
174 
175 /*
176  * This function checks watchpoint before real load operation.
177  *
178  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
179  * In user mode, there is no watchpoint support now.
180  *
181  * It will trigger an exception if there is no mapping in TLB
182  * and page table walk can't fill the TLB entry. Then the guest
183  * software can return here after process the exception or never return.
184  */
185 static void probe_pages(CPURISCVState *env, target_ulong addr,
186                         target_ulong len, uintptr_t ra,
187                         MMUAccessType access_type)
188 {
189     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
190     target_ulong curlen = MIN(pagelen, len);
191 
192     probe_access(env, adjust_addr(env, addr), curlen, access_type,
193                  cpu_mmu_index(env, false), ra);
194     if (len > curlen) {
195         addr += curlen;
196         curlen = len - curlen;
197         probe_access(env, adjust_addr(env, addr), curlen, access_type,
198                      cpu_mmu_index(env, false), ra);
199     }
200 }
201 
202 /* set agnostic elements to 1s */
203 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
204                               uint32_t tot)
205 {
206     if (is_agnostic == 0) {
207         /* policy undisturbed */
208         return;
209     }
210     if (tot - cnt == 0) {
211         return;
212     }
213     memset(base + cnt, -1, tot - cnt);
214 }
215 
216 static inline void vext_set_elem_mask(void *v0, int index,
217                                       uint8_t value)
218 {
219     int idx = index / 64;
220     int pos = index % 64;
221     uint64_t old = ((uint64_t *)v0)[idx];
222     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
223 }
224 
225 /*
226  * Earlier designs (pre-0.9) had a varying number of bits
227  * per mask value (MLEN). In the 0.9 design, MLEN=1.
228  * (Section 4.5)
229  */
230 static inline int vext_elem_mask(void *v0, int index)
231 {
232     int idx = index / 64;
233     int pos = index  % 64;
234     return (((uint64_t *)v0)[idx] >> pos) & 1;
235 }
236 
237 /* elements operations for load and store */
238 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
239                                uint32_t idx, void *vd, uintptr_t retaddr);
240 
241 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
242 static void NAME(CPURISCVState *env, abi_ptr addr,         \
243                  uint32_t idx, void *vd, uintptr_t retaddr)\
244 {                                                          \
245     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
246     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
247 }                                                          \
248 
249 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
250 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
251 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
252 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
253 
254 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
255 static void NAME(CPURISCVState *env, abi_ptr addr,         \
256                  uint32_t idx, void *vd, uintptr_t retaddr)\
257 {                                                          \
258     ETYPE data = *((ETYPE *)vd + H(idx));                  \
259     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
260 }
261 
262 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
263 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
264 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
265 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
266 
267 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
268                                    uint32_t desc, uint32_t nf,
269                                    uint32_t esz, uint32_t max_elems)
270 {
271     uint32_t vta = vext_vta(desc);
272     int k;
273 
274     if (vta == 0) {
275         return;
276     }
277 
278     for (k = 0; k < nf; ++k) {
279         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
280                           (k * max_elems + max_elems) * esz);
281     }
282 }
283 
284 /*
285  * stride: access vector element from strided memory
286  */
287 static void
288 vext_ldst_stride(void *vd, void *v0, target_ulong base,
289                  target_ulong stride, CPURISCVState *env,
290                  uint32_t desc, uint32_t vm,
291                  vext_ldst_elem_fn *ldst_elem,
292                  uint32_t log2_esz, uintptr_t ra)
293 {
294     uint32_t i, k;
295     uint32_t nf = vext_nf(desc);
296     uint32_t max_elems = vext_max_elems(desc, log2_esz);
297     uint32_t esz = 1 << log2_esz;
298     uint32_t vma = vext_vma(desc);
299 
300     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
301         k = 0;
302         while (k < nf) {
303             if (!vm && !vext_elem_mask(v0, i)) {
304                 /* set masked-off elements to 1s */
305                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
306                                   (i + k * max_elems + 1) * esz);
307                 k++;
308                 continue;
309             }
310             target_ulong addr = base + stride * i + (k << log2_esz);
311             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
312             k++;
313         }
314     }
315     env->vstart = 0;
316 
317     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
318 }
319 
320 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
321 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
322                   target_ulong stride, CPURISCVState *env,              \
323                   uint32_t desc)                                        \
324 {                                                                       \
325     uint32_t vm = vext_vm(desc);                                        \
326     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
327                      ctzl(sizeof(ETYPE)), GETPC());                     \
328 }
329 
330 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
331 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
332 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
333 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
334 
335 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
336 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
337                   target_ulong stride, CPURISCVState *env,              \
338                   uint32_t desc)                                        \
339 {                                                                       \
340     uint32_t vm = vext_vm(desc);                                        \
341     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
342                      ctzl(sizeof(ETYPE)), GETPC());                     \
343 }
344 
345 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
346 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
347 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
348 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
349 
350 /*
351  * unit-stride: access elements stored contiguously in memory
352  */
353 
354 /* unmasked unit-stride load and store operation */
355 static void
356 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
357              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
358              uintptr_t ra)
359 {
360     uint32_t i, k;
361     uint32_t nf = vext_nf(desc);
362     uint32_t max_elems = vext_max_elems(desc, log2_esz);
363     uint32_t esz = 1 << log2_esz;
364 
365     /* load bytes from guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375 
376     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
377 }
378 
379 /*
380  * masked unit-stride load and store operation will be a special case of
381  * stride, stride = NF * sizeof (ETYPE)
382  */
383 
384 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
385 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
386                          CPURISCVState *env, uint32_t desc)             \
387 {                                                                       \
388     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
389     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
390                      ctzl(sizeof(ETYPE)), GETPC());                     \
391 }                                                                       \
392                                                                         \
393 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
394                   CPURISCVState *env, uint32_t desc)                    \
395 {                                                                       \
396     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
397                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
398 }
399 
400 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
401 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
402 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
403 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
404 
405 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
406 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
407                          CPURISCVState *env, uint32_t desc)              \
408 {                                                                        \
409     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
410     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
411                      ctzl(sizeof(ETYPE)), GETPC());                      \
412 }                                                                        \
413                                                                          \
414 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
415                   CPURISCVState *env, uint32_t desc)                     \
416 {                                                                        \
417     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
418                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
419 }
420 
421 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
422 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
423 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
424 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
425 
426 /*
427  * unit stride mask load and store, EEW = 1
428  */
429 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
430                     CPURISCVState *env, uint32_t desc)
431 {
432     /* evl = ceil(vl/8) */
433     uint8_t evl = (env->vl + 7) >> 3;
434     vext_ldst_us(vd, base, env, desc, lde_b,
435                  0, evl, GETPC());
436 }
437 
438 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
439                     CPURISCVState *env, uint32_t desc)
440 {
441     /* evl = ceil(vl/8) */
442     uint8_t evl = (env->vl + 7) >> 3;
443     vext_ldst_us(vd, base, env, desc, ste_b,
444                  0, evl, GETPC());
445 }
446 
447 /*
448  * index: access vector element from indexed memory
449  */
450 typedef target_ulong vext_get_index_addr(target_ulong base,
451         uint32_t idx, void *vs2);
452 
453 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
454 static target_ulong NAME(target_ulong base,            \
455                          uint32_t idx, void *vs2)      \
456 {                                                      \
457     return (base + *((ETYPE *)vs2 + H(idx)));          \
458 }
459 
460 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
461 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
462 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
463 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
464 
465 static inline void
466 vext_ldst_index(void *vd, void *v0, target_ulong base,
467                 void *vs2, CPURISCVState *env, uint32_t desc,
468                 vext_get_index_addr get_index_addr,
469                 vext_ldst_elem_fn *ldst_elem,
470                 uint32_t log2_esz, uintptr_t ra)
471 {
472     uint32_t i, k;
473     uint32_t nf = vext_nf(desc);
474     uint32_t vm = vext_vm(desc);
475     uint32_t max_elems = vext_max_elems(desc, log2_esz);
476     uint32_t esz = 1 << log2_esz;
477     uint32_t vma = vext_vma(desc);
478 
479     /* load bytes from guest memory */
480     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
481         k = 0;
482         while (k < nf) {
483             if (!vm && !vext_elem_mask(v0, i)) {
484                 /* set masked-off elements to 1s */
485                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
486                                   (i + k * max_elems + 1) * esz);
487                 k++;
488                 continue;
489             }
490             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
491             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
492             k++;
493         }
494     }
495     env->vstart = 0;
496 
497     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
498 }
499 
500 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
501 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
502                   void *vs2, CPURISCVState *env, uint32_t desc)            \
503 {                                                                          \
504     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
505                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
506 }
507 
508 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
509 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
510 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
511 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
512 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
513 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
514 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
515 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
516 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
517 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
518 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
519 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
520 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
521 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
522 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
523 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
524 
525 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
526 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
527                   void *vs2, CPURISCVState *env, uint32_t desc)  \
528 {                                                                \
529     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
530                     STORE_FN, ctzl(sizeof(ETYPE)),               \
531                     GETPC());                                    \
532 }
533 
534 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
535 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
536 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
537 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
538 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
539 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
540 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
541 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
542 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
543 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
544 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
545 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
546 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
547 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
548 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
549 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
550 
551 /*
552  * unit-stride fault-only-fisrt load instructions
553  */
554 static inline void
555 vext_ldff(void *vd, void *v0, target_ulong base,
556           CPURISCVState *env, uint32_t desc,
557           vext_ldst_elem_fn *ldst_elem,
558           uint32_t log2_esz, uintptr_t ra)
559 {
560     void *host;
561     uint32_t i, k, vl = 0;
562     uint32_t nf = vext_nf(desc);
563     uint32_t vm = vext_vm(desc);
564     uint32_t max_elems = vext_max_elems(desc, log2_esz);
565     uint32_t esz = 1 << log2_esz;
566     uint32_t vma = vext_vma(desc);
567     target_ulong addr, offset, remain;
568 
569     /* probe every access */
570     for (i = env->vstart; i < env->vl; i++) {
571         if (!vm && !vext_elem_mask(v0, i)) {
572             continue;
573         }
574         addr = adjust_addr(env, base + i * (nf << log2_esz));
575         if (i == 0) {
576             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
577         } else {
578             /* if it triggers an exception, no need to check watchpoint */
579             remain = nf << log2_esz;
580             while (remain > 0) {
581                 offset = -(addr | TARGET_PAGE_MASK);
582                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
583                                          cpu_mmu_index(env, false));
584                 if (host) {
585 #ifdef CONFIG_USER_ONLY
586                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
587                         vl = i;
588                         goto ProbeSuccess;
589                     }
590 #else
591                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
592 #endif
593                 } else {
594                     vl = i;
595                     goto ProbeSuccess;
596                 }
597                 if (remain <=  offset) {
598                     break;
599                 }
600                 remain -= offset;
601                 addr = adjust_addr(env, addr + offset);
602             }
603         }
604     }
605 ProbeSuccess:
606     /* load bytes from guest memory */
607     if (vl != 0) {
608         env->vl = vl;
609     }
610     for (i = env->vstart; i < env->vl; i++) {
611         k = 0;
612         while (k < nf) {
613             if (!vm && !vext_elem_mask(v0, i)) {
614                 /* set masked-off elements to 1s */
615                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
616                                   (i + k * max_elems + 1) * esz);
617                 k++;
618                 continue;
619             }
620             target_ulong addr = base + ((i * nf + k) << log2_esz);
621             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
622             k++;
623         }
624     }
625     env->vstart = 0;
626 
627     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
628 }
629 
630 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
631 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
632                   CPURISCVState *env, uint32_t desc)      \
633 {                                                         \
634     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
635               ctzl(sizeof(ETYPE)), GETPC());              \
636 }
637 
638 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
639 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
640 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
641 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
642 
643 #define DO_SWAP(N, M) (M)
644 #define DO_AND(N, M)  (N & M)
645 #define DO_XOR(N, M)  (N ^ M)
646 #define DO_OR(N, M)   (N | M)
647 #define DO_ADD(N, M)  (N + M)
648 
649 /* Signed min/max */
650 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
651 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
652 
653 /*
654  * load and store whole register instructions
655  */
656 static void
657 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
658                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
659 {
660     uint32_t i, k, off, pos;
661     uint32_t nf = vext_nf(desc);
662     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
663     uint32_t max_elems = vlenb >> log2_esz;
664 
665     k = env->vstart / max_elems;
666     off = env->vstart % max_elems;
667 
668     if (off) {
669         /* load/store rest of elements of current segment pointed by vstart */
670         for (pos = off; pos < max_elems; pos++, env->vstart++) {
671             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
672             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
673                       ra);
674         }
675         k++;
676     }
677 
678     /* load/store elements for rest of segments */
679     for (; k < nf; k++) {
680         for (i = 0; i < max_elems; i++, env->vstart++) {
681             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
682             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
683         }
684     }
685 
686     env->vstart = 0;
687 }
688 
689 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
690 void HELPER(NAME)(void *vd, target_ulong base,       \
691                   CPURISCVState *env, uint32_t desc) \
692 {                                                    \
693     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
694                     ctzl(sizeof(ETYPE)), GETPC());   \
695 }
696 
697 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
698 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
699 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
700 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
701 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
702 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
703 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
704 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
705 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
706 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
707 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
708 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
709 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
710 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
711 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
712 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
713 
714 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
715 void HELPER(NAME)(void *vd, target_ulong base,       \
716                   CPURISCVState *env, uint32_t desc) \
717 {                                                    \
718     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
719                     ctzl(sizeof(ETYPE)), GETPC());   \
720 }
721 
722 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
723 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
724 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
725 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
726 
727 /*
728  * Vector Integer Arithmetic Instructions
729  */
730 
731 /* expand macro args before macro */
732 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
733 
734 /* (TD, T1, T2, TX1, TX2) */
735 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
736 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
737 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
738 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
739 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
740 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
741 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
742 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
743 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
744 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
745 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
746 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
747 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
748 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
749 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
750 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
751 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
752 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
753 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
754 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
755 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
756 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
757 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
758 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
759 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
760 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
761 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
762 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
763 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
764 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
765 
766 /* operation of two vector elements */
767 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
768 
769 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
770 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
771 {                                                               \
772     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
773     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
774     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
775 }
776 #define DO_SUB(N, M) (N - M)
777 #define DO_RSUB(N, M) (M - N)
778 
779 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
780 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
781 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
782 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
783 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
784 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
785 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
786 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
787 
788 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
789                        CPURISCVState *env, uint32_t desc,
790                        opivv2_fn *fn, uint32_t esz)
791 {
792     uint32_t vm = vext_vm(desc);
793     uint32_t vl = env->vl;
794     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
795     uint32_t vta = vext_vta(desc);
796     uint32_t vma = vext_vma(desc);
797     uint32_t i;
798 
799     for (i = env->vstart; i < vl; i++) {
800         if (!vm && !vext_elem_mask(v0, i)) {
801             /* set masked-off elements to 1s */
802             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
803             continue;
804         }
805         fn(vd, vs1, vs2, i);
806     }
807     env->vstart = 0;
808     /* set tail elements to 1s */
809     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
810 }
811 
812 /* generate the helpers for OPIVV */
813 #define GEN_VEXT_VV(NAME, ESZ)                            \
814 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
815                   void *vs2, CPURISCVState *env,          \
816                   uint32_t desc)                          \
817 {                                                         \
818     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
819                do_##NAME, ESZ);                           \
820 }
821 
822 GEN_VEXT_VV(vadd_vv_b, 1)
823 GEN_VEXT_VV(vadd_vv_h, 2)
824 GEN_VEXT_VV(vadd_vv_w, 4)
825 GEN_VEXT_VV(vadd_vv_d, 8)
826 GEN_VEXT_VV(vsub_vv_b, 1)
827 GEN_VEXT_VV(vsub_vv_h, 2)
828 GEN_VEXT_VV(vsub_vv_w, 4)
829 GEN_VEXT_VV(vsub_vv_d, 8)
830 
831 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
832 
833 /*
834  * (T1)s1 gives the real operator type.
835  * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
836  */
837 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
838 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
839 {                                                                   \
840     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
841     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
842 }
843 
844 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
845 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
846 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
847 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
848 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
849 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
850 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
851 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
852 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
853 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
854 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
855 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
856 
857 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
858                        CPURISCVState *env, uint32_t desc,
859                        opivx2_fn fn, uint32_t esz)
860 {
861     uint32_t vm = vext_vm(desc);
862     uint32_t vl = env->vl;
863     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
864     uint32_t vta = vext_vta(desc);
865     uint32_t vma = vext_vma(desc);
866     uint32_t i;
867 
868     for (i = env->vstart; i < vl; i++) {
869         if (!vm && !vext_elem_mask(v0, i)) {
870             /* set masked-off elements to 1s */
871             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
872             continue;
873         }
874         fn(vd, s1, vs2, i);
875     }
876     env->vstart = 0;
877     /* set tail elements to 1s */
878     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
879 }
880 
881 /* generate the helpers for OPIVX */
882 #define GEN_VEXT_VX(NAME, ESZ)                            \
883 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
884                   void *vs2, CPURISCVState *env,          \
885                   uint32_t desc)                          \
886 {                                                         \
887     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
888                do_##NAME, ESZ);                           \
889 }
890 
891 GEN_VEXT_VX(vadd_vx_b, 1)
892 GEN_VEXT_VX(vadd_vx_h, 2)
893 GEN_VEXT_VX(vadd_vx_w, 4)
894 GEN_VEXT_VX(vadd_vx_d, 8)
895 GEN_VEXT_VX(vsub_vx_b, 1)
896 GEN_VEXT_VX(vsub_vx_h, 2)
897 GEN_VEXT_VX(vsub_vx_w, 4)
898 GEN_VEXT_VX(vsub_vx_d, 8)
899 GEN_VEXT_VX(vrsub_vx_b, 1)
900 GEN_VEXT_VX(vrsub_vx_h, 2)
901 GEN_VEXT_VX(vrsub_vx_w, 4)
902 GEN_VEXT_VX(vrsub_vx_d, 8)
903 
904 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
905 {
906     intptr_t oprsz = simd_oprsz(desc);
907     intptr_t i;
908 
909     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
910         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
911     }
912 }
913 
914 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
915 {
916     intptr_t oprsz = simd_oprsz(desc);
917     intptr_t i;
918 
919     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
920         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
921     }
922 }
923 
924 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
925 {
926     intptr_t oprsz = simd_oprsz(desc);
927     intptr_t i;
928 
929     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
930         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
931     }
932 }
933 
934 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
935 {
936     intptr_t oprsz = simd_oprsz(desc);
937     intptr_t i;
938 
939     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
940         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
941     }
942 }
943 
944 /* Vector Widening Integer Add/Subtract */
945 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
946 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
947 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
948 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
949 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
950 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
951 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
952 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
953 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
954 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
955 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
956 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
957 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
958 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
959 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
960 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
961 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
962 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
963 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
964 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
965 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
966 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
967 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
968 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
969 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
970 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
971 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
972 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
973 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
974 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
975 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
976 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
977 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
978 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
979 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
980 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
981 GEN_VEXT_VV(vwaddu_vv_b, 2)
982 GEN_VEXT_VV(vwaddu_vv_h, 4)
983 GEN_VEXT_VV(vwaddu_vv_w, 8)
984 GEN_VEXT_VV(vwsubu_vv_b, 2)
985 GEN_VEXT_VV(vwsubu_vv_h, 4)
986 GEN_VEXT_VV(vwsubu_vv_w, 8)
987 GEN_VEXT_VV(vwadd_vv_b, 2)
988 GEN_VEXT_VV(vwadd_vv_h, 4)
989 GEN_VEXT_VV(vwadd_vv_w, 8)
990 GEN_VEXT_VV(vwsub_vv_b, 2)
991 GEN_VEXT_VV(vwsub_vv_h, 4)
992 GEN_VEXT_VV(vwsub_vv_w, 8)
993 GEN_VEXT_VV(vwaddu_wv_b, 2)
994 GEN_VEXT_VV(vwaddu_wv_h, 4)
995 GEN_VEXT_VV(vwaddu_wv_w, 8)
996 GEN_VEXT_VV(vwsubu_wv_b, 2)
997 GEN_VEXT_VV(vwsubu_wv_h, 4)
998 GEN_VEXT_VV(vwsubu_wv_w, 8)
999 GEN_VEXT_VV(vwadd_wv_b, 2)
1000 GEN_VEXT_VV(vwadd_wv_h, 4)
1001 GEN_VEXT_VV(vwadd_wv_w, 8)
1002 GEN_VEXT_VV(vwsub_wv_b, 2)
1003 GEN_VEXT_VV(vwsub_wv_h, 4)
1004 GEN_VEXT_VV(vwsub_wv_w, 8)
1005 
1006 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1007 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1008 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1009 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1010 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1011 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1012 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1013 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1014 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1015 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1016 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1017 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1018 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1019 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1020 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1021 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1022 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1023 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1024 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1025 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1026 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1027 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1028 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1029 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1030 GEN_VEXT_VX(vwaddu_vx_b, 2)
1031 GEN_VEXT_VX(vwaddu_vx_h, 4)
1032 GEN_VEXT_VX(vwaddu_vx_w, 8)
1033 GEN_VEXT_VX(vwsubu_vx_b, 2)
1034 GEN_VEXT_VX(vwsubu_vx_h, 4)
1035 GEN_VEXT_VX(vwsubu_vx_w, 8)
1036 GEN_VEXT_VX(vwadd_vx_b, 2)
1037 GEN_VEXT_VX(vwadd_vx_h, 4)
1038 GEN_VEXT_VX(vwadd_vx_w, 8)
1039 GEN_VEXT_VX(vwsub_vx_b, 2)
1040 GEN_VEXT_VX(vwsub_vx_h, 4)
1041 GEN_VEXT_VX(vwsub_vx_w, 8)
1042 GEN_VEXT_VX(vwaddu_wx_b, 2)
1043 GEN_VEXT_VX(vwaddu_wx_h, 4)
1044 GEN_VEXT_VX(vwaddu_wx_w, 8)
1045 GEN_VEXT_VX(vwsubu_wx_b, 2)
1046 GEN_VEXT_VX(vwsubu_wx_h, 4)
1047 GEN_VEXT_VX(vwsubu_wx_w, 8)
1048 GEN_VEXT_VX(vwadd_wx_b, 2)
1049 GEN_VEXT_VX(vwadd_wx_h, 4)
1050 GEN_VEXT_VX(vwadd_wx_w, 8)
1051 GEN_VEXT_VX(vwsub_wx_b, 2)
1052 GEN_VEXT_VX(vwsub_wx_h, 4)
1053 GEN_VEXT_VX(vwsub_wx_w, 8)
1054 
1055 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1056 #define DO_VADC(N, M, C) (N + M + C)
1057 #define DO_VSBC(N, M, C) (N - M - C)
1058 
1059 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1060 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1061                   CPURISCVState *env, uint32_t desc)          \
1062 {                                                             \
1063     uint32_t vl = env->vl;                                    \
1064     uint32_t esz = sizeof(ETYPE);                             \
1065     uint32_t total_elems =                                    \
1066         vext_get_total_elems(env, desc, esz);                 \
1067     uint32_t vta = vext_vta(desc);                            \
1068     uint32_t i;                                               \
1069                                                               \
1070     for (i = env->vstart; i < vl; i++) {                      \
1071         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1072         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1073         ETYPE carry = vext_elem_mask(v0, i);                  \
1074                                                               \
1075         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1076     }                                                         \
1077     env->vstart = 0;                                          \
1078     /* set tail elements to 1s */                             \
1079     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1080 }
1081 
1082 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1083 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1084 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1085 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1086 
1087 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1088 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1089 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1090 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1091 
1092 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1093 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1094                   CPURISCVState *env, uint32_t desc)                     \
1095 {                                                                        \
1096     uint32_t vl = env->vl;                                               \
1097     uint32_t esz = sizeof(ETYPE);                                        \
1098     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1099     uint32_t vta = vext_vta(desc);                                       \
1100     uint32_t i;                                                          \
1101                                                                          \
1102     for (i = env->vstart; i < vl; i++) {                                 \
1103         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1104         ETYPE carry = vext_elem_mask(v0, i);                             \
1105                                                                          \
1106         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1107     }                                                                    \
1108     env->vstart = 0;                                                     \
1109     /* set tail elements to 1s */                                        \
1110     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1111 }
1112 
1113 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1114 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1115 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1116 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1117 
1118 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1119 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1120 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1121 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1122 
1123 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1124                           (__typeof(N))(N + M) < N)
1125 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
1126 
1127 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1128 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1129                   CPURISCVState *env, uint32_t desc)          \
1130 {                                                             \
1131     uint32_t vl = env->vl;                                    \
1132     uint32_t vm = vext_vm(desc);                              \
1133     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1134     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1135     uint32_t i;                                               \
1136                                                               \
1137     for (i = env->vstart; i < vl; i++) {                      \
1138         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1139         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1140         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1141         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1142     }                                                         \
1143     env->vstart = 0;                                          \
1144     /*
1145      * mask destination register are always tail-agnostic
1146      * set tail elements to 1s
1147      */                                                       \
1148     if (vta_all_1s) {                                         \
1149         for (; i < total_elems; i++) {                        \
1150             vext_set_elem_mask(vd, i, 1);                     \
1151         }                                                     \
1152     }                                                         \
1153 }
1154 
1155 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1156 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1157 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1158 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1159 
1160 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1161 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1162 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1163 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1164 
1165 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1166 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1167                   void *vs2, CPURISCVState *env, uint32_t desc) \
1168 {                                                               \
1169     uint32_t vl = env->vl;                                      \
1170     uint32_t vm = vext_vm(desc);                                \
1171     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
1172     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1173     uint32_t i;                                                 \
1174                                                                 \
1175     for (i = env->vstart; i < vl; i++) {                        \
1176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1177         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1178         vext_set_elem_mask(vd, i,                               \
1179                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1180     }                                                           \
1181     env->vstart = 0;                                            \
1182     /*
1183      * mask destination register are always tail-agnostic
1184      * set tail elements to 1s
1185      */                                                         \
1186     if (vta_all_1s) {                                           \
1187         for (; i < total_elems; i++) {                          \
1188             vext_set_elem_mask(vd, i, 1);                       \
1189         }                                                       \
1190     }                                                           \
1191 }
1192 
1193 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1194 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1195 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1196 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1197 
1198 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1199 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1200 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1201 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1202 
1203 /* Vector Bitwise Logical Instructions */
1204 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1205 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1206 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1207 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1208 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1209 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1210 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1211 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1212 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1213 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1214 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1215 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1216 GEN_VEXT_VV(vand_vv_b, 1)
1217 GEN_VEXT_VV(vand_vv_h, 2)
1218 GEN_VEXT_VV(vand_vv_w, 4)
1219 GEN_VEXT_VV(vand_vv_d, 8)
1220 GEN_VEXT_VV(vor_vv_b, 1)
1221 GEN_VEXT_VV(vor_vv_h, 2)
1222 GEN_VEXT_VV(vor_vv_w, 4)
1223 GEN_VEXT_VV(vor_vv_d, 8)
1224 GEN_VEXT_VV(vxor_vv_b, 1)
1225 GEN_VEXT_VV(vxor_vv_h, 2)
1226 GEN_VEXT_VV(vxor_vv_w, 4)
1227 GEN_VEXT_VV(vxor_vv_d, 8)
1228 
1229 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1230 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1231 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1232 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1233 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1234 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1235 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1236 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1237 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1238 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1239 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1240 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1241 GEN_VEXT_VX(vand_vx_b, 1)
1242 GEN_VEXT_VX(vand_vx_h, 2)
1243 GEN_VEXT_VX(vand_vx_w, 4)
1244 GEN_VEXT_VX(vand_vx_d, 8)
1245 GEN_VEXT_VX(vor_vx_b, 1)
1246 GEN_VEXT_VX(vor_vx_h, 2)
1247 GEN_VEXT_VX(vor_vx_w, 4)
1248 GEN_VEXT_VX(vor_vx_d, 8)
1249 GEN_VEXT_VX(vxor_vx_b, 1)
1250 GEN_VEXT_VX(vxor_vx_h, 2)
1251 GEN_VEXT_VX(vxor_vx_w, 4)
1252 GEN_VEXT_VX(vxor_vx_d, 8)
1253 
1254 /* Vector Single-Width Bit Shift Instructions */
1255 #define DO_SLL(N, M)  (N << (M))
1256 #define DO_SRL(N, M)  (N >> (M))
1257 
1258 /* generate the helpers for shift instructions with two vector operators */
1259 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1260 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1261                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1262 {                                                                         \
1263     uint32_t vm = vext_vm(desc);                                          \
1264     uint32_t vl = env->vl;                                                \
1265     uint32_t esz = sizeof(TS1);                                           \
1266     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1267     uint32_t vta = vext_vta(desc);                                        \
1268     uint32_t vma = vext_vma(desc);                                        \
1269     uint32_t i;                                                           \
1270                                                                           \
1271     for (i = env->vstart; i < vl; i++) {                                  \
1272         if (!vm && !vext_elem_mask(v0, i)) {                              \
1273             /* set masked-off elements to 1s */                           \
1274             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1275             continue;                                                     \
1276         }                                                                 \
1277         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1278         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1279         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1280     }                                                                     \
1281     env->vstart = 0;                                                      \
1282     /* set tail elements to 1s */                                         \
1283     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1284 }
1285 
1286 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1287 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1288 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1289 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1290 
1291 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1292 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1293 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1294 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1295 
1296 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1297 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1298 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1299 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1300 
1301 /*
1302  * generate the helpers for shift instructions with one vector and one scalar
1303  */
1304 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1305 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1306                   void *vs2, CPURISCVState *env,            \
1307                   uint32_t desc)                            \
1308 {                                                           \
1309     uint32_t vm = vext_vm(desc);                            \
1310     uint32_t vl = env->vl;                                  \
1311     uint32_t esz = sizeof(TD);                              \
1312     uint32_t total_elems =                                  \
1313         vext_get_total_elems(env, desc, esz);               \
1314     uint32_t vta = vext_vta(desc);                          \
1315     uint32_t vma = vext_vma(desc);                          \
1316     uint32_t i;                                             \
1317                                                             \
1318     for (i = env->vstart; i < vl; i++) {                    \
1319         if (!vm && !vext_elem_mask(v0, i)) {                \
1320             /* set masked-off elements to 1s */             \
1321             vext_set_elems_1s(vd, vma, i * esz,             \
1322                               (i + 1) * esz);               \
1323             continue;                                       \
1324         }                                                   \
1325         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1326         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1327     }                                                       \
1328     env->vstart = 0;                                        \
1329     /* set tail elements to 1s */                           \
1330     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1331 }
1332 
1333 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1334 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1335 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1336 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1337 
1338 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1339 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1341 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1342 
1343 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1344 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1345 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1346 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1347 
1348 /* Vector Narrowing Integer Right Shift Instructions */
1349 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1350 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1351 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1352 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1353 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1354 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1355 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1356 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1357 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1358 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1359 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1360 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1361 
1362 /* Vector Integer Comparison Instructions */
1363 #define DO_MSEQ(N, M) (N == M)
1364 #define DO_MSNE(N, M) (N != M)
1365 #define DO_MSLT(N, M) (N < M)
1366 #define DO_MSLE(N, M) (N <= M)
1367 #define DO_MSGT(N, M) (N > M)
1368 
1369 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1370 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1371                   CPURISCVState *env, uint32_t desc)          \
1372 {                                                             \
1373     uint32_t vm = vext_vm(desc);                              \
1374     uint32_t vl = env->vl;                                    \
1375     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1376     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1377     uint32_t vma = vext_vma(desc);                            \
1378     uint32_t i;                                               \
1379                                                               \
1380     for (i = env->vstart; i < vl; i++) {                      \
1381         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1382         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1383         if (!vm && !vext_elem_mask(v0, i)) {                  \
1384             /* set masked-off elements to 1s */               \
1385             if (vma) {                                        \
1386                 vext_set_elem_mask(vd, i, 1);                 \
1387             }                                                 \
1388             continue;                                         \
1389         }                                                     \
1390         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1391     }                                                         \
1392     env->vstart = 0;                                          \
1393     /*
1394      * mask destination register are always tail-agnostic
1395      * set tail elements to 1s
1396      */                                                       \
1397     if (vta_all_1s) {                                         \
1398         for (; i < total_elems; i++) {                        \
1399             vext_set_elem_mask(vd, i, 1);                     \
1400         }                                                     \
1401     }                                                         \
1402 }
1403 
1404 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1405 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1406 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1407 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1408 
1409 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1410 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1411 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1412 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1413 
1414 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1415 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1416 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1417 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1418 
1419 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1420 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1421 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1422 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1423 
1424 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1425 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1426 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1427 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1428 
1429 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1430 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1431 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1432 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1433 
1434 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1435 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1436                   CPURISCVState *env, uint32_t desc)                \
1437 {                                                                   \
1438     uint32_t vm = vext_vm(desc);                                    \
1439     uint32_t vl = env->vl;                                          \
1440     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1441     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1442     uint32_t vma = vext_vma(desc);                                  \
1443     uint32_t i;                                                     \
1444                                                                     \
1445     for (i = env->vstart; i < vl; i++) {                            \
1446         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1447         if (!vm && !vext_elem_mask(v0, i)) {                        \
1448             /* set masked-off elements to 1s */                     \
1449             if (vma) {                                              \
1450                 vext_set_elem_mask(vd, i, 1);                       \
1451             }                                                       \
1452             continue;                                               \
1453         }                                                           \
1454         vext_set_elem_mask(vd, i,                                   \
1455                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1456     }                                                               \
1457     env->vstart = 0;                                                \
1458     /*
1459      * mask destination register are always tail-agnostic
1460      * set tail elements to 1s
1461      */                                                             \
1462     if (vta_all_1s) {                                               \
1463         for (; i < total_elems; i++) {                              \
1464             vext_set_elem_mask(vd, i, 1);                           \
1465         }                                                           \
1466     }                                                               \
1467 }
1468 
1469 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1470 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1471 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1472 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1473 
1474 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1475 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1476 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1477 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1478 
1479 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1480 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1481 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1482 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1483 
1484 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1485 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1486 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1487 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1488 
1489 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1490 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1491 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1492 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1493 
1494 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1495 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1496 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1497 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1498 
1499 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1500 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1501 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1502 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1503 
1504 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1505 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1506 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1507 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1508 
1509 /* Vector Integer Min/Max Instructions */
1510 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1511 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1512 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1513 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1514 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1515 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1516 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1517 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1518 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1519 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1520 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1521 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1522 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1523 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1524 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1525 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1526 GEN_VEXT_VV(vminu_vv_b, 1)
1527 GEN_VEXT_VV(vminu_vv_h, 2)
1528 GEN_VEXT_VV(vminu_vv_w, 4)
1529 GEN_VEXT_VV(vminu_vv_d, 8)
1530 GEN_VEXT_VV(vmin_vv_b, 1)
1531 GEN_VEXT_VV(vmin_vv_h, 2)
1532 GEN_VEXT_VV(vmin_vv_w, 4)
1533 GEN_VEXT_VV(vmin_vv_d, 8)
1534 GEN_VEXT_VV(vmaxu_vv_b, 1)
1535 GEN_VEXT_VV(vmaxu_vv_h, 2)
1536 GEN_VEXT_VV(vmaxu_vv_w, 4)
1537 GEN_VEXT_VV(vmaxu_vv_d, 8)
1538 GEN_VEXT_VV(vmax_vv_b, 1)
1539 GEN_VEXT_VV(vmax_vv_h, 2)
1540 GEN_VEXT_VV(vmax_vv_w, 4)
1541 GEN_VEXT_VV(vmax_vv_d, 8)
1542 
1543 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1544 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1545 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1546 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1547 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1548 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1549 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1550 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1551 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1552 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1553 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1554 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1555 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1556 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1557 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1558 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1559 GEN_VEXT_VX(vminu_vx_b, 1)
1560 GEN_VEXT_VX(vminu_vx_h, 2)
1561 GEN_VEXT_VX(vminu_vx_w, 4)
1562 GEN_VEXT_VX(vminu_vx_d, 8)
1563 GEN_VEXT_VX(vmin_vx_b, 1)
1564 GEN_VEXT_VX(vmin_vx_h, 2)
1565 GEN_VEXT_VX(vmin_vx_w, 4)
1566 GEN_VEXT_VX(vmin_vx_d, 8)
1567 GEN_VEXT_VX(vmaxu_vx_b, 1)
1568 GEN_VEXT_VX(vmaxu_vx_h, 2)
1569 GEN_VEXT_VX(vmaxu_vx_w, 4)
1570 GEN_VEXT_VX(vmaxu_vx_d, 8)
1571 GEN_VEXT_VX(vmax_vx_b, 1)
1572 GEN_VEXT_VX(vmax_vx_h, 2)
1573 GEN_VEXT_VX(vmax_vx_w, 4)
1574 GEN_VEXT_VX(vmax_vx_d, 8)
1575 
1576 /* Vector Single-Width Integer Multiply Instructions */
1577 #define DO_MUL(N, M) (N * M)
1578 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1579 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1580 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1581 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1582 GEN_VEXT_VV(vmul_vv_b, 1)
1583 GEN_VEXT_VV(vmul_vv_h, 2)
1584 GEN_VEXT_VV(vmul_vv_w, 4)
1585 GEN_VEXT_VV(vmul_vv_d, 8)
1586 
1587 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1588 {
1589     return (int16_t)s2 * (int16_t)s1 >> 8;
1590 }
1591 
1592 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1593 {
1594     return (int32_t)s2 * (int32_t)s1 >> 16;
1595 }
1596 
1597 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1598 {
1599     return (int64_t)s2 * (int64_t)s1 >> 32;
1600 }
1601 
1602 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1603 {
1604     uint64_t hi_64, lo_64;
1605 
1606     muls64(&lo_64, &hi_64, s1, s2);
1607     return hi_64;
1608 }
1609 
1610 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1611 {
1612     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1613 }
1614 
1615 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1616 {
1617     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1618 }
1619 
1620 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1621 {
1622     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1623 }
1624 
1625 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1626 {
1627     uint64_t hi_64, lo_64;
1628 
1629     mulu64(&lo_64, &hi_64, s2, s1);
1630     return hi_64;
1631 }
1632 
1633 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1634 {
1635     return (int16_t)s2 * (uint16_t)s1 >> 8;
1636 }
1637 
1638 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1639 {
1640     return (int32_t)s2 * (uint32_t)s1 >> 16;
1641 }
1642 
1643 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1644 {
1645     return (int64_t)s2 * (uint64_t)s1 >> 32;
1646 }
1647 
1648 /*
1649  * Let  A = signed operand,
1650  *      B = unsigned operand
1651  *      P = mulu64(A, B), unsigned product
1652  *
1653  * LET  X = 2 ** 64  - A, 2's complement of A
1654  *      SP = signed product
1655  * THEN
1656  *      IF A < 0
1657  *          SP = -X * B
1658  *             = -(2 ** 64 - A) * B
1659  *             = A * B - 2 ** 64 * B
1660  *             = P - 2 ** 64 * B
1661  *      ELSE
1662  *          SP = P
1663  * THEN
1664  *      HI_P -= (A < 0 ? B : 0)
1665  */
1666 
1667 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1668 {
1669     uint64_t hi_64, lo_64;
1670 
1671     mulu64(&lo_64, &hi_64, s2, s1);
1672 
1673     hi_64 -= s2 < 0 ? s1 : 0;
1674     return hi_64;
1675 }
1676 
1677 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1678 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1679 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1680 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1681 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1682 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1683 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1684 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1685 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1686 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1687 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1688 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1689 GEN_VEXT_VV(vmulh_vv_b, 1)
1690 GEN_VEXT_VV(vmulh_vv_h, 2)
1691 GEN_VEXT_VV(vmulh_vv_w, 4)
1692 GEN_VEXT_VV(vmulh_vv_d, 8)
1693 GEN_VEXT_VV(vmulhu_vv_b, 1)
1694 GEN_VEXT_VV(vmulhu_vv_h, 2)
1695 GEN_VEXT_VV(vmulhu_vv_w, 4)
1696 GEN_VEXT_VV(vmulhu_vv_d, 8)
1697 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1698 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1699 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1700 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1701 
1702 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1703 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1704 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1705 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1706 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1707 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1708 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1709 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1710 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1711 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1712 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1713 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1714 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1715 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1716 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1717 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1718 GEN_VEXT_VX(vmul_vx_b, 1)
1719 GEN_VEXT_VX(vmul_vx_h, 2)
1720 GEN_VEXT_VX(vmul_vx_w, 4)
1721 GEN_VEXT_VX(vmul_vx_d, 8)
1722 GEN_VEXT_VX(vmulh_vx_b, 1)
1723 GEN_VEXT_VX(vmulh_vx_h, 2)
1724 GEN_VEXT_VX(vmulh_vx_w, 4)
1725 GEN_VEXT_VX(vmulh_vx_d, 8)
1726 GEN_VEXT_VX(vmulhu_vx_b, 1)
1727 GEN_VEXT_VX(vmulhu_vx_h, 2)
1728 GEN_VEXT_VX(vmulhu_vx_w, 4)
1729 GEN_VEXT_VX(vmulhu_vx_d, 8)
1730 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1731 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1732 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1733 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1734 
1735 /* Vector Integer Divide Instructions */
1736 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1737 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1738 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1739         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1740 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1741         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1742 
1743 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1744 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1745 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1746 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1747 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1748 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1749 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1750 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1751 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1752 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1753 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1754 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1755 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1756 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1757 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1758 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1759 GEN_VEXT_VV(vdivu_vv_b, 1)
1760 GEN_VEXT_VV(vdivu_vv_h, 2)
1761 GEN_VEXT_VV(vdivu_vv_w, 4)
1762 GEN_VEXT_VV(vdivu_vv_d, 8)
1763 GEN_VEXT_VV(vdiv_vv_b, 1)
1764 GEN_VEXT_VV(vdiv_vv_h, 2)
1765 GEN_VEXT_VV(vdiv_vv_w, 4)
1766 GEN_VEXT_VV(vdiv_vv_d, 8)
1767 GEN_VEXT_VV(vremu_vv_b, 1)
1768 GEN_VEXT_VV(vremu_vv_h, 2)
1769 GEN_VEXT_VV(vremu_vv_w, 4)
1770 GEN_VEXT_VV(vremu_vv_d, 8)
1771 GEN_VEXT_VV(vrem_vv_b, 1)
1772 GEN_VEXT_VV(vrem_vv_h, 2)
1773 GEN_VEXT_VV(vrem_vv_w, 4)
1774 GEN_VEXT_VV(vrem_vv_d, 8)
1775 
1776 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1777 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1778 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1779 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1780 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1781 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1782 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1783 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1784 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1785 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1786 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1787 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1788 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1789 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1790 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1791 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1792 GEN_VEXT_VX(vdivu_vx_b, 1)
1793 GEN_VEXT_VX(vdivu_vx_h, 2)
1794 GEN_VEXT_VX(vdivu_vx_w, 4)
1795 GEN_VEXT_VX(vdivu_vx_d, 8)
1796 GEN_VEXT_VX(vdiv_vx_b, 1)
1797 GEN_VEXT_VX(vdiv_vx_h, 2)
1798 GEN_VEXT_VX(vdiv_vx_w, 4)
1799 GEN_VEXT_VX(vdiv_vx_d, 8)
1800 GEN_VEXT_VX(vremu_vx_b, 1)
1801 GEN_VEXT_VX(vremu_vx_h, 2)
1802 GEN_VEXT_VX(vremu_vx_w, 4)
1803 GEN_VEXT_VX(vremu_vx_d, 8)
1804 GEN_VEXT_VX(vrem_vx_b, 1)
1805 GEN_VEXT_VX(vrem_vx_h, 2)
1806 GEN_VEXT_VX(vrem_vx_w, 4)
1807 GEN_VEXT_VX(vrem_vx_d, 8)
1808 
1809 /* Vector Widening Integer Multiply Instructions */
1810 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1811 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1812 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1813 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1814 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1815 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1816 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1817 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1818 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1819 GEN_VEXT_VV(vwmul_vv_b, 2)
1820 GEN_VEXT_VV(vwmul_vv_h, 4)
1821 GEN_VEXT_VV(vwmul_vv_w, 8)
1822 GEN_VEXT_VV(vwmulu_vv_b, 2)
1823 GEN_VEXT_VV(vwmulu_vv_h, 4)
1824 GEN_VEXT_VV(vwmulu_vv_w, 8)
1825 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1826 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1827 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1828 
1829 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1830 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1831 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1832 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1833 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1834 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1835 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1836 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1837 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1838 GEN_VEXT_VX(vwmul_vx_b, 2)
1839 GEN_VEXT_VX(vwmul_vx_h, 4)
1840 GEN_VEXT_VX(vwmul_vx_w, 8)
1841 GEN_VEXT_VX(vwmulu_vx_b, 2)
1842 GEN_VEXT_VX(vwmulu_vx_h, 4)
1843 GEN_VEXT_VX(vwmulu_vx_w, 8)
1844 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1845 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1846 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1847 
1848 /* Vector Single-Width Integer Multiply-Add Instructions */
1849 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1850 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1851 {                                                                  \
1852     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1853     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1854     TD d = *((TD *)vd + HD(i));                                    \
1855     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1856 }
1857 
1858 #define DO_MACC(N, M, D) (M * N + D)
1859 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1860 #define DO_MADD(N, M, D) (M * D + N)
1861 #define DO_NMSUB(N, M, D) (-(M * D) + N)
1862 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1863 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1864 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1865 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1866 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1867 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1868 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1869 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1870 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1871 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1872 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1873 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1874 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1875 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1876 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1877 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1878 GEN_VEXT_VV(vmacc_vv_b, 1)
1879 GEN_VEXT_VV(vmacc_vv_h, 2)
1880 GEN_VEXT_VV(vmacc_vv_w, 4)
1881 GEN_VEXT_VV(vmacc_vv_d, 8)
1882 GEN_VEXT_VV(vnmsac_vv_b, 1)
1883 GEN_VEXT_VV(vnmsac_vv_h, 2)
1884 GEN_VEXT_VV(vnmsac_vv_w, 4)
1885 GEN_VEXT_VV(vnmsac_vv_d, 8)
1886 GEN_VEXT_VV(vmadd_vv_b, 1)
1887 GEN_VEXT_VV(vmadd_vv_h, 2)
1888 GEN_VEXT_VV(vmadd_vv_w, 4)
1889 GEN_VEXT_VV(vmadd_vv_d, 8)
1890 GEN_VEXT_VV(vnmsub_vv_b, 1)
1891 GEN_VEXT_VV(vnmsub_vv_h, 2)
1892 GEN_VEXT_VV(vnmsub_vv_w, 4)
1893 GEN_VEXT_VV(vnmsub_vv_d, 8)
1894 
1895 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1896 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1897 {                                                                   \
1898     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1899     TD d = *((TD *)vd + HD(i));                                     \
1900     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1901 }
1902 
1903 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1904 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1905 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1906 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1907 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1908 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1909 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1910 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1911 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1912 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1913 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1914 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1915 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1916 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1917 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1918 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1919 GEN_VEXT_VX(vmacc_vx_b, 1)
1920 GEN_VEXT_VX(vmacc_vx_h, 2)
1921 GEN_VEXT_VX(vmacc_vx_w, 4)
1922 GEN_VEXT_VX(vmacc_vx_d, 8)
1923 GEN_VEXT_VX(vnmsac_vx_b, 1)
1924 GEN_VEXT_VX(vnmsac_vx_h, 2)
1925 GEN_VEXT_VX(vnmsac_vx_w, 4)
1926 GEN_VEXT_VX(vnmsac_vx_d, 8)
1927 GEN_VEXT_VX(vmadd_vx_b, 1)
1928 GEN_VEXT_VX(vmadd_vx_h, 2)
1929 GEN_VEXT_VX(vmadd_vx_w, 4)
1930 GEN_VEXT_VX(vmadd_vx_d, 8)
1931 GEN_VEXT_VX(vnmsub_vx_b, 1)
1932 GEN_VEXT_VX(vnmsub_vx_h, 2)
1933 GEN_VEXT_VX(vnmsub_vx_w, 4)
1934 GEN_VEXT_VX(vnmsub_vx_d, 8)
1935 
1936 /* Vector Widening Integer Multiply-Add Instructions */
1937 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1938 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1939 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1940 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1941 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1942 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1943 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1944 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1945 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1946 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1947 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1948 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1949 GEN_VEXT_VV(vwmacc_vv_b, 2)
1950 GEN_VEXT_VV(vwmacc_vv_h, 4)
1951 GEN_VEXT_VV(vwmacc_vv_w, 8)
1952 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1953 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1954 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1955 
1956 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1957 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1958 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1959 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1960 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1961 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1962 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1963 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1964 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1965 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1968 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1969 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1970 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1971 GEN_VEXT_VX(vwmacc_vx_b, 2)
1972 GEN_VEXT_VX(vwmacc_vx_h, 4)
1973 GEN_VEXT_VX(vwmacc_vx_w, 8)
1974 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1975 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1976 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1977 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1978 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1979 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1980 
1981 /* Vector Integer Merge and Move Instructions */
1982 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1983 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1984                   uint32_t desc)                                     \
1985 {                                                                    \
1986     uint32_t vl = env->vl;                                           \
1987     uint32_t esz = sizeof(ETYPE);                                    \
1988     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1989     uint32_t vta = vext_vta(desc);                                   \
1990     uint32_t i;                                                      \
1991                                                                      \
1992     for (i = env->vstart; i < vl; i++) {                             \
1993         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1994         *((ETYPE *)vd + H(i)) = s1;                                  \
1995     }                                                                \
1996     env->vstart = 0;                                                 \
1997     /* set tail elements to 1s */                                    \
1998     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1999 }
2000 
2001 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2002 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2003 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2004 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2005 
2006 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2007 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2008                   uint32_t desc)                                     \
2009 {                                                                    \
2010     uint32_t vl = env->vl;                                           \
2011     uint32_t esz = sizeof(ETYPE);                                    \
2012     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2013     uint32_t vta = vext_vta(desc);                                   \
2014     uint32_t i;                                                      \
2015                                                                      \
2016     for (i = env->vstart; i < vl; i++) {                             \
2017         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2018     }                                                                \
2019     env->vstart = 0;                                                 \
2020     /* set tail elements to 1s */                                    \
2021     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2022 }
2023 
2024 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2025 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2026 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2027 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2028 
2029 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2030 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2031                   CPURISCVState *env, uint32_t desc)                 \
2032 {                                                                    \
2033     uint32_t vl = env->vl;                                           \
2034     uint32_t esz = sizeof(ETYPE);                                    \
2035     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2036     uint32_t vta = vext_vta(desc);                                   \
2037     uint32_t i;                                                      \
2038                                                                      \
2039     for (i = env->vstart; i < vl; i++) {                             \
2040         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2041         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2042     }                                                                \
2043     env->vstart = 0;                                                 \
2044     /* set tail elements to 1s */                                    \
2045     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2046 }
2047 
2048 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2049 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2050 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2051 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2052 
2053 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2054 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2055                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2056 {                                                                    \
2057     uint32_t vl = env->vl;                                           \
2058     uint32_t esz = sizeof(ETYPE);                                    \
2059     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2060     uint32_t vta = vext_vta(desc);                                   \
2061     uint32_t i;                                                      \
2062                                                                      \
2063     for (i = env->vstart; i < vl; i++) {                             \
2064         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2065         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2066                    (ETYPE)(target_long)s1);                          \
2067         *((ETYPE *)vd + H(i)) = d;                                   \
2068     }                                                                \
2069     env->vstart = 0;                                                 \
2070     /* set tail elements to 1s */                                    \
2071     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2072 }
2073 
2074 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2075 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2076 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2077 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2078 
2079 /*
2080  * Vector Fixed-Point Arithmetic Instructions
2081  */
2082 
2083 /* Vector Single-Width Saturating Add and Subtract */
2084 
2085 /*
2086  * As fixed point instructions probably have round mode and saturation,
2087  * define common macros for fixed point here.
2088  */
2089 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2090                           CPURISCVState *env, int vxrm);
2091 
2092 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2093 static inline void                                                  \
2094 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2095           CPURISCVState *env, int vxrm)                             \
2096 {                                                                   \
2097     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2098     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2099     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2100 }
2101 
2102 static inline void
2103 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2104              CPURISCVState *env,
2105              uint32_t vl, uint32_t vm, int vxrm,
2106              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2107 {
2108     for (uint32_t i = env->vstart; i < vl; i++) {
2109         if (!vm && !vext_elem_mask(v0, i)) {
2110             /* set masked-off elements to 1s */
2111             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2112             continue;
2113         }
2114         fn(vd, vs1, vs2, i, env, vxrm);
2115     }
2116     env->vstart = 0;
2117 }
2118 
2119 static inline void
2120 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2121              CPURISCVState *env,
2122              uint32_t desc,
2123              opivv2_rm_fn *fn, uint32_t esz)
2124 {
2125     uint32_t vm = vext_vm(desc);
2126     uint32_t vl = env->vl;
2127     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2128     uint32_t vta = vext_vta(desc);
2129     uint32_t vma = vext_vma(desc);
2130 
2131     switch (env->vxrm) {
2132     case 0: /* rnu */
2133         vext_vv_rm_1(vd, v0, vs1, vs2,
2134                      env, vl, vm, 0, fn, vma, esz);
2135         break;
2136     case 1: /* rne */
2137         vext_vv_rm_1(vd, v0, vs1, vs2,
2138                      env, vl, vm, 1, fn, vma, esz);
2139         break;
2140     case 2: /* rdn */
2141         vext_vv_rm_1(vd, v0, vs1, vs2,
2142                      env, vl, vm, 2, fn, vma, esz);
2143         break;
2144     default: /* rod */
2145         vext_vv_rm_1(vd, v0, vs1, vs2,
2146                      env, vl, vm, 3, fn, vma, esz);
2147         break;
2148     }
2149     /* set tail elements to 1s */
2150     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2151 }
2152 
2153 /* generate helpers for fixed point instructions with OPIVV format */
2154 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2155 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2156                   CPURISCVState *env, uint32_t desc)            \
2157 {                                                               \
2158     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2159                  do_##NAME, ESZ);                               \
2160 }
2161 
2162 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2163                              uint8_t b)
2164 {
2165     uint8_t res = a + b;
2166     if (res < a) {
2167         res = UINT8_MAX;
2168         env->vxsat = 0x1;
2169     }
2170     return res;
2171 }
2172 
2173 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2174                                uint16_t b)
2175 {
2176     uint16_t res = a + b;
2177     if (res < a) {
2178         res = UINT16_MAX;
2179         env->vxsat = 0x1;
2180     }
2181     return res;
2182 }
2183 
2184 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2185                                uint32_t b)
2186 {
2187     uint32_t res = a + b;
2188     if (res < a) {
2189         res = UINT32_MAX;
2190         env->vxsat = 0x1;
2191     }
2192     return res;
2193 }
2194 
2195 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2196                                uint64_t b)
2197 {
2198     uint64_t res = a + b;
2199     if (res < a) {
2200         res = UINT64_MAX;
2201         env->vxsat = 0x1;
2202     }
2203     return res;
2204 }
2205 
2206 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2207 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2208 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2209 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2210 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2211 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2212 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2213 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2214 
2215 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2216                           CPURISCVState *env, int vxrm);
2217 
2218 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2219 static inline void                                                  \
2220 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2221           CPURISCVState *env, int vxrm)                             \
2222 {                                                                   \
2223     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2224     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2225 }
2226 
2227 static inline void
2228 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2229              CPURISCVState *env,
2230              uint32_t vl, uint32_t vm, int vxrm,
2231              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2232 {
2233     for (uint32_t i = env->vstart; i < vl; i++) {
2234         if (!vm && !vext_elem_mask(v0, i)) {
2235             /* set masked-off elements to 1s */
2236             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2237             continue;
2238         }
2239         fn(vd, s1, vs2, i, env, vxrm);
2240     }
2241     env->vstart = 0;
2242 }
2243 
2244 static inline void
2245 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2246              CPURISCVState *env,
2247              uint32_t desc,
2248              opivx2_rm_fn *fn, uint32_t esz)
2249 {
2250     uint32_t vm = vext_vm(desc);
2251     uint32_t vl = env->vl;
2252     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2253     uint32_t vta = vext_vta(desc);
2254     uint32_t vma = vext_vma(desc);
2255 
2256     switch (env->vxrm) {
2257     case 0: /* rnu */
2258         vext_vx_rm_1(vd, v0, s1, vs2,
2259                      env, vl, vm, 0, fn, vma, esz);
2260         break;
2261     case 1: /* rne */
2262         vext_vx_rm_1(vd, v0, s1, vs2,
2263                      env, vl, vm, 1, fn, vma, esz);
2264         break;
2265     case 2: /* rdn */
2266         vext_vx_rm_1(vd, v0, s1, vs2,
2267                      env, vl, vm, 2, fn, vma, esz);
2268         break;
2269     default: /* rod */
2270         vext_vx_rm_1(vd, v0, s1, vs2,
2271                      env, vl, vm, 3, fn, vma, esz);
2272         break;
2273     }
2274     /* set tail elements to 1s */
2275     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2276 }
2277 
2278 /* generate helpers for fixed point instructions with OPIVX format */
2279 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2280 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2281                   void *vs2, CPURISCVState *env,          \
2282                   uint32_t desc)                          \
2283 {                                                         \
2284     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2285                  do_##NAME, ESZ);                         \
2286 }
2287 
2288 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2289 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2290 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2291 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2292 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2293 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2294 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2295 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2296 
2297 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2298 {
2299     int8_t res = a + b;
2300     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2301         res = a > 0 ? INT8_MAX : INT8_MIN;
2302         env->vxsat = 0x1;
2303     }
2304     return res;
2305 }
2306 
2307 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2308                              int16_t b)
2309 {
2310     int16_t res = a + b;
2311     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2312         res = a > 0 ? INT16_MAX : INT16_MIN;
2313         env->vxsat = 0x1;
2314     }
2315     return res;
2316 }
2317 
2318 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2319                              int32_t b)
2320 {
2321     int32_t res = a + b;
2322     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2323         res = a > 0 ? INT32_MAX : INT32_MIN;
2324         env->vxsat = 0x1;
2325     }
2326     return res;
2327 }
2328 
2329 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2330                              int64_t b)
2331 {
2332     int64_t res = a + b;
2333     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2334         res = a > 0 ? INT64_MAX : INT64_MIN;
2335         env->vxsat = 0x1;
2336     }
2337     return res;
2338 }
2339 
2340 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2341 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2342 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2343 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2344 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2345 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2346 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2347 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2348 
2349 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2350 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2351 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2352 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2353 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2354 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2355 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2356 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2357 
2358 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2359                              uint8_t b)
2360 {
2361     uint8_t res = a - b;
2362     if (res > a) {
2363         res = 0;
2364         env->vxsat = 0x1;
2365     }
2366     return res;
2367 }
2368 
2369 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2370                                uint16_t b)
2371 {
2372     uint16_t res = a - b;
2373     if (res > a) {
2374         res = 0;
2375         env->vxsat = 0x1;
2376     }
2377     return res;
2378 }
2379 
2380 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2381                                uint32_t b)
2382 {
2383     uint32_t res = a - b;
2384     if (res > a) {
2385         res = 0;
2386         env->vxsat = 0x1;
2387     }
2388     return res;
2389 }
2390 
2391 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2392                                uint64_t b)
2393 {
2394     uint64_t res = a - b;
2395     if (res > a) {
2396         res = 0;
2397         env->vxsat = 0x1;
2398     }
2399     return res;
2400 }
2401 
2402 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2403 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2404 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2405 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2406 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2407 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2408 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2409 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2410 
2411 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2412 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2413 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2414 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2415 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2416 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2417 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2418 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2419 
2420 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2421 {
2422     int8_t res = a - b;
2423     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2424         res = a >= 0 ? INT8_MAX : INT8_MIN;
2425         env->vxsat = 0x1;
2426     }
2427     return res;
2428 }
2429 
2430 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2431                              int16_t b)
2432 {
2433     int16_t res = a - b;
2434     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2435         res = a >= 0 ? INT16_MAX : INT16_MIN;
2436         env->vxsat = 0x1;
2437     }
2438     return res;
2439 }
2440 
2441 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2442                              int32_t b)
2443 {
2444     int32_t res = a - b;
2445     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2446         res = a >= 0 ? INT32_MAX : INT32_MIN;
2447         env->vxsat = 0x1;
2448     }
2449     return res;
2450 }
2451 
2452 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2453                              int64_t b)
2454 {
2455     int64_t res = a - b;
2456     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2457         res = a >= 0 ? INT64_MAX : INT64_MIN;
2458         env->vxsat = 0x1;
2459     }
2460     return res;
2461 }
2462 
2463 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2464 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2465 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2466 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2467 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2468 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2469 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2470 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2471 
2472 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2473 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2474 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2475 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2476 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2477 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2478 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2479 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2480 
2481 /* Vector Single-Width Averaging Add and Subtract */
2482 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2483 {
2484     uint8_t d = extract64(v, shift, 1);
2485     uint8_t d1;
2486     uint64_t D1, D2;
2487 
2488     if (shift == 0 || shift > 64) {
2489         return 0;
2490     }
2491 
2492     d1 = extract64(v, shift - 1, 1);
2493     D1 = extract64(v, 0, shift);
2494     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2495         return d1;
2496     } else if (vxrm == 1) { /* round-to-nearest-even */
2497         if (shift > 1) {
2498             D2 = extract64(v, 0, shift - 1);
2499             return d1 & ((D2 != 0) | d);
2500         } else {
2501             return d1 & d;
2502         }
2503     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2504         return !d & (D1 != 0);
2505     }
2506     return 0; /* round-down (truncate) */
2507 }
2508 
2509 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2510                              int32_t b)
2511 {
2512     int64_t res = (int64_t)a + b;
2513     uint8_t round = get_round(vxrm, res, 1);
2514 
2515     return (res >> 1) + round;
2516 }
2517 
2518 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2519                              int64_t b)
2520 {
2521     int64_t res = a + b;
2522     uint8_t round = get_round(vxrm, res, 1);
2523     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2524 
2525     /* With signed overflow, bit 64 is inverse of bit 63. */
2526     return ((res >> 1) ^ over) + round;
2527 }
2528 
2529 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2530 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2531 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2532 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2533 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2534 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2535 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2536 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2537 
2538 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2539 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2540 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2541 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2542 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2543 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2544 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2545 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2546 
2547 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2548                                uint32_t a, uint32_t b)
2549 {
2550     uint64_t res = (uint64_t)a + b;
2551     uint8_t round = get_round(vxrm, res, 1);
2552 
2553     return (res >> 1) + round;
2554 }
2555 
2556 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2557                                uint64_t a, uint64_t b)
2558 {
2559     uint64_t res = a + b;
2560     uint8_t round = get_round(vxrm, res, 1);
2561     uint64_t over = (uint64_t)(res < a) << 63;
2562 
2563     return ((res >> 1) | over) + round;
2564 }
2565 
2566 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2567 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2568 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2569 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2570 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2571 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2572 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2573 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2574 
2575 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2576 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2577 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2578 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2579 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2580 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2581 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2582 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2583 
2584 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2585                              int32_t b)
2586 {
2587     int64_t res = (int64_t)a - b;
2588     uint8_t round = get_round(vxrm, res, 1);
2589 
2590     return (res >> 1) + round;
2591 }
2592 
2593 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2594                              int64_t b)
2595 {
2596     int64_t res = (int64_t)a - b;
2597     uint8_t round = get_round(vxrm, res, 1);
2598     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2599 
2600     /* With signed overflow, bit 64 is inverse of bit 63. */
2601     return ((res >> 1) ^ over) + round;
2602 }
2603 
2604 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2605 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2606 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2607 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2608 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2609 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2610 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2611 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2612 
2613 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2614 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2615 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2616 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2617 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2618 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2619 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2620 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2621 
2622 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2623                                uint32_t a, uint32_t b)
2624 {
2625     int64_t res = (int64_t)a - b;
2626     uint8_t round = get_round(vxrm, res, 1);
2627 
2628     return (res >> 1) + round;
2629 }
2630 
2631 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2632                                uint64_t a, uint64_t b)
2633 {
2634     uint64_t res = (uint64_t)a - b;
2635     uint8_t round = get_round(vxrm, res, 1);
2636     uint64_t over = (uint64_t)(res > a) << 63;
2637 
2638     return ((res >> 1) | over) + round;
2639 }
2640 
2641 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2642 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2643 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2644 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2645 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2646 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2647 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2648 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2649 
2650 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2651 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2652 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2653 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2654 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2655 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2656 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2657 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2658 
2659 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2660 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2661 {
2662     uint8_t round;
2663     int16_t res;
2664 
2665     res = (int16_t)a * (int16_t)b;
2666     round = get_round(vxrm, res, 7);
2667     res = (res >> 7) + round;
2668 
2669     if (res > INT8_MAX) {
2670         env->vxsat = 0x1;
2671         return INT8_MAX;
2672     } else if (res < INT8_MIN) {
2673         env->vxsat = 0x1;
2674         return INT8_MIN;
2675     } else {
2676         return res;
2677     }
2678 }
2679 
2680 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2681 {
2682     uint8_t round;
2683     int32_t res;
2684 
2685     res = (int32_t)a * (int32_t)b;
2686     round = get_round(vxrm, res, 15);
2687     res = (res >> 15) + round;
2688 
2689     if (res > INT16_MAX) {
2690         env->vxsat = 0x1;
2691         return INT16_MAX;
2692     } else if (res < INT16_MIN) {
2693         env->vxsat = 0x1;
2694         return INT16_MIN;
2695     } else {
2696         return res;
2697     }
2698 }
2699 
2700 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2701 {
2702     uint8_t round;
2703     int64_t res;
2704 
2705     res = (int64_t)a * (int64_t)b;
2706     round = get_round(vxrm, res, 31);
2707     res = (res >> 31) + round;
2708 
2709     if (res > INT32_MAX) {
2710         env->vxsat = 0x1;
2711         return INT32_MAX;
2712     } else if (res < INT32_MIN) {
2713         env->vxsat = 0x1;
2714         return INT32_MIN;
2715     } else {
2716         return res;
2717     }
2718 }
2719 
2720 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2721 {
2722     uint8_t round;
2723     uint64_t hi_64, lo_64;
2724     int64_t res;
2725 
2726     if (a == INT64_MIN && b == INT64_MIN) {
2727         env->vxsat = 1;
2728         return INT64_MAX;
2729     }
2730 
2731     muls64(&lo_64, &hi_64, a, b);
2732     round = get_round(vxrm, lo_64, 63);
2733     /*
2734      * Cannot overflow, as there are always
2735      * 2 sign bits after multiply.
2736      */
2737     res = (hi_64 << 1) | (lo_64 >> 63);
2738     if (round) {
2739         if (res == INT64_MAX) {
2740             env->vxsat = 1;
2741         } else {
2742             res += 1;
2743         }
2744     }
2745     return res;
2746 }
2747 
2748 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2749 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2750 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2751 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2752 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2753 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2754 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2755 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2756 
2757 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2758 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2759 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2760 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2761 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2762 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2763 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2764 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2765 
2766 /* Vector Single-Width Scaling Shift Instructions */
2767 static inline uint8_t
2768 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2769 {
2770     uint8_t round, shift = b & 0x7;
2771     uint8_t res;
2772 
2773     round = get_round(vxrm, a, shift);
2774     res = (a >> shift) + round;
2775     return res;
2776 }
2777 static inline uint16_t
2778 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2779 {
2780     uint8_t round, shift = b & 0xf;
2781 
2782     round = get_round(vxrm, a, shift);
2783     return (a >> shift) + round;
2784 }
2785 static inline uint32_t
2786 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2787 {
2788     uint8_t round, shift = b & 0x1f;
2789 
2790     round = get_round(vxrm, a, shift);
2791     return (a >> shift) + round;
2792 }
2793 static inline uint64_t
2794 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2795 {
2796     uint8_t round, shift = b & 0x3f;
2797 
2798     round = get_round(vxrm, a, shift);
2799     return (a >> shift) + round;
2800 }
2801 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2802 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2803 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2804 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2805 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2806 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2807 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2808 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2809 
2810 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2811 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2812 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2813 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2814 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2815 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2816 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2817 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2818 
2819 static inline int8_t
2820 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2821 {
2822     uint8_t round, shift = b & 0x7;
2823 
2824     round = get_round(vxrm, a, shift);
2825     return (a >> shift) + round;
2826 }
2827 static inline int16_t
2828 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2829 {
2830     uint8_t round, shift = b & 0xf;
2831 
2832     round = get_round(vxrm, a, shift);
2833     return (a >> shift) + round;
2834 }
2835 static inline int32_t
2836 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2837 {
2838     uint8_t round, shift = b & 0x1f;
2839 
2840     round = get_round(vxrm, a, shift);
2841     return (a >> shift) + round;
2842 }
2843 static inline int64_t
2844 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2845 {
2846     uint8_t round, shift = b & 0x3f;
2847 
2848     round = get_round(vxrm, a, shift);
2849     return (a >> shift) + round;
2850 }
2851 
2852 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2853 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2854 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2855 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2856 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2857 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2858 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2859 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2860 
2861 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2862 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2863 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2864 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2865 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2866 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2867 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2868 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2869 
2870 /* Vector Narrowing Fixed-Point Clip Instructions */
2871 static inline int8_t
2872 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2873 {
2874     uint8_t round, shift = b & 0xf;
2875     int16_t res;
2876 
2877     round = get_round(vxrm, a, shift);
2878     res = (a >> shift) + round;
2879     if (res > INT8_MAX) {
2880         env->vxsat = 0x1;
2881         return INT8_MAX;
2882     } else if (res < INT8_MIN) {
2883         env->vxsat = 0x1;
2884         return INT8_MIN;
2885     } else {
2886         return res;
2887     }
2888 }
2889 
2890 static inline int16_t
2891 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2892 {
2893     uint8_t round, shift = b & 0x1f;
2894     int32_t res;
2895 
2896     round = get_round(vxrm, a, shift);
2897     res = (a >> shift) + round;
2898     if (res > INT16_MAX) {
2899         env->vxsat = 0x1;
2900         return INT16_MAX;
2901     } else if (res < INT16_MIN) {
2902         env->vxsat = 0x1;
2903         return INT16_MIN;
2904     } else {
2905         return res;
2906     }
2907 }
2908 
2909 static inline int32_t
2910 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2911 {
2912     uint8_t round, shift = b & 0x3f;
2913     int64_t res;
2914 
2915     round = get_round(vxrm, a, shift);
2916     res = (a >> shift) + round;
2917     if (res > INT32_MAX) {
2918         env->vxsat = 0x1;
2919         return INT32_MAX;
2920     } else if (res < INT32_MIN) {
2921         env->vxsat = 0x1;
2922         return INT32_MIN;
2923     } else {
2924         return res;
2925     }
2926 }
2927 
2928 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2929 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2930 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2931 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2932 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2933 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2934 
2935 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2936 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2937 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2938 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2939 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2940 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2941 
2942 static inline uint8_t
2943 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2944 {
2945     uint8_t round, shift = b & 0xf;
2946     uint16_t res;
2947 
2948     round = get_round(vxrm, a, shift);
2949     res = (a >> shift) + round;
2950     if (res > UINT8_MAX) {
2951         env->vxsat = 0x1;
2952         return UINT8_MAX;
2953     } else {
2954         return res;
2955     }
2956 }
2957 
2958 static inline uint16_t
2959 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2960 {
2961     uint8_t round, shift = b & 0x1f;
2962     uint32_t res;
2963 
2964     round = get_round(vxrm, a, shift);
2965     res = (a >> shift) + round;
2966     if (res > UINT16_MAX) {
2967         env->vxsat = 0x1;
2968         return UINT16_MAX;
2969     } else {
2970         return res;
2971     }
2972 }
2973 
2974 static inline uint32_t
2975 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2976 {
2977     uint8_t round, shift = b & 0x3f;
2978     uint64_t res;
2979 
2980     round = get_round(vxrm, a, shift);
2981     res = (a >> shift) + round;
2982     if (res > UINT32_MAX) {
2983         env->vxsat = 0x1;
2984         return UINT32_MAX;
2985     } else {
2986         return res;
2987     }
2988 }
2989 
2990 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2991 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2992 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2993 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2994 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2995 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2996 
2997 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2998 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2999 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3000 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3001 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3002 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3003 
3004 /*
3005  * Vector Float Point Arithmetic Instructions
3006  */
3007 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3008 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3009 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3010                       CPURISCVState *env)                      \
3011 {                                                              \
3012     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3013     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3014     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3015 }
3016 
3017 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3018 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3019                   void *vs2, CPURISCVState *env,          \
3020                   uint32_t desc)                          \
3021 {                                                         \
3022     uint32_t vm = vext_vm(desc);                          \
3023     uint32_t vl = env->vl;                                \
3024     uint32_t total_elems =                                \
3025         vext_get_total_elems(env, desc, ESZ);             \
3026     uint32_t vta = vext_vta(desc);                        \
3027     uint32_t vma = vext_vma(desc);                        \
3028     uint32_t i;                                           \
3029                                                           \
3030     for (i = env->vstart; i < vl; i++) {                  \
3031         if (!vm && !vext_elem_mask(v0, i)) {              \
3032             /* set masked-off elements to 1s */           \
3033             vext_set_elems_1s(vd, vma, i * ESZ,           \
3034                               (i + 1) * ESZ);             \
3035             continue;                                     \
3036         }                                                 \
3037         do_##NAME(vd, vs1, vs2, i, env);                  \
3038     }                                                     \
3039     env->vstart = 0;                                      \
3040     /* set tail elements to 1s */                         \
3041     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3042                       total_elems * ESZ);                 \
3043 }
3044 
3045 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3046 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3047 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3048 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3049 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3050 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3051 
3052 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3053 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3054                       CPURISCVState *env)                      \
3055 {                                                              \
3056     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3057     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3058 }
3059 
3060 #define GEN_VEXT_VF(NAME, ESZ)                            \
3061 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3062                   void *vs2, CPURISCVState *env,          \
3063                   uint32_t desc)                          \
3064 {                                                         \
3065     uint32_t vm = vext_vm(desc);                          \
3066     uint32_t vl = env->vl;                                \
3067     uint32_t total_elems =                                \
3068         vext_get_total_elems(env, desc, ESZ);             \
3069     uint32_t vta = vext_vta(desc);                        \
3070     uint32_t vma = vext_vma(desc);                        \
3071     uint32_t i;                                           \
3072                                                           \
3073     for (i = env->vstart; i < vl; i++) {                  \
3074         if (!vm && !vext_elem_mask(v0, i)) {              \
3075             /* set masked-off elements to 1s */           \
3076             vext_set_elems_1s(vd, vma, i * ESZ,           \
3077                               (i + 1) * ESZ);             \
3078             continue;                                     \
3079         }                                                 \
3080         do_##NAME(vd, s1, vs2, i, env);                   \
3081     }                                                     \
3082     env->vstart = 0;                                      \
3083     /* set tail elements to 1s */                         \
3084     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3085                       total_elems * ESZ);                 \
3086 }
3087 
3088 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3089 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3090 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3091 GEN_VEXT_VF(vfadd_vf_h, 2)
3092 GEN_VEXT_VF(vfadd_vf_w, 4)
3093 GEN_VEXT_VF(vfadd_vf_d, 8)
3094 
3095 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3096 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3097 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3098 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3099 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3100 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3101 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3102 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3103 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3104 GEN_VEXT_VF(vfsub_vf_h, 2)
3105 GEN_VEXT_VF(vfsub_vf_w, 4)
3106 GEN_VEXT_VF(vfsub_vf_d, 8)
3107 
3108 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3109 {
3110     return float16_sub(b, a, s);
3111 }
3112 
3113 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3114 {
3115     return float32_sub(b, a, s);
3116 }
3117 
3118 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3119 {
3120     return float64_sub(b, a, s);
3121 }
3122 
3123 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3124 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3125 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3126 GEN_VEXT_VF(vfrsub_vf_h, 2)
3127 GEN_VEXT_VF(vfrsub_vf_w, 4)
3128 GEN_VEXT_VF(vfrsub_vf_d, 8)
3129 
3130 /* Vector Widening Floating-Point Add/Subtract Instructions */
3131 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3132 {
3133     return float32_add(float16_to_float32(a, true, s),
3134                        float16_to_float32(b, true, s), s);
3135 }
3136 
3137 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3138 {
3139     return float64_add(float32_to_float64(a, s),
3140                        float32_to_float64(b, s), s);
3141 
3142 }
3143 
3144 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3145 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3146 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3147 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3148 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3149 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3150 GEN_VEXT_VF(vfwadd_vf_h, 4)
3151 GEN_VEXT_VF(vfwadd_vf_w, 8)
3152 
3153 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3154 {
3155     return float32_sub(float16_to_float32(a, true, s),
3156                        float16_to_float32(b, true, s), s);
3157 }
3158 
3159 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3160 {
3161     return float64_sub(float32_to_float64(a, s),
3162                        float32_to_float64(b, s), s);
3163 
3164 }
3165 
3166 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3167 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3168 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3169 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3170 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3171 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3172 GEN_VEXT_VF(vfwsub_vf_h, 4)
3173 GEN_VEXT_VF(vfwsub_vf_w, 8)
3174 
3175 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3176 {
3177     return float32_add(a, float16_to_float32(b, true, s), s);
3178 }
3179 
3180 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3181 {
3182     return float64_add(a, float32_to_float64(b, s), s);
3183 }
3184 
3185 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3186 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3187 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3188 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3189 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3190 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3191 GEN_VEXT_VF(vfwadd_wf_h, 4)
3192 GEN_VEXT_VF(vfwadd_wf_w, 8)
3193 
3194 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3195 {
3196     return float32_sub(a, float16_to_float32(b, true, s), s);
3197 }
3198 
3199 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3200 {
3201     return float64_sub(a, float32_to_float64(b, s), s);
3202 }
3203 
3204 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3205 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3206 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3207 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3208 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3209 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3210 GEN_VEXT_VF(vfwsub_wf_h, 4)
3211 GEN_VEXT_VF(vfwsub_wf_w, 8)
3212 
3213 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3214 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3215 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3216 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3217 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3218 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3219 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3220 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3221 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3222 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3223 GEN_VEXT_VF(vfmul_vf_h, 2)
3224 GEN_VEXT_VF(vfmul_vf_w, 4)
3225 GEN_VEXT_VF(vfmul_vf_d, 8)
3226 
3227 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3228 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3229 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3230 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3231 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3232 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3233 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3234 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3235 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3236 GEN_VEXT_VF(vfdiv_vf_h, 2)
3237 GEN_VEXT_VF(vfdiv_vf_w, 4)
3238 GEN_VEXT_VF(vfdiv_vf_d, 8)
3239 
3240 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3241 {
3242     return float16_div(b, a, s);
3243 }
3244 
3245 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3246 {
3247     return float32_div(b, a, s);
3248 }
3249 
3250 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3251 {
3252     return float64_div(b, a, s);
3253 }
3254 
3255 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3256 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3257 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3258 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3259 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3260 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3261 
3262 /* Vector Widening Floating-Point Multiply */
3263 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3264 {
3265     return float32_mul(float16_to_float32(a, true, s),
3266                        float16_to_float32(b, true, s), s);
3267 }
3268 
3269 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3270 {
3271     return float64_mul(float32_to_float64(a, s),
3272                        float32_to_float64(b, s), s);
3273 
3274 }
3275 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3276 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3277 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3278 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3279 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3280 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3281 GEN_VEXT_VF(vfwmul_vf_h, 4)
3282 GEN_VEXT_VF(vfwmul_vf_w, 8)
3283 
3284 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3285 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3286 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3287                       CPURISCVState *env)                          \
3288 {                                                                  \
3289     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3290     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3291     TD d = *((TD *)vd + HD(i));                                    \
3292     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3293 }
3294 
3295 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3296 {
3297     return float16_muladd(a, b, d, 0, s);
3298 }
3299 
3300 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3301 {
3302     return float32_muladd(a, b, d, 0, s);
3303 }
3304 
3305 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3306 {
3307     return float64_muladd(a, b, d, 0, s);
3308 }
3309 
3310 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3311 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3312 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3313 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3314 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3315 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3316 
3317 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3318 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3319                       CPURISCVState *env)                         \
3320 {                                                                 \
3321     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3322     TD d = *((TD *)vd + HD(i));                                   \
3323     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3324 }
3325 
3326 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3327 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3328 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3329 GEN_VEXT_VF(vfmacc_vf_h, 2)
3330 GEN_VEXT_VF(vfmacc_vf_w, 4)
3331 GEN_VEXT_VF(vfmacc_vf_d, 8)
3332 
3333 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3334 {
3335     return float16_muladd(a, b, d, float_muladd_negate_c |
3336                                    float_muladd_negate_product, s);
3337 }
3338 
3339 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3340 {
3341     return float32_muladd(a, b, d, float_muladd_negate_c |
3342                                    float_muladd_negate_product, s);
3343 }
3344 
3345 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3346 {
3347     return float64_muladd(a, b, d, float_muladd_negate_c |
3348                                    float_muladd_negate_product, s);
3349 }
3350 
3351 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3352 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3353 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3354 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3355 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3356 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3357 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3358 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3359 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3360 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3361 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3362 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3363 
3364 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3365 {
3366     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3367 }
3368 
3369 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3370 {
3371     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3372 }
3373 
3374 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3375 {
3376     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3377 }
3378 
3379 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3380 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3381 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3382 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3383 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3384 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3385 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3386 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3387 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3388 GEN_VEXT_VF(vfmsac_vf_h, 2)
3389 GEN_VEXT_VF(vfmsac_vf_w, 4)
3390 GEN_VEXT_VF(vfmsac_vf_d, 8)
3391 
3392 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3393 {
3394     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3395 }
3396 
3397 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3398 {
3399     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3400 }
3401 
3402 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3403 {
3404     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3405 }
3406 
3407 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3408 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3409 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3410 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3411 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3412 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3413 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3414 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3415 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3416 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3417 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3418 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3419 
3420 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3421 {
3422     return float16_muladd(d, b, a, 0, s);
3423 }
3424 
3425 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3426 {
3427     return float32_muladd(d, b, a, 0, s);
3428 }
3429 
3430 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3431 {
3432     return float64_muladd(d, b, a, 0, s);
3433 }
3434 
3435 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3436 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3437 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3438 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3439 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3440 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3441 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3442 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3443 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3444 GEN_VEXT_VF(vfmadd_vf_h, 2)
3445 GEN_VEXT_VF(vfmadd_vf_w, 4)
3446 GEN_VEXT_VF(vfmadd_vf_d, 8)
3447 
3448 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3449 {
3450     return float16_muladd(d, b, a, float_muladd_negate_c |
3451                                    float_muladd_negate_product, s);
3452 }
3453 
3454 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3455 {
3456     return float32_muladd(d, b, a, float_muladd_negate_c |
3457                                    float_muladd_negate_product, s);
3458 }
3459 
3460 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3461 {
3462     return float64_muladd(d, b, a, float_muladd_negate_c |
3463                                    float_muladd_negate_product, s);
3464 }
3465 
3466 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3467 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3468 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3469 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3470 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3471 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3472 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3473 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3474 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3475 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3476 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3477 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3478 
3479 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3480 {
3481     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3482 }
3483 
3484 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3485 {
3486     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3487 }
3488 
3489 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3490 {
3491     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3492 }
3493 
3494 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3495 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3496 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3497 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3498 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3499 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3500 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3501 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3502 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3503 GEN_VEXT_VF(vfmsub_vf_h, 2)
3504 GEN_VEXT_VF(vfmsub_vf_w, 4)
3505 GEN_VEXT_VF(vfmsub_vf_d, 8)
3506 
3507 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3508 {
3509     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3510 }
3511 
3512 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3513 {
3514     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3515 }
3516 
3517 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3518 {
3519     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3520 }
3521 
3522 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3523 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3524 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3525 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3526 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3527 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3528 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3529 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3530 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3531 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3532 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3533 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3534 
3535 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3536 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3537 {
3538     return float32_muladd(float16_to_float32(a, true, s),
3539                           float16_to_float32(b, true, s), d, 0, s);
3540 }
3541 
3542 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3543 {
3544     return float64_muladd(float32_to_float64(a, s),
3545                           float32_to_float64(b, s), d, 0, s);
3546 }
3547 
3548 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3549 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3550 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3551 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3552 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3553 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3554 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3555 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3556 
3557 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3558 {
3559     return float32_muladd(bfloat16_to_float32(a, s),
3560                           bfloat16_to_float32(b, s), d, 0, s);
3561 }
3562 
3563 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3564 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3565 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmacc16)
3566 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3567 
3568 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3569 {
3570     return float32_muladd(float16_to_float32(a, true, s),
3571                           float16_to_float32(b, true, s), d,
3572                           float_muladd_negate_c | float_muladd_negate_product,
3573                           s);
3574 }
3575 
3576 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3577 {
3578     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3579                           d, float_muladd_negate_c |
3580                              float_muladd_negate_product, s);
3581 }
3582 
3583 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3584 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3585 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3586 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3587 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3588 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3589 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3590 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3591 
3592 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3593 {
3594     return float32_muladd(float16_to_float32(a, true, s),
3595                           float16_to_float32(b, true, s), d,
3596                           float_muladd_negate_c, s);
3597 }
3598 
3599 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3600 {
3601     return float64_muladd(float32_to_float64(a, s),
3602                           float32_to_float64(b, s), d,
3603                           float_muladd_negate_c, s);
3604 }
3605 
3606 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3607 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3608 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3609 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3610 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3611 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3612 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3613 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3614 
3615 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3616 {
3617     return float32_muladd(float16_to_float32(a, true, s),
3618                           float16_to_float32(b, true, s), d,
3619                           float_muladd_negate_product, s);
3620 }
3621 
3622 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3623 {
3624     return float64_muladd(float32_to_float64(a, s),
3625                           float32_to_float64(b, s), d,
3626                           float_muladd_negate_product, s);
3627 }
3628 
3629 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3630 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3631 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3632 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3633 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3634 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3635 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3636 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3637 
3638 /* Vector Floating-Point Square-Root Instruction */
3639 /* (TD, T2, TX2) */
3640 #define OP_UU_H uint16_t, uint16_t, uint16_t
3641 #define OP_UU_W uint32_t, uint32_t, uint32_t
3642 #define OP_UU_D uint64_t, uint64_t, uint64_t
3643 
3644 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3645 static void do_##NAME(void *vd, void *vs2, int i,      \
3646                       CPURISCVState *env)              \
3647 {                                                      \
3648     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3649     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3650 }
3651 
3652 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3653 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3654                   CPURISCVState *env, uint32_t desc)   \
3655 {                                                      \
3656     uint32_t vm = vext_vm(desc);                       \
3657     uint32_t vl = env->vl;                             \
3658     uint32_t total_elems =                             \
3659         vext_get_total_elems(env, desc, ESZ);          \
3660     uint32_t vta = vext_vta(desc);                     \
3661     uint32_t vma = vext_vma(desc);                     \
3662     uint32_t i;                                        \
3663                                                        \
3664     if (vl == 0) {                                     \
3665         return;                                        \
3666     }                                                  \
3667     for (i = env->vstart; i < vl; i++) {               \
3668         if (!vm && !vext_elem_mask(v0, i)) {           \
3669             /* set masked-off elements to 1s */        \
3670             vext_set_elems_1s(vd, vma, i * ESZ,        \
3671                               (i + 1) * ESZ);          \
3672             continue;                                  \
3673         }                                              \
3674         do_##NAME(vd, vs2, i, env);                    \
3675     }                                                  \
3676     env->vstart = 0;                                   \
3677     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3678                       total_elems * ESZ);              \
3679 }
3680 
3681 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3682 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3683 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3684 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3685 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3686 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3687 
3688 /*
3689  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3690  *
3691  * Adapted from riscv-v-spec recip.c:
3692  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3693  */
3694 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3695 {
3696     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3697     uint64_t exp = extract64(f, frac_size, exp_size);
3698     uint64_t frac = extract64(f, 0, frac_size);
3699 
3700     const uint8_t lookup_table[] = {
3701         52, 51, 50, 48, 47, 46, 44, 43,
3702         42, 41, 40, 39, 38, 36, 35, 34,
3703         33, 32, 31, 30, 30, 29, 28, 27,
3704         26, 25, 24, 23, 23, 22, 21, 20,
3705         19, 19, 18, 17, 16, 16, 15, 14,
3706         14, 13, 12, 12, 11, 10, 10, 9,
3707         9, 8, 7, 7, 6, 6, 5, 4,
3708         4, 3, 3, 2, 2, 1, 1, 0,
3709         127, 125, 123, 121, 119, 118, 116, 114,
3710         113, 111, 109, 108, 106, 105, 103, 102,
3711         100, 99, 97, 96, 95, 93, 92, 91,
3712         90, 88, 87, 86, 85, 84, 83, 82,
3713         80, 79, 78, 77, 76, 75, 74, 73,
3714         72, 71, 70, 70, 69, 68, 67, 66,
3715         65, 64, 63, 63, 62, 61, 60, 59,
3716         59, 58, 57, 56, 56, 55, 54, 53
3717     };
3718     const int precision = 7;
3719 
3720     if (exp == 0 && frac != 0) { /* subnormal */
3721         /* Normalize the subnormal. */
3722         while (extract64(frac, frac_size - 1, 1) == 0) {
3723             exp--;
3724             frac <<= 1;
3725         }
3726 
3727         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3728     }
3729 
3730     int idx = ((exp & 1) << (precision - 1)) |
3731               (frac >> (frac_size - precision + 1));
3732     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3733                         (frac_size - precision);
3734     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3735 
3736     uint64_t val = 0;
3737     val = deposit64(val, 0, frac_size, out_frac);
3738     val = deposit64(val, frac_size, exp_size, out_exp);
3739     val = deposit64(val, frac_size + exp_size, 1, sign);
3740     return val;
3741 }
3742 
3743 static float16 frsqrt7_h(float16 f, float_status *s)
3744 {
3745     int exp_size = 5, frac_size = 10;
3746     bool sign = float16_is_neg(f);
3747 
3748     /*
3749      * frsqrt7(sNaN) = canonical NaN
3750      * frsqrt7(-inf) = canonical NaN
3751      * frsqrt7(-normal) = canonical NaN
3752      * frsqrt7(-subnormal) = canonical NaN
3753      */
3754     if (float16_is_signaling_nan(f, s) ||
3755         (float16_is_infinity(f) && sign) ||
3756         (float16_is_normal(f) && sign) ||
3757         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3758         s->float_exception_flags |= float_flag_invalid;
3759         return float16_default_nan(s);
3760     }
3761 
3762     /* frsqrt7(qNaN) = canonical NaN */
3763     if (float16_is_quiet_nan(f, s)) {
3764         return float16_default_nan(s);
3765     }
3766 
3767     /* frsqrt7(+-0) = +-inf */
3768     if (float16_is_zero(f)) {
3769         s->float_exception_flags |= float_flag_divbyzero;
3770         return float16_set_sign(float16_infinity, sign);
3771     }
3772 
3773     /* frsqrt7(+inf) = +0 */
3774     if (float16_is_infinity(f) && !sign) {
3775         return float16_set_sign(float16_zero, sign);
3776     }
3777 
3778     /* +normal, +subnormal */
3779     uint64_t val = frsqrt7(f, exp_size, frac_size);
3780     return make_float16(val);
3781 }
3782 
3783 static float32 frsqrt7_s(float32 f, float_status *s)
3784 {
3785     int exp_size = 8, frac_size = 23;
3786     bool sign = float32_is_neg(f);
3787 
3788     /*
3789      * frsqrt7(sNaN) = canonical NaN
3790      * frsqrt7(-inf) = canonical NaN
3791      * frsqrt7(-normal) = canonical NaN
3792      * frsqrt7(-subnormal) = canonical NaN
3793      */
3794     if (float32_is_signaling_nan(f, s) ||
3795         (float32_is_infinity(f) && sign) ||
3796         (float32_is_normal(f) && sign) ||
3797         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3798         s->float_exception_flags |= float_flag_invalid;
3799         return float32_default_nan(s);
3800     }
3801 
3802     /* frsqrt7(qNaN) = canonical NaN */
3803     if (float32_is_quiet_nan(f, s)) {
3804         return float32_default_nan(s);
3805     }
3806 
3807     /* frsqrt7(+-0) = +-inf */
3808     if (float32_is_zero(f)) {
3809         s->float_exception_flags |= float_flag_divbyzero;
3810         return float32_set_sign(float32_infinity, sign);
3811     }
3812 
3813     /* frsqrt7(+inf) = +0 */
3814     if (float32_is_infinity(f) && !sign) {
3815         return float32_set_sign(float32_zero, sign);
3816     }
3817 
3818     /* +normal, +subnormal */
3819     uint64_t val = frsqrt7(f, exp_size, frac_size);
3820     return make_float32(val);
3821 }
3822 
3823 static float64 frsqrt7_d(float64 f, float_status *s)
3824 {
3825     int exp_size = 11, frac_size = 52;
3826     bool sign = float64_is_neg(f);
3827 
3828     /*
3829      * frsqrt7(sNaN) = canonical NaN
3830      * frsqrt7(-inf) = canonical NaN
3831      * frsqrt7(-normal) = canonical NaN
3832      * frsqrt7(-subnormal) = canonical NaN
3833      */
3834     if (float64_is_signaling_nan(f, s) ||
3835         (float64_is_infinity(f) && sign) ||
3836         (float64_is_normal(f) && sign) ||
3837         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3838         s->float_exception_flags |= float_flag_invalid;
3839         return float64_default_nan(s);
3840     }
3841 
3842     /* frsqrt7(qNaN) = canonical NaN */
3843     if (float64_is_quiet_nan(f, s)) {
3844         return float64_default_nan(s);
3845     }
3846 
3847     /* frsqrt7(+-0) = +-inf */
3848     if (float64_is_zero(f)) {
3849         s->float_exception_flags |= float_flag_divbyzero;
3850         return float64_set_sign(float64_infinity, sign);
3851     }
3852 
3853     /* frsqrt7(+inf) = +0 */
3854     if (float64_is_infinity(f) && !sign) {
3855         return float64_set_sign(float64_zero, sign);
3856     }
3857 
3858     /* +normal, +subnormal */
3859     uint64_t val = frsqrt7(f, exp_size, frac_size);
3860     return make_float64(val);
3861 }
3862 
3863 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3864 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3865 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3866 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3867 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3868 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3869 
3870 /*
3871  * Vector Floating-Point Reciprocal Estimate Instruction
3872  *
3873  * Adapted from riscv-v-spec recip.c:
3874  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3875  */
3876 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3877                       float_status *s)
3878 {
3879     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3880     uint64_t exp = extract64(f, frac_size, exp_size);
3881     uint64_t frac = extract64(f, 0, frac_size);
3882 
3883     const uint8_t lookup_table[] = {
3884         127, 125, 123, 121, 119, 117, 116, 114,
3885         112, 110, 109, 107, 105, 104, 102, 100,
3886         99, 97, 96, 94, 93, 91, 90, 88,
3887         87, 85, 84, 83, 81, 80, 79, 77,
3888         76, 75, 74, 72, 71, 70, 69, 68,
3889         66, 65, 64, 63, 62, 61, 60, 59,
3890         58, 57, 56, 55, 54, 53, 52, 51,
3891         50, 49, 48, 47, 46, 45, 44, 43,
3892         42, 41, 40, 40, 39, 38, 37, 36,
3893         35, 35, 34, 33, 32, 31, 31, 30,
3894         29, 28, 28, 27, 26, 25, 25, 24,
3895         23, 23, 22, 21, 21, 20, 19, 19,
3896         18, 17, 17, 16, 15, 15, 14, 14,
3897         13, 12, 12, 11, 11, 10, 9, 9,
3898         8, 8, 7, 7, 6, 5, 5, 4,
3899         4, 3, 3, 2, 2, 1, 1, 0
3900     };
3901     const int precision = 7;
3902 
3903     if (exp == 0 && frac != 0) { /* subnormal */
3904         /* Normalize the subnormal. */
3905         while (extract64(frac, frac_size - 1, 1) == 0) {
3906             exp--;
3907             frac <<= 1;
3908         }
3909 
3910         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3911 
3912         if (exp != 0 && exp != UINT64_MAX) {
3913             /*
3914              * Overflow to inf or max value of same sign,
3915              * depending on sign and rounding mode.
3916              */
3917             s->float_exception_flags |= (float_flag_inexact |
3918                                          float_flag_overflow);
3919 
3920             if ((s->float_rounding_mode == float_round_to_zero) ||
3921                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3922                 ((s->float_rounding_mode == float_round_up) && sign)) {
3923                 /* Return greatest/negative finite value. */
3924                 return (sign << (exp_size + frac_size)) |
3925                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3926             } else {
3927                 /* Return +-inf. */
3928                 return (sign << (exp_size + frac_size)) |
3929                        MAKE_64BIT_MASK(frac_size, exp_size);
3930             }
3931         }
3932     }
3933 
3934     int idx = frac >> (frac_size - precision);
3935     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3936                         (frac_size - precision);
3937     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3938 
3939     if (out_exp == 0 || out_exp == UINT64_MAX) {
3940         /*
3941          * The result is subnormal, but don't raise the underflow exception,
3942          * because there's no additional loss of precision.
3943          */
3944         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3945         if (out_exp == UINT64_MAX) {
3946             out_frac >>= 1;
3947             out_exp = 0;
3948         }
3949     }
3950 
3951     uint64_t val = 0;
3952     val = deposit64(val, 0, frac_size, out_frac);
3953     val = deposit64(val, frac_size, exp_size, out_exp);
3954     val = deposit64(val, frac_size + exp_size, 1, sign);
3955     return val;
3956 }
3957 
3958 static float16 frec7_h(float16 f, float_status *s)
3959 {
3960     int exp_size = 5, frac_size = 10;
3961     bool sign = float16_is_neg(f);
3962 
3963     /* frec7(+-inf) = +-0 */
3964     if (float16_is_infinity(f)) {
3965         return float16_set_sign(float16_zero, sign);
3966     }
3967 
3968     /* frec7(+-0) = +-inf */
3969     if (float16_is_zero(f)) {
3970         s->float_exception_flags |= float_flag_divbyzero;
3971         return float16_set_sign(float16_infinity, sign);
3972     }
3973 
3974     /* frec7(sNaN) = canonical NaN */
3975     if (float16_is_signaling_nan(f, s)) {
3976         s->float_exception_flags |= float_flag_invalid;
3977         return float16_default_nan(s);
3978     }
3979 
3980     /* frec7(qNaN) = canonical NaN */
3981     if (float16_is_quiet_nan(f, s)) {
3982         return float16_default_nan(s);
3983     }
3984 
3985     /* +-normal, +-subnormal */
3986     uint64_t val = frec7(f, exp_size, frac_size, s);
3987     return make_float16(val);
3988 }
3989 
3990 static float32 frec7_s(float32 f, float_status *s)
3991 {
3992     int exp_size = 8, frac_size = 23;
3993     bool sign = float32_is_neg(f);
3994 
3995     /* frec7(+-inf) = +-0 */
3996     if (float32_is_infinity(f)) {
3997         return float32_set_sign(float32_zero, sign);
3998     }
3999 
4000     /* frec7(+-0) = +-inf */
4001     if (float32_is_zero(f)) {
4002         s->float_exception_flags |= float_flag_divbyzero;
4003         return float32_set_sign(float32_infinity, sign);
4004     }
4005 
4006     /* frec7(sNaN) = canonical NaN */
4007     if (float32_is_signaling_nan(f, s)) {
4008         s->float_exception_flags |= float_flag_invalid;
4009         return float32_default_nan(s);
4010     }
4011 
4012     /* frec7(qNaN) = canonical NaN */
4013     if (float32_is_quiet_nan(f, s)) {
4014         return float32_default_nan(s);
4015     }
4016 
4017     /* +-normal, +-subnormal */
4018     uint64_t val = frec7(f, exp_size, frac_size, s);
4019     return make_float32(val);
4020 }
4021 
4022 static float64 frec7_d(float64 f, float_status *s)
4023 {
4024     int exp_size = 11, frac_size = 52;
4025     bool sign = float64_is_neg(f);
4026 
4027     /* frec7(+-inf) = +-0 */
4028     if (float64_is_infinity(f)) {
4029         return float64_set_sign(float64_zero, sign);
4030     }
4031 
4032     /* frec7(+-0) = +-inf */
4033     if (float64_is_zero(f)) {
4034         s->float_exception_flags |= float_flag_divbyzero;
4035         return float64_set_sign(float64_infinity, sign);
4036     }
4037 
4038     /* frec7(sNaN) = canonical NaN */
4039     if (float64_is_signaling_nan(f, s)) {
4040         s->float_exception_flags |= float_flag_invalid;
4041         return float64_default_nan(s);
4042     }
4043 
4044     /* frec7(qNaN) = canonical NaN */
4045     if (float64_is_quiet_nan(f, s)) {
4046         return float64_default_nan(s);
4047     }
4048 
4049     /* +-normal, +-subnormal */
4050     uint64_t val = frec7(f, exp_size, frac_size, s);
4051     return make_float64(val);
4052 }
4053 
4054 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4055 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4056 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4057 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4058 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4059 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4060 
4061 /* Vector Floating-Point MIN/MAX Instructions */
4062 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4063 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4064 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4065 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4066 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4067 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4068 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4069 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4070 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4071 GEN_VEXT_VF(vfmin_vf_h, 2)
4072 GEN_VEXT_VF(vfmin_vf_w, 4)
4073 GEN_VEXT_VF(vfmin_vf_d, 8)
4074 
4075 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4076 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4077 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4078 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4079 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4080 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4081 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4082 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4083 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4084 GEN_VEXT_VF(vfmax_vf_h, 2)
4085 GEN_VEXT_VF(vfmax_vf_w, 4)
4086 GEN_VEXT_VF(vfmax_vf_d, 8)
4087 
4088 /* Vector Floating-Point Sign-Injection Instructions */
4089 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4090 {
4091     return deposit64(b, 0, 15, a);
4092 }
4093 
4094 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4095 {
4096     return deposit64(b, 0, 31, a);
4097 }
4098 
4099 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4100 {
4101     return deposit64(b, 0, 63, a);
4102 }
4103 
4104 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4105 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4106 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4107 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4108 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4109 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4110 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4111 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4112 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4113 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4114 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4115 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4116 
4117 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4118 {
4119     return deposit64(~b, 0, 15, a);
4120 }
4121 
4122 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4123 {
4124     return deposit64(~b, 0, 31, a);
4125 }
4126 
4127 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4128 {
4129     return deposit64(~b, 0, 63, a);
4130 }
4131 
4132 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4133 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4134 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4135 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4136 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4137 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4138 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4139 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4140 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4141 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4142 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4143 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4144 
4145 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4146 {
4147     return deposit64(b ^ a, 0, 15, a);
4148 }
4149 
4150 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4151 {
4152     return deposit64(b ^ a, 0, 31, a);
4153 }
4154 
4155 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4156 {
4157     return deposit64(b ^ a, 0, 63, a);
4158 }
4159 
4160 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4161 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4162 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4163 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4164 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4165 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4166 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4167 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4168 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4169 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4170 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4171 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4172 
4173 /* Vector Floating-Point Compare Instructions */
4174 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4175 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4176                   CPURISCVState *env, uint32_t desc)          \
4177 {                                                             \
4178     uint32_t vm = vext_vm(desc);                              \
4179     uint32_t vl = env->vl;                                    \
4180     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
4181     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4182     uint32_t vma = vext_vma(desc);                            \
4183     uint32_t i;                                               \
4184                                                               \
4185     for (i = env->vstart; i < vl; i++) {                      \
4186         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4187         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4188         if (!vm && !vext_elem_mask(v0, i)) {                  \
4189             /* set masked-off elements to 1s */               \
4190             if (vma) {                                        \
4191                 vext_set_elem_mask(vd, i, 1);                 \
4192             }                                                 \
4193             continue;                                         \
4194         }                                                     \
4195         vext_set_elem_mask(vd, i,                             \
4196                            DO_OP(s2, s1, &env->fp_status));   \
4197     }                                                         \
4198     env->vstart = 0;                                          \
4199     /*
4200      * mask destination register are always tail-agnostic
4201      * set tail elements to 1s
4202      */                                                       \
4203     if (vta_all_1s) {                                         \
4204         for (; i < total_elems; i++) {                        \
4205             vext_set_elem_mask(vd, i, 1);                     \
4206         }                                                     \
4207     }                                                         \
4208 }
4209 
4210 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4211 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4212 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4213 
4214 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4215 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4216                   CPURISCVState *env, uint32_t desc)                \
4217 {                                                                   \
4218     uint32_t vm = vext_vm(desc);                                    \
4219     uint32_t vl = env->vl;                                          \
4220     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4221     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4222     uint32_t vma = vext_vma(desc);                                  \
4223     uint32_t i;                                                     \
4224                                                                     \
4225     for (i = env->vstart; i < vl; i++) {                            \
4226         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4227         if (!vm && !vext_elem_mask(v0, i)) {                        \
4228             /* set masked-off elements to 1s */                     \
4229             if (vma) {                                              \
4230                 vext_set_elem_mask(vd, i, 1);                       \
4231             }                                                       \
4232             continue;                                               \
4233         }                                                           \
4234         vext_set_elem_mask(vd, i,                                   \
4235                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4236     }                                                               \
4237     env->vstart = 0;                                                \
4238     /*
4239      * mask destination register are always tail-agnostic
4240      * set tail elements to 1s
4241      */                                                             \
4242     if (vta_all_1s) {                                               \
4243         for (; i < total_elems; i++) {                              \
4244             vext_set_elem_mask(vd, i, 1);                           \
4245         }                                                           \
4246     }                                                               \
4247 }
4248 
4249 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4250 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4251 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4252 
4253 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4254 {
4255     FloatRelation compare = float16_compare_quiet(a, b, s);
4256     return compare != float_relation_equal;
4257 }
4258 
4259 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4260 {
4261     FloatRelation compare = float32_compare_quiet(a, b, s);
4262     return compare != float_relation_equal;
4263 }
4264 
4265 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4266 {
4267     FloatRelation compare = float64_compare_quiet(a, b, s);
4268     return compare != float_relation_equal;
4269 }
4270 
4271 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4272 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4273 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4274 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4275 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4276 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4277 
4278 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4279 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4280 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4281 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4282 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4283 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4284 
4285 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4286 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4287 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4288 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4289 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4290 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4291 
4292 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4293 {
4294     FloatRelation compare = float16_compare(a, b, s);
4295     return compare == float_relation_greater;
4296 }
4297 
4298 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4299 {
4300     FloatRelation compare = float32_compare(a, b, s);
4301     return compare == float_relation_greater;
4302 }
4303 
4304 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4305 {
4306     FloatRelation compare = float64_compare(a, b, s);
4307     return compare == float_relation_greater;
4308 }
4309 
4310 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4311 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4312 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4313 
4314 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4315 {
4316     FloatRelation compare = float16_compare(a, b, s);
4317     return compare == float_relation_greater ||
4318            compare == float_relation_equal;
4319 }
4320 
4321 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4322 {
4323     FloatRelation compare = float32_compare(a, b, s);
4324     return compare == float_relation_greater ||
4325            compare == float_relation_equal;
4326 }
4327 
4328 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4329 {
4330     FloatRelation compare = float64_compare(a, b, s);
4331     return compare == float_relation_greater ||
4332            compare == float_relation_equal;
4333 }
4334 
4335 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4336 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4337 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4338 
4339 /* Vector Floating-Point Classify Instruction */
4340 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4341 static void do_##NAME(void *vd, void *vs2, int i)      \
4342 {                                                      \
4343     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4344     *((TD *)vd + HD(i)) = OP(s2);                      \
4345 }
4346 
4347 #define GEN_VEXT_V(NAME, ESZ)                          \
4348 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4349                   CPURISCVState *env, uint32_t desc)   \
4350 {                                                      \
4351     uint32_t vm = vext_vm(desc);                       \
4352     uint32_t vl = env->vl;                             \
4353     uint32_t total_elems =                             \
4354         vext_get_total_elems(env, desc, ESZ);          \
4355     uint32_t vta = vext_vta(desc);                     \
4356     uint32_t vma = vext_vma(desc);                     \
4357     uint32_t i;                                        \
4358                                                        \
4359     for (i = env->vstart; i < vl; i++) {               \
4360         if (!vm && !vext_elem_mask(v0, i)) {           \
4361             /* set masked-off elements to 1s */        \
4362             vext_set_elems_1s(vd, vma, i * ESZ,        \
4363                               (i + 1) * ESZ);          \
4364             continue;                                  \
4365         }                                              \
4366         do_##NAME(vd, vs2, i);                         \
4367     }                                                  \
4368     env->vstart = 0;                                   \
4369     /* set tail elements to 1s */                      \
4370     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4371                       total_elems * ESZ);              \
4372 }
4373 
4374 target_ulong fclass_h(uint64_t frs1)
4375 {
4376     float16 f = frs1;
4377     bool sign = float16_is_neg(f);
4378 
4379     if (float16_is_infinity(f)) {
4380         return sign ? 1 << 0 : 1 << 7;
4381     } else if (float16_is_zero(f)) {
4382         return sign ? 1 << 3 : 1 << 4;
4383     } else if (float16_is_zero_or_denormal(f)) {
4384         return sign ? 1 << 2 : 1 << 5;
4385     } else if (float16_is_any_nan(f)) {
4386         float_status s = { }; /* for snan_bit_is_one */
4387         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4388     } else {
4389         return sign ? 1 << 1 : 1 << 6;
4390     }
4391 }
4392 
4393 target_ulong fclass_s(uint64_t frs1)
4394 {
4395     float32 f = frs1;
4396     bool sign = float32_is_neg(f);
4397 
4398     if (float32_is_infinity(f)) {
4399         return sign ? 1 << 0 : 1 << 7;
4400     } else if (float32_is_zero(f)) {
4401         return sign ? 1 << 3 : 1 << 4;
4402     } else if (float32_is_zero_or_denormal(f)) {
4403         return sign ? 1 << 2 : 1 << 5;
4404     } else if (float32_is_any_nan(f)) {
4405         float_status s = { }; /* for snan_bit_is_one */
4406         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4407     } else {
4408         return sign ? 1 << 1 : 1 << 6;
4409     }
4410 }
4411 
4412 target_ulong fclass_d(uint64_t frs1)
4413 {
4414     float64 f = frs1;
4415     bool sign = float64_is_neg(f);
4416 
4417     if (float64_is_infinity(f)) {
4418         return sign ? 1 << 0 : 1 << 7;
4419     } else if (float64_is_zero(f)) {
4420         return sign ? 1 << 3 : 1 << 4;
4421     } else if (float64_is_zero_or_denormal(f)) {
4422         return sign ? 1 << 2 : 1 << 5;
4423     } else if (float64_is_any_nan(f)) {
4424         float_status s = { }; /* for snan_bit_is_one */
4425         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4426     } else {
4427         return sign ? 1 << 1 : 1 << 6;
4428     }
4429 }
4430 
4431 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4432 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4433 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4434 GEN_VEXT_V(vfclass_v_h, 2)
4435 GEN_VEXT_V(vfclass_v_w, 4)
4436 GEN_VEXT_V(vfclass_v_d, 8)
4437 
4438 /* Vector Floating-Point Merge Instruction */
4439 
4440 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4441 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4442                   CPURISCVState *env, uint32_t desc)          \
4443 {                                                             \
4444     uint32_t vm = vext_vm(desc);                              \
4445     uint32_t vl = env->vl;                                    \
4446     uint32_t esz = sizeof(ETYPE);                             \
4447     uint32_t total_elems =                                    \
4448         vext_get_total_elems(env, desc, esz);                 \
4449     uint32_t vta = vext_vta(desc);                            \
4450     uint32_t i;                                               \
4451                                                               \
4452     for (i = env->vstart; i < vl; i++) {                      \
4453         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4454         *((ETYPE *)vd + H(i)) =                               \
4455             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4456     }                                                         \
4457     env->vstart = 0;                                          \
4458     /* set tail elements to 1s */                             \
4459     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4460 }
4461 
4462 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4463 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4464 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4465 
4466 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4467 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4468 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4469 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4470 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4471 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4472 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4473 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4474 
4475 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4476 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4477 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4478 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4479 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4480 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4481 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4482 
4483 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4484 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4485 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4486 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4487 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4488 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4489 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4490 
4491 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4492 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4493 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4494 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4495 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4496 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4497 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4498 
4499 /* Widening Floating-Point/Integer Type-Convert Instructions */
4500 /* (TD, T2, TX2) */
4501 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4502 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4503 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4504 /*
4505  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4506  */
4507 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4508 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4509 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4510 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4511 
4512 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4513 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4514 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4515 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4516 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4517 
4518 /*
4519  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4520  */
4521 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4522 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4523 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4524 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4525 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4526 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4527 
4528 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4529 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4530 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4531 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4532 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4533 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4534 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4535 
4536 /*
4537  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4538  */
4539 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4540 {
4541     return float16_to_float32(a, true, s);
4542 }
4543 
4544 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4545 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4546 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4547 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4548 
4549 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4550 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4551 
4552 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4553 /* (TD, T2, TX2) */
4554 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4555 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4556 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4557 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4558 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4559 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4560 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4561 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4562 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4563 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4564 
4565 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4566 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4567 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4568 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4569 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4570 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4571 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4572 
4573 /*
4574  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4575  */
4576 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4577 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4578 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4579 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4580 
4581 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4582 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4583 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4584 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4585 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4586 
4587 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4588 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4589 {
4590     return float32_to_float16(a, true, s);
4591 }
4592 
4593 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4594 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4595 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4596 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4597 
4598 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4599 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4600 
4601 /*
4602  * Vector Reduction Operations
4603  */
4604 /* Vector Single-Width Integer Reduction Instructions */
4605 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4606 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4607                   void *vs2, CPURISCVState *env,          \
4608                   uint32_t desc)                          \
4609 {                                                         \
4610     uint32_t vm = vext_vm(desc);                          \
4611     uint32_t vl = env->vl;                                \
4612     uint32_t esz = sizeof(TD);                            \
4613     uint32_t vlenb = simd_maxsz(desc);                    \
4614     uint32_t vta = vext_vta(desc);                        \
4615     uint32_t i;                                           \
4616     TD s1 =  *((TD *)vs1 + HD(0));                        \
4617                                                           \
4618     for (i = env->vstart; i < vl; i++) {                  \
4619         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4620         if (!vm && !vext_elem_mask(v0, i)) {              \
4621             continue;                                     \
4622         }                                                 \
4623         s1 = OP(s1, (TD)s2);                              \
4624     }                                                     \
4625     *((TD *)vd + HD(0)) = s1;                             \
4626     env->vstart = 0;                                      \
4627     /* set tail elements to 1s */                         \
4628     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4629 }
4630 
4631 /* vd[0] = sum(vs1[0], vs2[*]) */
4632 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4633 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4634 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4635 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4636 
4637 /* vd[0] = maxu(vs1[0], vs2[*]) */
4638 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4639 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4640 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4641 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4642 
4643 /* vd[0] = max(vs1[0], vs2[*]) */
4644 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4645 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4646 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4647 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4648 
4649 /* vd[0] = minu(vs1[0], vs2[*]) */
4650 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4651 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4652 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4653 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4654 
4655 /* vd[0] = min(vs1[0], vs2[*]) */
4656 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4657 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4658 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4659 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4660 
4661 /* vd[0] = and(vs1[0], vs2[*]) */
4662 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4663 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4664 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4665 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4666 
4667 /* vd[0] = or(vs1[0], vs2[*]) */
4668 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4669 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4670 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4671 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4672 
4673 /* vd[0] = xor(vs1[0], vs2[*]) */
4674 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4675 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4676 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4677 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4678 
4679 /* Vector Widening Integer Reduction Instructions */
4680 /* signed sum reduction into double-width accumulator */
4681 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4682 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4683 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4684 
4685 /* Unsigned sum reduction into double-width accumulator */
4686 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4687 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4688 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4689 
4690 /* Vector Single-Width Floating-Point Reduction Instructions */
4691 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4692 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4693                   void *vs2, CPURISCVState *env,           \
4694                   uint32_t desc)                           \
4695 {                                                          \
4696     uint32_t vm = vext_vm(desc);                           \
4697     uint32_t vl = env->vl;                                 \
4698     uint32_t esz = sizeof(TD);                             \
4699     uint32_t vlenb = simd_maxsz(desc);                     \
4700     uint32_t vta = vext_vta(desc);                         \
4701     uint32_t i;                                            \
4702     TD s1 =  *((TD *)vs1 + HD(0));                         \
4703                                                            \
4704     for (i = env->vstart; i < vl; i++) {                   \
4705         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4706         if (!vm && !vext_elem_mask(v0, i)) {               \
4707             continue;                                      \
4708         }                                                  \
4709         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4710     }                                                      \
4711     *((TD *)vd + HD(0)) = s1;                              \
4712     env->vstart = 0;                                       \
4713     /* set tail elements to 1s */                          \
4714     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4715 }
4716 
4717 /* Unordered sum */
4718 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4719 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4720 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4721 
4722 /* Ordered sum */
4723 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4724 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4725 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4726 
4727 /* Maximum value */
4728 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4729               float16_maximum_number)
4730 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4731               float32_maximum_number)
4732 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4733               float64_maximum_number)
4734 
4735 /* Minimum value */
4736 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4737               float16_minimum_number)
4738 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4739               float32_minimum_number)
4740 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4741               float64_minimum_number)
4742 
4743 /* Vector Widening Floating-Point Add Instructions */
4744 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4745 {
4746     return float32_add(a, float16_to_float32(b, true, s), s);
4747 }
4748 
4749 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4750 {
4751     return float64_add(a, float32_to_float64(b, s), s);
4752 }
4753 
4754 /* Vector Widening Floating-Point Reduction Instructions */
4755 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4756 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4757 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4758 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4759 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4760 
4761 /*
4762  * Vector Mask Operations
4763  */
4764 /* Vector Mask-Register Logical Instructions */
4765 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4766 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4767                   void *vs2, CPURISCVState *env,          \
4768                   uint32_t desc)                          \
4769 {                                                         \
4770     uint32_t vl = env->vl;                                \
4771     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4772     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4773     uint32_t i;                                           \
4774     int a, b;                                             \
4775                                                           \
4776     for (i = env->vstart; i < vl; i++) {                  \
4777         a = vext_elem_mask(vs1, i);                       \
4778         b = vext_elem_mask(vs2, i);                       \
4779         vext_set_elem_mask(vd, i, OP(b, a));              \
4780     }                                                     \
4781     env->vstart = 0;                                      \
4782     /*
4783      * mask destination register are always tail-agnostic
4784      * set tail elements to 1s
4785      */                                                   \
4786     if (vta_all_1s) {                                     \
4787         for (; i < total_elems; i++) {                    \
4788             vext_set_elem_mask(vd, i, 1);                 \
4789         }                                                 \
4790     }                                                     \
4791 }
4792 
4793 #define DO_NAND(N, M)  (!(N & M))
4794 #define DO_ANDNOT(N, M)  (N & !M)
4795 #define DO_NOR(N, M)  (!(N | M))
4796 #define DO_ORNOT(N, M)  (N | !M)
4797 #define DO_XNOR(N, M)  (!(N ^ M))
4798 
4799 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4800 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4801 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4802 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4803 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4804 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4805 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4806 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4807 
4808 /* Vector count population in mask vcpop */
4809 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4810                              uint32_t desc)
4811 {
4812     target_ulong cnt = 0;
4813     uint32_t vm = vext_vm(desc);
4814     uint32_t vl = env->vl;
4815     int i;
4816 
4817     for (i = env->vstart; i < vl; i++) {
4818         if (vm || vext_elem_mask(v0, i)) {
4819             if (vext_elem_mask(vs2, i)) {
4820                 cnt++;
4821             }
4822         }
4823     }
4824     env->vstart = 0;
4825     return cnt;
4826 }
4827 
4828 /* vfirst find-first-set mask bit */
4829 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4830                               uint32_t desc)
4831 {
4832     uint32_t vm = vext_vm(desc);
4833     uint32_t vl = env->vl;
4834     int i;
4835 
4836     for (i = env->vstart; i < vl; i++) {
4837         if (vm || vext_elem_mask(v0, i)) {
4838             if (vext_elem_mask(vs2, i)) {
4839                 return i;
4840             }
4841         }
4842     }
4843     env->vstart = 0;
4844     return -1LL;
4845 }
4846 
4847 enum set_mask_type {
4848     ONLY_FIRST = 1,
4849     INCLUDE_FIRST,
4850     BEFORE_FIRST,
4851 };
4852 
4853 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4854                    uint32_t desc, enum set_mask_type type)
4855 {
4856     uint32_t vm = vext_vm(desc);
4857     uint32_t vl = env->vl;
4858     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4859     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4860     uint32_t vma = vext_vma(desc);
4861     int i;
4862     bool first_mask_bit = false;
4863 
4864     for (i = env->vstart; i < vl; i++) {
4865         if (!vm && !vext_elem_mask(v0, i)) {
4866             /* set masked-off elements to 1s */
4867             if (vma) {
4868                 vext_set_elem_mask(vd, i, 1);
4869             }
4870             continue;
4871         }
4872         /* write a zero to all following active elements */
4873         if (first_mask_bit) {
4874             vext_set_elem_mask(vd, i, 0);
4875             continue;
4876         }
4877         if (vext_elem_mask(vs2, i)) {
4878             first_mask_bit = true;
4879             if (type == BEFORE_FIRST) {
4880                 vext_set_elem_mask(vd, i, 0);
4881             } else {
4882                 vext_set_elem_mask(vd, i, 1);
4883             }
4884         } else {
4885             if (type == ONLY_FIRST) {
4886                 vext_set_elem_mask(vd, i, 0);
4887             } else {
4888                 vext_set_elem_mask(vd, i, 1);
4889             }
4890         }
4891     }
4892     env->vstart = 0;
4893     /*
4894      * mask destination register are always tail-agnostic
4895      * set tail elements to 1s
4896      */
4897     if (vta_all_1s) {
4898         for (; i < total_elems; i++) {
4899             vext_set_elem_mask(vd, i, 1);
4900         }
4901     }
4902 }
4903 
4904 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4905                      uint32_t desc)
4906 {
4907     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4908 }
4909 
4910 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4911                      uint32_t desc)
4912 {
4913     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4914 }
4915 
4916 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4917                      uint32_t desc)
4918 {
4919     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4920 }
4921 
4922 /* Vector Iota Instruction */
4923 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4924 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4925                   uint32_t desc)                                          \
4926 {                                                                         \
4927     uint32_t vm = vext_vm(desc);                                          \
4928     uint32_t vl = env->vl;                                                \
4929     uint32_t esz = sizeof(ETYPE);                                         \
4930     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4931     uint32_t vta = vext_vta(desc);                                        \
4932     uint32_t vma = vext_vma(desc);                                        \
4933     uint32_t sum = 0;                                                     \
4934     int i;                                                                \
4935                                                                           \
4936     for (i = env->vstart; i < vl; i++) {                                  \
4937         if (!vm && !vext_elem_mask(v0, i)) {                              \
4938             /* set masked-off elements to 1s */                           \
4939             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4940             continue;                                                     \
4941         }                                                                 \
4942         *((ETYPE *)vd + H(i)) = sum;                                      \
4943         if (vext_elem_mask(vs2, i)) {                                     \
4944             sum++;                                                        \
4945         }                                                                 \
4946     }                                                                     \
4947     env->vstart = 0;                                                      \
4948     /* set tail elements to 1s */                                         \
4949     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4950 }
4951 
4952 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4953 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4954 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4955 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4956 
4957 /* Vector Element Index Instruction */
4958 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4959 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4960 {                                                                         \
4961     uint32_t vm = vext_vm(desc);                                          \
4962     uint32_t vl = env->vl;                                                \
4963     uint32_t esz = sizeof(ETYPE);                                         \
4964     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4965     uint32_t vta = vext_vta(desc);                                        \
4966     uint32_t vma = vext_vma(desc);                                        \
4967     int i;                                                                \
4968                                                                           \
4969     for (i = env->vstart; i < vl; i++) {                                  \
4970         if (!vm && !vext_elem_mask(v0, i)) {                              \
4971             /* set masked-off elements to 1s */                           \
4972             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4973             continue;                                                     \
4974         }                                                                 \
4975         *((ETYPE *)vd + H(i)) = i;                                        \
4976     }                                                                     \
4977     env->vstart = 0;                                                      \
4978     /* set tail elements to 1s */                                         \
4979     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4980 }
4981 
4982 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4983 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4984 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4985 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4986 
4987 /*
4988  * Vector Permutation Instructions
4989  */
4990 
4991 /* Vector Slide Instructions */
4992 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4993 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4994                   CPURISCVState *env, uint32_t desc)                      \
4995 {                                                                         \
4996     uint32_t vm = vext_vm(desc);                                          \
4997     uint32_t vl = env->vl;                                                \
4998     uint32_t esz = sizeof(ETYPE);                                         \
4999     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5000     uint32_t vta = vext_vta(desc);                                        \
5001     uint32_t vma = vext_vma(desc);                                        \
5002     target_ulong offset = s1, i_min, i;                                   \
5003                                                                           \
5004     i_min = MAX(env->vstart, offset);                                     \
5005     for (i = i_min; i < vl; i++) {                                        \
5006         if (!vm && !vext_elem_mask(v0, i)) {                              \
5007             /* set masked-off elements to 1s */                           \
5008             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5009             continue;                                                     \
5010         }                                                                 \
5011         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5012     }                                                                     \
5013     /* set tail elements to 1s */                                         \
5014     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5015 }
5016 
5017 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5018 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5019 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5020 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5021 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5022 
5023 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5024 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5025                   CPURISCVState *env, uint32_t desc)                      \
5026 {                                                                         \
5027     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5028     uint32_t vm = vext_vm(desc);                                          \
5029     uint32_t vl = env->vl;                                                \
5030     uint32_t esz = sizeof(ETYPE);                                         \
5031     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5032     uint32_t vta = vext_vta(desc);                                        \
5033     uint32_t vma = vext_vma(desc);                                        \
5034     target_ulong i_max, i;                                                \
5035                                                                           \
5036     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5037     for (i = env->vstart; i < i_max; ++i) {                               \
5038         if (!vm && !vext_elem_mask(v0, i)) {                              \
5039             /* set masked-off elements to 1s */                           \
5040             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5041             continue;                                                     \
5042         }                                                                 \
5043         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5044     }                                                                     \
5045                                                                           \
5046     for (i = i_max; i < vl; ++i) {                                        \
5047         if (vm || vext_elem_mask(v0, i)) {                                \
5048             *((ETYPE *)vd + H(i)) = 0;                                    \
5049         }                                                                 \
5050     }                                                                     \
5051                                                                           \
5052     env->vstart = 0;                                                      \
5053     /* set tail elements to 1s */                                         \
5054     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5055 }
5056 
5057 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5058 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5059 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5060 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5061 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5062 
5063 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5064 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5065                                  void *vs2, CPURISCVState *env,             \
5066                                  uint32_t desc)                             \
5067 {                                                                           \
5068     typedef uint##BITWIDTH##_t ETYPE;                                       \
5069     uint32_t vm = vext_vm(desc);                                            \
5070     uint32_t vl = env->vl;                                                  \
5071     uint32_t esz = sizeof(ETYPE);                                           \
5072     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5073     uint32_t vta = vext_vta(desc);                                          \
5074     uint32_t vma = vext_vma(desc);                                          \
5075     uint32_t i;                                                             \
5076                                                                             \
5077     for (i = env->vstart; i < vl; i++) {                                    \
5078         if (!vm && !vext_elem_mask(v0, i)) {                                \
5079             /* set masked-off elements to 1s */                             \
5080             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5081             continue;                                                       \
5082         }                                                                   \
5083         if (i == 0) {                                                       \
5084             *((ETYPE *)vd + H(i)) = s1;                                     \
5085         } else {                                                            \
5086             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5087         }                                                                   \
5088     }                                                                       \
5089     env->vstart = 0;                                                        \
5090     /* set tail elements to 1s */                                           \
5091     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5092 }
5093 
5094 GEN_VEXT_VSLIE1UP(8,  H1)
5095 GEN_VEXT_VSLIE1UP(16, H2)
5096 GEN_VEXT_VSLIE1UP(32, H4)
5097 GEN_VEXT_VSLIE1UP(64, H8)
5098 
5099 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5100 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5101                   CPURISCVState *env, uint32_t desc)              \
5102 {                                                                 \
5103     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5104 }
5105 
5106 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5107 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5108 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5109 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5110 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5111 
5112 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5113 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5114                                    void *vs2, CPURISCVState *env,             \
5115                                    uint32_t desc)                             \
5116 {                                                                             \
5117     typedef uint##BITWIDTH##_t ETYPE;                                         \
5118     uint32_t vm = vext_vm(desc);                                              \
5119     uint32_t vl = env->vl;                                                    \
5120     uint32_t esz = sizeof(ETYPE);                                             \
5121     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5122     uint32_t vta = vext_vta(desc);                                            \
5123     uint32_t vma = vext_vma(desc);                                            \
5124     uint32_t i;                                                               \
5125                                                                               \
5126     for (i = env->vstart; i < vl; i++) {                                      \
5127         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5128             /* set masked-off elements to 1s */                               \
5129             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5130             continue;                                                         \
5131         }                                                                     \
5132         if (i == vl - 1) {                                                    \
5133             *((ETYPE *)vd + H(i)) = s1;                                       \
5134         } else {                                                              \
5135             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5136         }                                                                     \
5137     }                                                                         \
5138     env->vstart = 0;                                                          \
5139     /* set tail elements to 1s */                                             \
5140     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5141 }
5142 
5143 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5144 GEN_VEXT_VSLIDE1DOWN(16, H2)
5145 GEN_VEXT_VSLIDE1DOWN(32, H4)
5146 GEN_VEXT_VSLIDE1DOWN(64, H8)
5147 
5148 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5149 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5150                   CPURISCVState *env, uint32_t desc)              \
5151 {                                                                 \
5152     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5153 }
5154 
5155 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5156 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5157 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5158 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5159 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5160 
5161 /* Vector Floating-Point Slide Instructions */
5162 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5163 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5164                   CPURISCVState *env, uint32_t desc)          \
5165 {                                                             \
5166     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5167 }
5168 
5169 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5170 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5171 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5172 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5173 
5174 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5175 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5176                   CPURISCVState *env, uint32_t desc)          \
5177 {                                                             \
5178     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5179 }
5180 
5181 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5182 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5183 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5184 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5185 
5186 /* Vector Register Gather Instruction */
5187 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5188 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5189                   CPURISCVState *env, uint32_t desc)                      \
5190 {                                                                         \
5191     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5192     uint32_t vm = vext_vm(desc);                                          \
5193     uint32_t vl = env->vl;                                                \
5194     uint32_t esz = sizeof(TS2);                                           \
5195     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5196     uint32_t vta = vext_vta(desc);                                        \
5197     uint32_t vma = vext_vma(desc);                                        \
5198     uint64_t index;                                                       \
5199     uint32_t i;                                                           \
5200                                                                           \
5201     for (i = env->vstart; i < vl; i++) {                                  \
5202         if (!vm && !vext_elem_mask(v0, i)) {                              \
5203             /* set masked-off elements to 1s */                           \
5204             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5205             continue;                                                     \
5206         }                                                                 \
5207         index = *((TS1 *)vs1 + HS1(i));                                   \
5208         if (index >= vlmax) {                                             \
5209             *((TS2 *)vd + HS2(i)) = 0;                                    \
5210         } else {                                                          \
5211             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5212         }                                                                 \
5213     }                                                                     \
5214     env->vstart = 0;                                                      \
5215     /* set tail elements to 1s */                                         \
5216     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5217 }
5218 
5219 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5220 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5221 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5222 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5223 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5224 
5225 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5226 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5227 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5228 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5229 
5230 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5231 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5232                   CPURISCVState *env, uint32_t desc)                      \
5233 {                                                                         \
5234     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5235     uint32_t vm = vext_vm(desc);                                          \
5236     uint32_t vl = env->vl;                                                \
5237     uint32_t esz = sizeof(ETYPE);                                         \
5238     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5239     uint32_t vta = vext_vta(desc);                                        \
5240     uint32_t vma = vext_vma(desc);                                        \
5241     uint64_t index = s1;                                                  \
5242     uint32_t i;                                                           \
5243                                                                           \
5244     for (i = env->vstart; i < vl; i++) {                                  \
5245         if (!vm && !vext_elem_mask(v0, i)) {                              \
5246             /* set masked-off elements to 1s */                           \
5247             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5248             continue;                                                     \
5249         }                                                                 \
5250         if (index >= vlmax) {                                             \
5251             *((ETYPE *)vd + H(i)) = 0;                                    \
5252         } else {                                                          \
5253             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5254         }                                                                 \
5255     }                                                                     \
5256     env->vstart = 0;                                                      \
5257     /* set tail elements to 1s */                                         \
5258     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5259 }
5260 
5261 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5262 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5263 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5264 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5265 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5266 
5267 /* Vector Compress Instruction */
5268 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5269 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5270                   CPURISCVState *env, uint32_t desc)                      \
5271 {                                                                         \
5272     uint32_t vl = env->vl;                                                \
5273     uint32_t esz = sizeof(ETYPE);                                         \
5274     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5275     uint32_t vta = vext_vta(desc);                                        \
5276     uint32_t num = 0, i;                                                  \
5277                                                                           \
5278     for (i = env->vstart; i < vl; i++) {                                  \
5279         if (!vext_elem_mask(vs1, i)) {                                    \
5280             continue;                                                     \
5281         }                                                                 \
5282         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5283         num++;                                                            \
5284     }                                                                     \
5285     env->vstart = 0;                                                      \
5286     /* set tail elements to 1s */                                         \
5287     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5288 }
5289 
5290 /* Compress into vd elements of vs2 where vs1 is enabled */
5291 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5292 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5293 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5294 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5295 
5296 /* Vector Whole Register Move */
5297 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5298 {
5299     /* EEW = SEW */
5300     uint32_t maxsz = simd_maxsz(desc);
5301     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5302     uint32_t startb = env->vstart * sewb;
5303     uint32_t i = startb;
5304 
5305     memcpy((uint8_t *)vd + H1(i),
5306            (uint8_t *)vs2 + H1(i),
5307            maxsz - startb);
5308 
5309     env->vstart = 0;
5310 }
5311 
5312 /* Vector Integer Extension */
5313 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5314 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5315                   CPURISCVState *env, uint32_t desc)             \
5316 {                                                                \
5317     uint32_t vl = env->vl;                                       \
5318     uint32_t vm = vext_vm(desc);                                 \
5319     uint32_t esz = sizeof(ETYPE);                                \
5320     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5321     uint32_t vta = vext_vta(desc);                               \
5322     uint32_t vma = vext_vma(desc);                               \
5323     uint32_t i;                                                  \
5324                                                                  \
5325     for (i = env->vstart; i < vl; i++) {                         \
5326         if (!vm && !vext_elem_mask(v0, i)) {                     \
5327             /* set masked-off elements to 1s */                  \
5328             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5329             continue;                                            \
5330         }                                                        \
5331         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5332     }                                                            \
5333     env->vstart = 0;                                             \
5334     /* set tail elements to 1s */                                \
5335     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5336 }
5337 
5338 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5339 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5340 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5341 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5342 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5343 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5344 
5345 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5346 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5347 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5348 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5349 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5350 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5351