xref: /openbmc/qemu/target/riscv/vector_helper.c (revision d3860a57)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
77 
/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup.
 *
 * The H<n>() macros translate an index of <n>-byte elements into the index
 * to use on the host array.  On a big-endian host the position inside each
 * 64-bit chunk is mirrored with an XOR (8 bytes, 4 halfwords or 2 words per
 * chunk); on a little-endian host the index is used unchanged.
 *
 * NOTE(review): H1_2 and H1_4 are not used in the visible part of this
 * file; presumably they fix up byte offsets of 2- and 4-byte accesses --
 * confirm against their users.
 */
#if HOST_BIG_ENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#define H8(x)   ((x))
#else
/* little-endian host: element order already matches, no fixup needed */
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#define H8(x)   (x)
#endif
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as following:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
140 /*
141  * Get the maximum number of elements can be operated.
142  *
143  * log2_esz: log2 of element size in bytes.
144  */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147     /*
148      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
149      * so vlen in bytes (vlenb) is encoded as maxsz.
150      */
151     uint32_t vlenb = simd_maxsz(desc);
152 
153     /* Return VLMAX */
154     int scale = vext_lmul(desc) - log2_esz;
155     return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
157 
158 /*
159  * Get number of total elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks watchpoint before real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
182  * In user mode, there is no watchpoint support now.
183  *
184  * It will trigger an exception if there is no mapping in TLB
185  * and page table walk can't fill the TLB entry. Then the guest
186  * software can return here after process the exception or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
204 
/*
 * Set agnostic elements to 1s.
 *
 * Fills the byte range [cnt, tot) of the buffer at base with 0xff.
 *
 * base:        start of the destination buffer
 * is_agnostic: 0 selects the undisturbed policy and leaves the buffer alone
 * cnt:         byte offset of the first byte to fill
 * tot:         byte offset one past the last byte to fill
 *
 * Nothing is written when the range is empty or inverted (cnt >= tot);
 * without that check the unsigned length tot - cnt would wrap around and
 * memset would run far past the end of the buffer.
 */
static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
                              uint32_t tot)
{
    if (is_agnostic == 0) {
        /* policy undisturbed */
        return;
    }
    if (cnt >= tot) {
        return;
    }
    memset((uint8_t *)base + cnt, -1, tot - cnt);
}
218 
219 static inline void vext_set_elem_mask(void *v0, int index,
220                                       uint8_t value)
221 {
222     int idx = index / 64;
223     int pos = index % 64;
224     uint64_t old = ((uint64_t *)v0)[idx];
225     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227 
/*
 * Return mask bit number index (0 or 1) of the mask register v0, which is
 * accessed as an array of 64-bit words.
 *
 * Earlier designs (pre-0.9) had a varying number of bits
 * per mask value (MLEN). In the 0.9 design, MLEN=1.
 * (Section 4.5)
 */
static inline int vext_elem_mask(void *v0, int index)
{
    const uint64_t *words = v0;
    uint64_t word = words[index / 64];

    return (word >> (index % 64)) & 1;
}
239 
/*
 * elements operations for load and store
 *
 * Callback that transfers a single element between guest memory at addr
 * and element number idx of the register data at vd.  retaddr is handed to
 * the cpu_*_data_ra accessors by the implementations generated below.
 *
 * NOTE(review): the generated implementations declare addr as abi_ptr
 * while this typedef uses target_ulong -- confirm the two are the same
 * type on every supported configuration.
 */
typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
                               uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }                                                          \
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 static void vext_set_tail_elems_1s(CPURISCVState *env, target_ulong vl,
271                                    void *vd, uint32_t desc, uint32_t nf,
272                                    uint32_t esz, uint32_t max_elems)
273 {
274     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
275     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
276     uint32_t vta = vext_vta(desc);
277     uint32_t registers_used;
278     int k;
279 
280     for (k = 0; k < nf; ++k) {
281         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
282                           (k * max_elems + max_elems) * esz);
283     }
284 
285     if (nf * max_elems % total_elems != 0) {
286         registers_used = ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
287         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
288                           registers_used * vlenb);
289     }
290 }
291 
292 /*
293  *** stride: access vector element from strided memory
294  */
295 static void
296 vext_ldst_stride(void *vd, void *v0, target_ulong base,
297                  target_ulong stride, CPURISCVState *env,
298                  uint32_t desc, uint32_t vm,
299                  vext_ldst_elem_fn *ldst_elem,
300                  uint32_t log2_esz, uintptr_t ra)
301 {
302     uint32_t i, k;
303     uint32_t nf = vext_nf(desc);
304     uint32_t max_elems = vext_max_elems(desc, log2_esz);
305     uint32_t esz = 1 << log2_esz;
306     uint32_t vma = vext_vma(desc);
307 
308     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
309         k = 0;
310         while (k < nf) {
311             if (!vm && !vext_elem_mask(v0, i)) {
312                 /* set masked-off elements to 1s */
313                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
314                                   (i + k * max_elems + 1) * esz);
315                 k++;
316                 continue;
317             }
318             target_ulong addr = base + stride * i + (k << log2_esz);
319             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
320             k++;
321         }
322     }
323     env->vstart = 0;
324 
325     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
326 }
327 
328 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
329 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
330                   target_ulong stride, CPURISCVState *env,              \
331                   uint32_t desc)                                        \
332 {                                                                       \
333     uint32_t vm = vext_vm(desc);                                        \
334     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
335                      ctzl(sizeof(ETYPE)), GETPC());                     \
336 }
337 
338 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
339 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
340 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
341 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
342 
343 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
344 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
345                   target_ulong stride, CPURISCVState *env,              \
346                   uint32_t desc)                                        \
347 {                                                                       \
348     uint32_t vm = vext_vm(desc);                                        \
349     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
350                      ctzl(sizeof(ETYPE)), GETPC());                     \
351 }
352 
353 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
354 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
355 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
356 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
357 
358 /*
359  *** unit-stride: access elements stored contiguously in memory
360  */
361 
362 /* unmasked unit-stride load and store operation*/
363 static void
364 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
365              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
366              uintptr_t ra)
367 {
368     uint32_t i, k;
369     uint32_t nf = vext_nf(desc);
370     uint32_t max_elems = vext_max_elems(desc, log2_esz);
371     uint32_t esz = 1 << log2_esz;
372 
373     /* load bytes from guest memory */
374     for (i = env->vstart; i < evl; i++, env->vstart++) {
375         k = 0;
376         while (k < nf) {
377             target_ulong addr = base + ((i * nf + k) << log2_esz);
378             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
379             k++;
380         }
381     }
382     env->vstart = 0;
383 
384     vext_set_tail_elems_1s(env, evl, vd, desc, nf, esz, max_elems);
385 }
386 
387 /*
388  * masked unit-stride load and store operation will be a special case of stride,
389  * stride = NF * sizeof (MTYPE)
390  */
391 
392 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
393 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
394                          CPURISCVState *env, uint32_t desc)             \
395 {                                                                       \
396     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
397     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
398                      ctzl(sizeof(ETYPE)), GETPC());                     \
399 }                                                                       \
400                                                                         \
401 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
402                   CPURISCVState *env, uint32_t desc)                    \
403 {                                                                       \
404     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
405                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
406 }
407 
408 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
409 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
410 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
411 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
412 
413 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
414 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
415                          CPURISCVState *env, uint32_t desc)              \
416 {                                                                        \
417     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
418     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
419                      ctzl(sizeof(ETYPE)), GETPC());                      \
420 }                                                                        \
421                                                                          \
422 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
423                   CPURISCVState *env, uint32_t desc)                     \
424 {                                                                        \
425     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
426                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
427 }
428 
429 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
430 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
431 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
432 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
433 
434 /*
435  *** unit stride mask load and store, EEW = 1
436  */
437 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
438                     CPURISCVState *env, uint32_t desc)
439 {
440     /* evl = ceil(vl/8) */
441     uint8_t evl = (env->vl + 7) >> 3;
442     vext_ldst_us(vd, base, env, desc, lde_b,
443                  0, evl, GETPC());
444 }
445 
446 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
447                     CPURISCVState *env, uint32_t desc)
448 {
449     /* evl = ceil(vl/8) */
450     uint8_t evl = (env->vl + 7) >> 3;
451     vext_ldst_us(vd, base, env, desc, ste_b,
452                  0, evl, GETPC());
453 }
454 
/*
 *** index: access vector element from indexed memory
 */
/*
 * Callback type used by the indexed loads and stores: returns the guest
 * address for element idx, i.e. base plus the idx-th offset read from the
 * index register data at vs2 (see the generated idx_* functions).
 */
typedef target_ulong vext_get_index_addr(target_ulong base,
        uint32_t idx, void *vs2);
460 
461 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
462 static target_ulong NAME(target_ulong base,            \
463                          uint32_t idx, void *vs2)      \
464 {                                                      \
465     return (base + *((ETYPE *)vs2 + H(idx)));          \
466 }
467 
468 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
469 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
470 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
471 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
472 
473 static inline void
474 vext_ldst_index(void *vd, void *v0, target_ulong base,
475                 void *vs2, CPURISCVState *env, uint32_t desc,
476                 vext_get_index_addr get_index_addr,
477                 vext_ldst_elem_fn *ldst_elem,
478                 uint32_t log2_esz, uintptr_t ra)
479 {
480     uint32_t i, k;
481     uint32_t nf = vext_nf(desc);
482     uint32_t vm = vext_vm(desc);
483     uint32_t max_elems = vext_max_elems(desc, log2_esz);
484     uint32_t esz = 1 << log2_esz;
485     uint32_t vma = vext_vma(desc);
486 
487     /* load bytes from guest memory */
488     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
489         k = 0;
490         while (k < nf) {
491             if (!vm && !vext_elem_mask(v0, i)) {
492                 /* set masked-off elements to 1s */
493                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
494                                   (i + k * max_elems + 1) * esz);
495                 k++;
496                 continue;
497             }
498             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
499             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
500             k++;
501         }
502     }
503     env->vstart = 0;
504 
505     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
506 }
507 
508 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
509 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
510                   void *vs2, CPURISCVState *env, uint32_t desc)            \
511 {                                                                          \
512     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
513                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
514 }
515 
516 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
517 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
518 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
519 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
520 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
521 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
522 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
523 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
524 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
525 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
526 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
527 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
528 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
529 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
530 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
531 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
532 
533 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
534 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
535                   void *vs2, CPURISCVState *env, uint32_t desc)  \
536 {                                                                \
537     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
538                     STORE_FN, ctzl(sizeof(ETYPE)),               \
539                     GETPC());                                    \
540 }
541 
542 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
543 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
544 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
545 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
546 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
547 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
548 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
549 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
550 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
551 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
552 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
553 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
554 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
555 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
556 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
557 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
558 
559 /*
560  *** unit-stride fault-only-fisrt load instructions
561  */
562 static inline void
563 vext_ldff(void *vd, void *v0, target_ulong base,
564           CPURISCVState *env, uint32_t desc,
565           vext_ldst_elem_fn *ldst_elem,
566           uint32_t log2_esz, uintptr_t ra)
567 {
568     void *host;
569     uint32_t i, k, vl = 0;
570     uint32_t nf = vext_nf(desc);
571     uint32_t vm = vext_vm(desc);
572     uint32_t max_elems = vext_max_elems(desc, log2_esz);
573     uint32_t esz = 1 << log2_esz;
574     uint32_t vma = vext_vma(desc);
575     target_ulong addr, offset, remain;
576 
577     /* probe every access*/
578     for (i = env->vstart; i < env->vl; i++) {
579         if (!vm && !vext_elem_mask(v0, i)) {
580             continue;
581         }
582         addr = adjust_addr(env, base + i * (nf << log2_esz));
583         if (i == 0) {
584             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
585         } else {
586             /* if it triggers an exception, no need to check watchpoint */
587             remain = nf << log2_esz;
588             while (remain > 0) {
589                 offset = -(addr | TARGET_PAGE_MASK);
590                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
591                                          cpu_mmu_index(env, false));
592                 if (host) {
593 #ifdef CONFIG_USER_ONLY
594                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
595                         vl = i;
596                         goto ProbeSuccess;
597                     }
598 #else
599                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
600 #endif
601                 } else {
602                     vl = i;
603                     goto ProbeSuccess;
604                 }
605                 if (remain <=  offset) {
606                     break;
607                 }
608                 remain -= offset;
609                 addr = adjust_addr(env, addr + offset);
610             }
611         }
612     }
613 ProbeSuccess:
614     /* load bytes from guest memory */
615     if (vl != 0) {
616         env->vl = vl;
617     }
618     for (i = env->vstart; i < env->vl; i++) {
619         k = 0;
620         while (k < nf) {
621             if (!vm && !vext_elem_mask(v0, i)) {
622                 /* set masked-off elements to 1s */
623                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
624                                   (i + k * max_elems + 1) * esz);
625                 k++;
626                 continue;
627             }
628             target_ulong addr = base + ((i * nf + k) << log2_esz);
629             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
630             k++;
631         }
632     }
633     env->vstart = 0;
634 
635     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
636 }
637 
638 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
639 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
640                   CPURISCVState *env, uint32_t desc)      \
641 {                                                         \
642     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
643               ctzl(sizeof(ETYPE)), GETPC());              \
644 }
645 
646 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
647 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
648 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
649 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
650 
/*
 * Element-wise operators used by the generated helpers.  N and M are the
 * two operands.  Every argument is parenthesized so that compound
 * expressions passed as arguments keep their own precedence.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))

/* Unsigned min/max; UMTYPE must be defined by the user of these macros */
#define DO_MAXU(N, M) DO_MAX((UMTYPE)(N), (UMTYPE)(M))
#define DO_MINU(N, M) DO_MIN((UMTYPE)(N), (UMTYPE)(M))
664 
665 /*
666  *** load and store whole register instructions
667  */
668 static void
669 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
670                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
671 {
672     uint32_t i, k, off, pos;
673     uint32_t nf = vext_nf(desc);
674     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
675     uint32_t max_elems = vlenb >> log2_esz;
676 
677     k = env->vstart / max_elems;
678     off = env->vstart % max_elems;
679 
680     if (off) {
681         /* load/store rest of elements of current segment pointed by vstart */
682         for (pos = off; pos < max_elems; pos++, env->vstart++) {
683             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
684             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
685         }
686         k++;
687     }
688 
689     /* load/store elements for rest of segments */
690     for (; k < nf; k++) {
691         for (i = 0; i < max_elems; i++, env->vstart++) {
692             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
693             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
694         }
695     }
696 
697     env->vstart = 0;
698 }
699 
700 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
701 void HELPER(NAME)(void *vd, target_ulong base,       \
702                   CPURISCVState *env, uint32_t desc) \
703 {                                                    \
704     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
705                     ctzl(sizeof(ETYPE)), GETPC());   \
706 }
707 
708 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
709 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
710 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
711 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
712 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
713 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
714 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
715 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
716 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
717 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
718 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
719 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
720 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
721 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
722 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
723 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
724 
725 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
726 void HELPER(NAME)(void *vd, target_ulong base,       \
727                   CPURISCVState *env, uint32_t desc) \
728 {                                                    \
729     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
730                     ctzl(sizeof(ETYPE)), GETPC());   \
731 }
732 
733 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
734 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
735 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
736 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
737 
/*
 *** Vector Integer Arithmetic Instructions
 */

/*
 * expand macro args before macro
 *
 * Needed because the OP_* and WOP_* and NOP_* names below each stand for
 * five comma-separated types that must be spread over the parameters of
 * the macro being called.
 */
#define RVVCALL(macro, ...)  macro(__VA_ARGS__)

/*
 * (TD, T1, T2, TX1, TX2)
 *
 * As used by OPIVV2: TD is the destination element type, T1 and T2 are the
 * types the vs1 and vs2 elements are read as, and TX1 and TX2 are the types
 * those values are converted to before the operation.
 *
 * In the names, S is signed and U unsigned; the suffix B/H/W/D selects
 * 8/16/32/64-bit source elements.  The WOP_ tuples have a destination twice
 * as wide as the sources, and the NOP_ tuples have a T2 twice as wide as TD.
 */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
#define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
#define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
#define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
776 
/* operation on element i of two source vectors, result into vd */
typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);

/*
 * Generate do_NAME(): read element i of vs1 (as T1) and of vs2 (as T2),
 * convert them to TX1 and TX2, and store OP(s2, s1) into element i of vd
 * as TD.  HD, HS1 and HS2 are the host-endian index fixups for the
 * destination and the two sources.
 */
#define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
{                                                               \
    TX1 s1 = ((T1 *)vs1)[HS1(i)];                               \
    TX2 s2 = ((T2 *)vs2)[HS2(i)];                               \
                                                                \
    ((TD *)vd)[HD(i)] = OP(s2, s1);                             \
}
/*
 * Subtraction and reverse subtraction.  The arguments are parenthesized so
 * that compound expressions passed as arguments keep their own precedence.
 */
#define DO_SUB(N, M) ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
789 
790 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
791 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
792 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
793 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
794 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
795 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
796 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
797 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
798 
799 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
800                        CPURISCVState *env, uint32_t desc,
801                        opivv2_fn *fn, uint32_t esz)
802 {
803     uint32_t vm = vext_vm(desc);
804     uint32_t vl = env->vl;
805     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
806     uint32_t vta = vext_vta(desc);
807     uint32_t vma = vext_vma(desc);
808     uint32_t i;
809 
810     for (i = env->vstart; i < vl; i++) {
811         if (!vm && !vext_elem_mask(v0, i)) {
812             /* set masked-off elements to 1s */
813             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
814             continue;
815         }
816         fn(vd, vs1, vs2, i);
817     }
818     env->vstart = 0;
819     /* set tail elements to 1s */
820     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
821 }
822 
823 /* generate the helpers for OPIVV */
824 #define GEN_VEXT_VV(NAME, ESZ)                            \
825 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
826                   void *vs2, CPURISCVState *env,          \
827                   uint32_t desc)                          \
828 {                                                         \
829     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
830                do_##NAME, ESZ);                           \
831 }
832 
833 GEN_VEXT_VV(vadd_vv_b, 1)
834 GEN_VEXT_VV(vadd_vv_h, 2)
835 GEN_VEXT_VV(vadd_vv_w, 4)
836 GEN_VEXT_VV(vadd_vv_d, 8)
837 GEN_VEXT_VV(vsub_vv_b, 1)
838 GEN_VEXT_VV(vsub_vv_h, 2)
839 GEN_VEXT_VV(vsub_vv_w, 4)
840 GEN_VEXT_VV(vsub_vv_d, 8)
841 
842 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
843 
844 /*
845  * (T1)s1 gives the real operator type.
846  * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
847  */
848 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
849 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
850 {                                                                   \
851     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
852     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
853 }
854 
855 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
856 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
857 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
858 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
859 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
860 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
861 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
862 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
863 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
864 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
865 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
866 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
867 
868 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
869                        CPURISCVState *env, uint32_t desc,
870                        opivx2_fn fn, uint32_t esz)
871 {
872     uint32_t vm = vext_vm(desc);
873     uint32_t vl = env->vl;
874     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
875     uint32_t vta = vext_vta(desc);
876     uint32_t vma = vext_vma(desc);
877     uint32_t i;
878 
879     for (i = env->vstart; i < vl; i++) {
880         if (!vm && !vext_elem_mask(v0, i)) {
881             /* set masked-off elements to 1s */
882             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
883             continue;
884         }
885         fn(vd, s1, vs2, i);
886     }
887     env->vstart = 0;
888     /* set tail elements to 1s */
889     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
890 }
891 
892 /* generate the helpers for OPIVX */
893 #define GEN_VEXT_VX(NAME, ESZ)                            \
894 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
895                   void *vs2, CPURISCVState *env,          \
896                   uint32_t desc)                          \
897 {                                                         \
898     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
899                do_##NAME, ESZ);                           \
900 }
901 
902 GEN_VEXT_VX(vadd_vx_b, 1)
903 GEN_VEXT_VX(vadd_vx_h, 2)
904 GEN_VEXT_VX(vadd_vx_w, 4)
905 GEN_VEXT_VX(vadd_vx_d, 8)
906 GEN_VEXT_VX(vsub_vx_b, 1)
907 GEN_VEXT_VX(vsub_vx_h, 2)
908 GEN_VEXT_VX(vsub_vx_w, 4)
909 GEN_VEXT_VX(vsub_vx_d, 8)
910 GEN_VEXT_VX(vrsub_vx_b, 1)
911 GEN_VEXT_VX(vrsub_vx_h, 2)
912 GEN_VEXT_VX(vrsub_vx_w, 4)
913 GEN_VEXT_VX(vrsub_vx_d, 8)
914 
915 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
916 {
917     intptr_t oprsz = simd_oprsz(desc);
918     intptr_t i;
919 
920     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
921         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
922     }
923 }
924 
925 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
926 {
927     intptr_t oprsz = simd_oprsz(desc);
928     intptr_t i;
929 
930     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
931         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
932     }
933 }
934 
935 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
936 {
937     intptr_t oprsz = simd_oprsz(desc);
938     intptr_t i;
939 
940     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
941         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
942     }
943 }
944 
945 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
946 {
947     intptr_t oprsz = simd_oprsz(desc);
948     intptr_t i;
949 
950     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
951         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
952     }
953 }
954 
955 /* Vector Widening Integer Add/Subtract */
956 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
957 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
958 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
959 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
960 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
961 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
962 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
963 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
964 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
965 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
966 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
967 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
968 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
969 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
970 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
971 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
972 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
973 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
974 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
975 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
976 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
977 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
978 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
979 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
980 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
981 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
982 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
983 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
984 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
985 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
986 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
987 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
988 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
989 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
990 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
991 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
992 GEN_VEXT_VV(vwaddu_vv_b, 2)
993 GEN_VEXT_VV(vwaddu_vv_h, 4)
994 GEN_VEXT_VV(vwaddu_vv_w, 8)
995 GEN_VEXT_VV(vwsubu_vv_b, 2)
996 GEN_VEXT_VV(vwsubu_vv_h, 4)
997 GEN_VEXT_VV(vwsubu_vv_w, 8)
998 GEN_VEXT_VV(vwadd_vv_b, 2)
999 GEN_VEXT_VV(vwadd_vv_h, 4)
1000 GEN_VEXT_VV(vwadd_vv_w, 8)
1001 GEN_VEXT_VV(vwsub_vv_b, 2)
1002 GEN_VEXT_VV(vwsub_vv_h, 4)
1003 GEN_VEXT_VV(vwsub_vv_w, 8)
1004 GEN_VEXT_VV(vwaddu_wv_b, 2)
1005 GEN_VEXT_VV(vwaddu_wv_h, 4)
1006 GEN_VEXT_VV(vwaddu_wv_w, 8)
1007 GEN_VEXT_VV(vwsubu_wv_b, 2)
1008 GEN_VEXT_VV(vwsubu_wv_h, 4)
1009 GEN_VEXT_VV(vwsubu_wv_w, 8)
1010 GEN_VEXT_VV(vwadd_wv_b, 2)
1011 GEN_VEXT_VV(vwadd_wv_h, 4)
1012 GEN_VEXT_VV(vwadd_wv_w, 8)
1013 GEN_VEXT_VV(vwsub_wv_b, 2)
1014 GEN_VEXT_VV(vwsub_wv_h, 4)
1015 GEN_VEXT_VV(vwsub_wv_w, 8)
1016 
1017 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1018 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1019 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1020 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1021 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1022 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1023 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1024 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1025 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1026 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1027 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1028 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1029 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1030 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1031 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1032 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1033 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1034 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1035 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1036 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1037 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1038 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1039 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1040 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1041 GEN_VEXT_VX(vwaddu_vx_b, 2)
1042 GEN_VEXT_VX(vwaddu_vx_h, 4)
1043 GEN_VEXT_VX(vwaddu_vx_w, 8)
1044 GEN_VEXT_VX(vwsubu_vx_b, 2)
1045 GEN_VEXT_VX(vwsubu_vx_h, 4)
1046 GEN_VEXT_VX(vwsubu_vx_w, 8)
1047 GEN_VEXT_VX(vwadd_vx_b, 2)
1048 GEN_VEXT_VX(vwadd_vx_h, 4)
1049 GEN_VEXT_VX(vwadd_vx_w, 8)
1050 GEN_VEXT_VX(vwsub_vx_b, 2)
1051 GEN_VEXT_VX(vwsub_vx_h, 4)
1052 GEN_VEXT_VX(vwsub_vx_w, 8)
1053 GEN_VEXT_VX(vwaddu_wx_b, 2)
1054 GEN_VEXT_VX(vwaddu_wx_h, 4)
1055 GEN_VEXT_VX(vwaddu_wx_w, 8)
1056 GEN_VEXT_VX(vwsubu_wx_b, 2)
1057 GEN_VEXT_VX(vwsubu_wx_h, 4)
1058 GEN_VEXT_VX(vwsubu_wx_w, 8)
1059 GEN_VEXT_VX(vwadd_wx_b, 2)
1060 GEN_VEXT_VX(vwadd_wx_h, 4)
1061 GEN_VEXT_VX(vwadd_wx_w, 8)
1062 GEN_VEXT_VX(vwsub_wx_b, 2)
1063 GEN_VEXT_VX(vwsub_wx_h, 4)
1064 GEN_VEXT_VX(vwsub_wx_w, 8)
1065 
/*
 * Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
 *
 * DO_VADC yields N + M + C, DO_VSBC yields N - M - C.  Arguments are
 * parenthesized so that expression arguments keep their intended grouping
 * (e.g. DO_VSBC(a, b - c, 0) subtracts the whole of b - c).
 */
#define DO_VADC(N, M, C) ((N) + (M) + (C))
#define DO_VSBC(N, M, C) ((N) - (M) - (C))
1069 
1070 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1071 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1072                   CPURISCVState *env, uint32_t desc)          \
1073 {                                                             \
1074     uint32_t vl = env->vl;                                    \
1075     uint32_t esz = sizeof(ETYPE);                             \
1076     uint32_t total_elems =                                    \
1077         vext_get_total_elems(env, desc, esz);                 \
1078     uint32_t vta = vext_vta(desc);                            \
1079     uint32_t i;                                               \
1080                                                               \
1081     for (i = env->vstart; i < vl; i++) {                      \
1082         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1083         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1084         ETYPE carry = vext_elem_mask(v0, i);                  \
1085                                                               \
1086         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1087     }                                                         \
1088     env->vstart = 0;                                          \
1089     /* set tail elements to 1s */                             \
1090     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1091 }
1092 
1093 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1094 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1095 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1096 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1097 
1098 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1099 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1100 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1101 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1102 
1103 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1104 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1105                   CPURISCVState *env, uint32_t desc)                     \
1106 {                                                                        \
1107     uint32_t vl = env->vl;                                               \
1108     uint32_t esz = sizeof(ETYPE);                                        \
1109     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1110     uint32_t vta = vext_vta(desc);                                       \
1111     uint32_t i;                                                          \
1112                                                                          \
1113     for (i = env->vstart; i < vl; i++) {                                 \
1114         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1115         ETYPE carry = vext_elem_mask(v0, i);                             \
1116                                                                          \
1117         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1118     }                                                                    \
1119     env->vstart = 0;                                          \
1120     /* set tail elements to 1s */                                        \
1121     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1122 }
1123 
1124 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1125 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1126 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1127 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1128 
1129 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1130 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1131 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1132 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1133 
/*
 * Carry-out / borrow-out predicates.
 *
 * DO_MADC(N, M, C): non-zero when N + M + C, truncated to the type of N,
 * wraps around: with carry-in the truncated sum is <= N, without it the
 * truncated sum is < N.
 * DO_MSBC(N, M, C): non-zero when N - M - C borrows: N <= M with borrow-in,
 * N < M without.
 *
 * Arguments are parenthesized so that expression arguments (for example a
 * masked operand "x & 0xff") are compared as a whole.
 */
#define DO_MADC(N, M, C) ((C) ? (__typeof(N))((N) + (M) + 1) <= (N) :   \
                                (__typeof(N))((N) + (M)) < (N))
#define DO_MSBC(N, M, C) ((C) ? (N) <= (M) : (N) < (M))
1137 
1138 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1139 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1140                   CPURISCVState *env, uint32_t desc)          \
1141 {                                                             \
1142     uint32_t vl = env->vl;                                    \
1143     uint32_t vm = vext_vm(desc);                              \
1144     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1145     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1146     uint32_t i;                                               \
1147                                                               \
1148     for (i = env->vstart; i < vl; i++) {                      \
1149         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1150         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1151         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1152         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1153     }                                                         \
1154     env->vstart = 0;                                          \
1155     /* mask destination register are always tail-agnostic */  \
1156     /* set tail elements to 1s */                             \
1157     if (vta_all_1s) {                                         \
1158         for (; i < total_elems; i++) {                        \
1159             vext_set_elem_mask(vd, i, 1);                     \
1160         }                                                     \
1161     }                                                         \
1162 }
1163 
1164 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1165 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1167 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1168 
1169 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1170 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1172 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1173 
1174 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1175 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1176                   void *vs2, CPURISCVState *env, uint32_t desc) \
1177 {                                                               \
1178     uint32_t vl = env->vl;                                      \
1179     uint32_t vm = vext_vm(desc);                                \
1180     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
1181     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1182     uint32_t i;                                                 \
1183                                                                 \
1184     for (i = env->vstart; i < vl; i++) {                        \
1185         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1186         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1187         vext_set_elem_mask(vd, i,                               \
1188                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1189     }                                                           \
1190     env->vstart = 0;                                            \
1191     /* mask destination register are always tail-agnostic */    \
1192     /* set tail elements to 1s */                               \
1193     if (vta_all_1s) {                                           \
1194         for (; i < total_elems; i++) {                          \
1195             vext_set_elem_mask(vd, i, 1);                       \
1196         }                                                       \
1197     }                                                           \
1198 }
1199 
1200 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1201 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1204 
1205 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1206 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1209 
1210 /* Vector Bitwise Logical Instructions */
1211 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1212 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1213 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1215 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1216 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1217 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1219 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1220 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1221 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1223 GEN_VEXT_VV(vand_vv_b, 1)
1224 GEN_VEXT_VV(vand_vv_h, 2)
1225 GEN_VEXT_VV(vand_vv_w, 4)
1226 GEN_VEXT_VV(vand_vv_d, 8)
1227 GEN_VEXT_VV(vor_vv_b, 1)
1228 GEN_VEXT_VV(vor_vv_h, 2)
1229 GEN_VEXT_VV(vor_vv_w, 4)
1230 GEN_VEXT_VV(vor_vv_d, 8)
1231 GEN_VEXT_VV(vxor_vv_b, 1)
1232 GEN_VEXT_VV(vxor_vv_h, 2)
1233 GEN_VEXT_VV(vxor_vv_w, 4)
1234 GEN_VEXT_VV(vxor_vv_d, 8)
1235 
1236 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1237 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1238 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1240 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1241 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1242 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1244 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1245 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1246 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1248 GEN_VEXT_VX(vand_vx_b, 1)
1249 GEN_VEXT_VX(vand_vx_h, 2)
1250 GEN_VEXT_VX(vand_vx_w, 4)
1251 GEN_VEXT_VX(vand_vx_d, 8)
1252 GEN_VEXT_VX(vor_vx_b, 1)
1253 GEN_VEXT_VX(vor_vx_h, 2)
1254 GEN_VEXT_VX(vor_vx_w, 4)
1255 GEN_VEXT_VX(vor_vx_d, 8)
1256 GEN_VEXT_VX(vxor_vx_b, 1)
1257 GEN_VEXT_VX(vxor_vx_h, 2)
1258 GEN_VEXT_VX(vxor_vx_w, 4)
1259 GEN_VEXT_VX(vxor_vx_d, 8)
1260 
/*
 * Vector Single-Width Bit Shift Instructions
 *
 * DO_SLL shifts N left by M, DO_SRL shifts N right by M.  Whether DO_SRL
 * shifts in zeros or copies of the sign bit follows from the type of N at
 * the point of use.  Both operands are parenthesized so that an expression
 * passed as N (e.g. "a | b") is shifted as a whole.
 */
#define DO_SLL(N, M)  ((N) << (M))
#define DO_SRL(N, M)  ((N) >> (M))
1264 
1265 /* generate the helpers for shift instructions with two vector operators */
1266 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1267 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1268                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1269 {                                                                         \
1270     uint32_t vm = vext_vm(desc);                                          \
1271     uint32_t vl = env->vl;                                                \
1272     uint32_t esz = sizeof(TS1);                                           \
1273     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1274     uint32_t vta = vext_vta(desc);                                        \
1275     uint32_t vma = vext_vma(desc);                                        \
1276     uint32_t i;                                                           \
1277                                                                           \
1278     for (i = env->vstart; i < vl; i++) {                                  \
1279         if (!vm && !vext_elem_mask(v0, i)) {                              \
1280             /* set masked-off elements to 1s */                           \
1281             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1282             continue;                                                     \
1283         }                                                                 \
1284         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1285         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1286         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1287     }                                                                     \
1288     env->vstart = 0;                                                      \
1289     /* set tail elements to 1s */                                         \
1290     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1291 }
1292 
1293 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1294 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1295 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1296 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1297 
1298 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1299 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1300 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1301 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1302 
1303 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1304 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1305 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1306 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1307 
1308 /* generate the helpers for shift instructions with one vector and one scalar */
1309 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1310 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1311         void *vs2, CPURISCVState *env, uint32_t desc)       \
1312 {                                                           \
1313     uint32_t vm = vext_vm(desc);                            \
1314     uint32_t vl = env->vl;                                  \
1315     uint32_t esz = sizeof(TD);                              \
1316     uint32_t total_elems =                                  \
1317         vext_get_total_elems(env, desc, esz);               \
1318     uint32_t vta = vext_vta(desc);                          \
1319     uint32_t vma = vext_vma(desc);                          \
1320     uint32_t i;                                             \
1321                                                             \
1322     for (i = env->vstart; i < vl; i++) {                    \
1323         if (!vm && !vext_elem_mask(v0, i)) {                \
1324             /* set masked-off elements to 1s */             \
1325             vext_set_elems_1s(vd, vma, i * esz,             \
1326                               (i + 1) * esz);               \
1327             continue;                                       \
1328         }                                                   \
1329         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1330         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1331     }                                                       \
1332     env->vstart = 0;                                        \
1333     /* set tail elements to 1s */                           \
1334     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1335 }
1336 
1337 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1338 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1339 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1340 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1341 
1342 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1343 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1346 
1347 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1348 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1349 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1350 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1351 
1352 /* Vector Narrowing Integer Right Shift Instructions */
1353 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1354 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1355 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1356 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1357 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1358 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1359 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1360 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1361 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1362 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1363 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1364 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1365 
/*
 * Vector Integer Comparison Instructions
 *
 * Element comparison predicates; each yields 1 when the relation holds and
 * 0 otherwise.  Signed versus unsigned comparison follows from the operand
 * types at the point of use.  Arguments are parenthesized because the
 * relational and equality operators bind tighter than &, ^ and |, so an
 * unparenthesized expression argument would be split apart.
 */
#define DO_MSEQ(N, M) ((N) == (M))
#define DO_MSNE(N, M) ((N) != (M))
#define DO_MSLT(N, M) ((N) < (M))
#define DO_MSLE(N, M) ((N) <= (M))
#define DO_MSGT(N, M) ((N) > (M))
1372 
1373 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1374 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1375                   CPURISCVState *env, uint32_t desc)          \
1376 {                                                             \
1377     uint32_t vm = vext_vm(desc);                              \
1378     uint32_t vl = env->vl;                                    \
1379     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1380     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1381     uint32_t vma = vext_vma(desc);                            \
1382     uint32_t i;                                               \
1383                                                               \
1384     for (i = env->vstart; i < vl; i++) {                      \
1385         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1386         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1387         if (!vm && !vext_elem_mask(v0, i)) {                  \
1388             /* set masked-off elements to 1s */               \
1389             if (vma) {                                        \
1390                 vext_set_elem_mask(vd, i, 1);                 \
1391             }                                                 \
1392             continue;                                         \
1393         }                                                     \
1394         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1395     }                                                         \
1396     env->vstart = 0;                                          \
1397     /* mask destination register are always tail-agnostic */  \
1398     /* set tail elements to 1s */                             \
1399     if (vta_all_1s) {                                         \
1400         for (; i < total_elems; i++) {                        \
1401             vext_set_elem_mask(vd, i, 1);                     \
1402         }                                                     \
1403     }                                                         \
1404 }
1405 
/*
 * Vector-vector integer compare helpers, one instantiation per element
 * width (8/16/32/64 bits).  The unsigned (vmsltu/vmsleu) and signed
 * (vmslt/vmsle) forms are built from the same DO_MSLT/DO_MSLE operator;
 * only the element type they are instantiated with differs.
 */
GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)

/* unsigned element types */
GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)

/* signed element types */
GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)

/* unsigned element types */
GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)

/* signed element types */
GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1435 
/*
 * Generate the helper for an integer compare of every vs2 element against
 * the scalar s1, writing one result bit per element into the mask
 * register vd.
 *
 *   NAME   - helper name handed to HELPER()
 *   ETYPE  - element type; it also selects signed vs. unsigned comparison
 *   H      - index-mapping macro applied to the element index
 *   DO_OP  - comparison, invoked as DO_OP(vs2 element, (ETYPE)s1)
 *
 * Elements in [vstart, vl) are processed.  A masked-off element (vm clear
 * and its v0 bit clear) gets its result bit set to 1 when vma is set and
 * is left untouched otherwise.  vstart is reset to 0 afterwards, and when
 * vext_vta_all_1s(desc) is set the remaining bits up to the configured
 * vlen are set to 1.
 */
#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    const uint32_t unmasked = vext_vm(desc);                        \
    const uint32_t fill_masked = vext_vma(desc);                    \
    const uint32_t fill_tail = vext_vta_all_1s(desc);               \
    const uint32_t nbits = riscv_cpu_cfg(env)->vlen;                \
    const uint32_t vl = env->vl;                                    \
    const ETYPE rhs = (ETYPE)(target_long)s1;                       \
    uint32_t idx = env->vstart;                                     \
                                                                    \
    while (idx < vl) {                                              \
        if (unmasked || vext_elem_mask(v0, idx)) {                  \
            /* active element: store the comparison result */       \
            ETYPE lhs = *((ETYPE *)vs2 + H(idx));                   \
            vext_set_elem_mask(vd, idx, DO_OP(lhs, rhs));           \
        } else if (fill_masked) {                                   \
            /* masked-off and mask-agnostic: write a 1 */           \
            vext_set_elem_mask(vd, idx, 1);                         \
        }                                                           \
        idx++;                                                      \
    }                                                               \
    env->vstart = 0;                                                \
    /* mask destinations are always treated as tail-agnostic */     \
    if (fill_tail) {                                                \
        while (idx < nbits) {                                       \
            vext_set_elem_mask(vd, idx, 1);                         \
            idx++;                                                  \
        }                                                           \
    }                                                               \
}
1468 
/*
 * Vector-scalar integer compare helpers, one instantiation per element
 * width.  Signed and unsigned forms of the same comparison share one
 * DO_* operator and differ only in the element type.  vmsgtu/vmsgt are
 * instantiated for this vector-scalar form.
 */
GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)

/* unsigned element types */
GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)

/* signed element types */
GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)

/* unsigned element types */
GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)

/* signed element types */
GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)

/* unsigned element types */
GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)

/* signed element types */
GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1508 
/*
 * Vector Integer Min/Max Instructions
 *
 * Vector-vector forms.  vminu/vmaxu are instantiated with the OP_UUU_*
 * type sets and vmin/vmax with OP_SSS_*; both use the same DO_MIN/DO_MAX
 * operator.  The second GEN_VEXT_VV argument (1/2/4/8) matches the
 * element width of the b/h/w/d variant in bytes.
 */
RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
/* one GEN_VEXT_VV per name declared above */
GEN_VEXT_VV(vminu_vv_b, 1)
GEN_VEXT_VV(vminu_vv_h, 2)
GEN_VEXT_VV(vminu_vv_w, 4)
GEN_VEXT_VV(vminu_vv_d, 8)
GEN_VEXT_VV(vmin_vv_b, 1)
GEN_VEXT_VV(vmin_vv_h, 2)
GEN_VEXT_VV(vmin_vv_w, 4)
GEN_VEXT_VV(vmin_vv_d, 8)
GEN_VEXT_VV(vmaxu_vv_b, 1)
GEN_VEXT_VV(vmaxu_vv_h, 2)
GEN_VEXT_VV(vmaxu_vv_w, 4)
GEN_VEXT_VV(vmaxu_vv_d, 8)
GEN_VEXT_VV(vmax_vv_b, 1)
GEN_VEXT_VV(vmax_vv_h, 2)
GEN_VEXT_VV(vmax_vv_w, 4)
GEN_VEXT_VV(vmax_vv_d, 8)
1542 
/*
 * Vector-scalar min/max: same operators and type sets as the
 * vector-vector forms, instantiated through OPIVX2/GEN_VEXT_VX.
 */
RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
/* one GEN_VEXT_VX per name declared above */
GEN_VEXT_VX(vminu_vx_b, 1)
GEN_VEXT_VX(vminu_vx_h, 2)
GEN_VEXT_VX(vminu_vx_w, 4)
GEN_VEXT_VX(vminu_vx_d, 8)
GEN_VEXT_VX(vmin_vx_b, 1)
GEN_VEXT_VX(vmin_vx_h, 2)
GEN_VEXT_VX(vmin_vx_w, 4)
GEN_VEXT_VX(vmin_vx_d, 8)
GEN_VEXT_VX(vmaxu_vx_b, 1)
GEN_VEXT_VX(vmaxu_vx_h, 2)
GEN_VEXT_VX(vmaxu_vx_w, 4)
GEN_VEXT_VX(vmaxu_vx_d, 8)
GEN_VEXT_VX(vmax_vx_b, 1)
GEN_VEXT_VX(vmax_vx_h, 2)
GEN_VEXT_VX(vmax_vx_w, 4)
GEN_VEXT_VX(vmax_vx_d, 8)
1575 
1576 /* Vector Single-Width Integer Multiply Instructions */
/*
 * Product of the two operands.  Both arguments are parenthesized so that
 * a compound argument such as "a + b" multiplies as a whole.
 */
#define DO_MUL(N, M) ((N) * (M))
/* vmul.vv: DO_MUL on the OP_SSS_* type sets, one variant per width */
RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
GEN_VEXT_VV(vmul_vv_b, 1)
GEN_VEXT_VV(vmul_vv_h, 2)
GEN_VEXT_VV(vmul_vv_w, 4)
GEN_VEXT_VV(vmul_vv_d, 8)
1586 
/* vmulh, 8-bit elements: upper byte of the signed 16-bit product. */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    /* |product| <= 128 * 128, so it always fits in 16 bits */
    int16_t product = (int16_t)s2 * (int16_t)s1;

    return product >> 8;
}
1591 
/* vmulh, 16-bit elements: upper half of the signed 32-bit product. */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t product = (int32_t)s2 * (int32_t)s1;

    return product >> 16;
}
1596 
/* vmulh, 32-bit elements: upper half of the signed 64-bit product. */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t product = (int64_t)s2 * (int64_t)s1;

    return product >> 32;
}
1601 
/*
 * vmulh, 64-bit elements: upper 64 bits of the signed 128-bit product,
 * computed with muls64(); the low half is discarded.
 */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t low, high;

    muls64(&low, &high, s1, s2);
    return high;
}
1609 
/* vmulhu, 8-bit elements: upper byte of the unsigned 16-bit product. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    /* at most 255 * 255, which fits in 16 bits */
    uint16_t product = (uint16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1614 
/* vmulhu, 16-bit elements: upper half of the unsigned 32-bit product. */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1619 
/* vmulhu, 32-bit elements: upper half of the unsigned 64-bit product. */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1624 
/*
 * vmulhu, 64-bit elements: upper 64 bits of the unsigned 128-bit
 * product, computed with mulu64(); the low half is discarded.
 */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);
    return high;
}
1632 
/*
 * vmulhsu, 8-bit elements: upper byte of the 16-bit product of the
 * signed s2 and the unsigned s1.
 */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* range is [-128 * 255, 127 * 255], which fits in 16 signed bits */
    int16_t product = (int16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1637 
/*
 * vmulhsu, 16-bit elements: upper half of the 32-bit product of the
 * signed s2 and the unsigned s1.  The multiply is carried out modulo
 * 2 ** 32 in unsigned arithmetic and the result is narrowed on return.
 */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)(int32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1642 
/*
 * vmulhsu, 32-bit elements: upper half of the 64-bit product of the
 * signed s2 and the unsigned s1.  The multiply is carried out modulo
 * 2 ** 64 in unsigned arithmetic and the result is narrowed on return.
 */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)(int64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1647 
/*
 * vmulhsu, 64-bit elements: upper 64 bits of the product of the signed
 * s2 and the unsigned s1.
 *
 * Let  A = signed operand (s2),
 *      B = unsigned operand (s1),
 *      P = mulu64(A, B), the unsigned product of the raw bit patterns.
 *
 * For A < 0 the bit pattern of A, read as unsigned, is A + 2 ** 64, so
 *
 *      P = (A + 2 ** 64) * B
 *        = A * B + 2 ** 64 * B
 *
 * and the signed product is A * B = P - 2 ** 64 * B.  Subtracting
 * 2 ** 64 * B only touches the high word, hence
 *
 *      HI_P -= (A < 0 ? B : 0)
 *
 * For A >= 0 the signed product equals P and nothing is adjusted.
 */

static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);

    if (s2 < 0) {
        high -= s1;
    }
    return high;
}
1676 
/*
 * Vector-vector high-half multiplies.  Each width is bound to the
 * matching do_mulh_*, do_mulhu_* or do_mulhsu_* function defined above;
 * vmulhsu uses the OP_SUS_* type sets.
 */
RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
/* one GEN_VEXT_VV per name declared above */
GEN_VEXT_VV(vmulh_vv_b, 1)
GEN_VEXT_VV(vmulh_vv_h, 2)
GEN_VEXT_VV(vmulh_vv_w, 4)
GEN_VEXT_VV(vmulh_vv_d, 8)
GEN_VEXT_VV(vmulhu_vv_b, 1)
GEN_VEXT_VV(vmulhu_vv_h, 2)
GEN_VEXT_VV(vmulhu_vv_w, 4)
GEN_VEXT_VV(vmulhu_vv_d, 8)
GEN_VEXT_VV(vmulhsu_vv_b, 1)
GEN_VEXT_VV(vmulhsu_vv_h, 2)
GEN_VEXT_VV(vmulhsu_vv_w, 4)
GEN_VEXT_VV(vmulhsu_vv_d, 8)
1701 
/*
 * Vector-scalar single-width multiplies: vmul uses DO_MUL, the
 * high-half forms reuse the do_mulh_*, do_mulhu_* and do_mulhsu_*
 * functions of the matching width.
 */
RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
/* one GEN_VEXT_VX per name declared above */
GEN_VEXT_VX(vmul_vx_b, 1)
GEN_VEXT_VX(vmul_vx_h, 2)
GEN_VEXT_VX(vmul_vx_w, 4)
GEN_VEXT_VX(vmul_vx_d, 8)
GEN_VEXT_VX(vmulh_vx_b, 1)
GEN_VEXT_VX(vmulh_vx_h, 2)
GEN_VEXT_VX(vmulh_vx_w, 4)
GEN_VEXT_VX(vmulh_vx_d, 8)
GEN_VEXT_VX(vmulhu_vx_b, 1)
GEN_VEXT_VX(vmulhu_vx_h, 2)
GEN_VEXT_VX(vmulhu_vx_w, 4)
GEN_VEXT_VX(vmulhu_vx_d, 8)
GEN_VEXT_VX(vmulhsu_vx_b, 1)
GEN_VEXT_VX(vmulhsu_vx_h, 2)
GEN_VEXT_VX(vmulhsu_vx_w, 4)
GEN_VEXT_VX(vmulhsu_vx_d, 8)
1734 
/* Vector Integer Divide Instructions */
/*
 * Division and remainder with fixed results for the corner cases:
 *   - divisor == 0: the quotient is -1 converted to the type of N (all
 *     ones) and the remainder is the dividend N;
 *   - signed forms, when N == -N holds and M is -1: the quotient is N and
 *     the remainder is 0, without performing the division.
 * Every macro argument is parenthesized so that compound expressions
 * expand as a unit.
 *
 * NOTE(review): for the most negative 32/64-bit value, -(N) relies on
 * wrapping signed arithmetic (e.g. -fwrapv) - confirm the build flags.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? (__typeof(N))(-1) : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? (N) : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? (__typeof(N))(-1) :         \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?         \
        (N) : (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? (N) :                       \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?         \
        0 : (N) % (M))
1742 
/*
 * Vector-vector divide/remainder.  The unsigned forms use
 * DO_DIVU/DO_REMU on the OP_UUU_* type sets, the signed forms
 * DO_DIV/DO_REM on OP_SSS_*.
 */
RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
/* one GEN_VEXT_VV per name declared above */
GEN_VEXT_VV(vdivu_vv_b, 1)
GEN_VEXT_VV(vdivu_vv_h, 2)
GEN_VEXT_VV(vdivu_vv_w, 4)
GEN_VEXT_VV(vdivu_vv_d, 8)
GEN_VEXT_VV(vdiv_vv_b, 1)
GEN_VEXT_VV(vdiv_vv_h, 2)
GEN_VEXT_VV(vdiv_vv_w, 4)
GEN_VEXT_VV(vdiv_vv_d, 8)
GEN_VEXT_VV(vremu_vv_b, 1)
GEN_VEXT_VV(vremu_vv_h, 2)
GEN_VEXT_VV(vremu_vv_w, 4)
GEN_VEXT_VV(vremu_vv_d, 8)
GEN_VEXT_VV(vrem_vv_b, 1)
GEN_VEXT_VV(vrem_vv_h, 2)
GEN_VEXT_VV(vrem_vv_w, 4)
GEN_VEXT_VV(vrem_vv_d, 8)
1775 
/*
 * Vector-scalar divide/remainder: same operators and type sets as the
 * vector-vector forms, instantiated through OPIVX2/GEN_VEXT_VX.
 */
RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
/* one GEN_VEXT_VX per name declared above */
GEN_VEXT_VX(vdivu_vx_b, 1)
GEN_VEXT_VX(vdivu_vx_h, 2)
GEN_VEXT_VX(vdivu_vx_w, 4)
GEN_VEXT_VX(vdivu_vx_d, 8)
GEN_VEXT_VX(vdiv_vx_b, 1)
GEN_VEXT_VX(vdiv_vx_h, 2)
GEN_VEXT_VX(vdiv_vx_w, 4)
GEN_VEXT_VX(vdiv_vx_d, 8)
GEN_VEXT_VX(vremu_vx_b, 1)
GEN_VEXT_VX(vremu_vx_h, 2)
GEN_VEXT_VX(vremu_vx_w, 4)
GEN_VEXT_VX(vremu_vx_d, 8)
GEN_VEXT_VX(vrem_vx_b, 1)
GEN_VEXT_VX(vrem_vx_h, 2)
GEN_VEXT_VX(vrem_vx_w, 4)
GEN_VEXT_VX(vrem_vx_d, 8)
1808 
/*
 * Vector Widening Integer Multiply Instructions
 *
 * Vector-vector forms.  All three flavours use DO_MUL with the WOP_*
 * type sets; there is no 64-bit source variant.  The first H macro of
 * each line is one size wider than the other two (H2/H4/H8 against
 * H1/H2/H4), and the GEN_VEXT_VV size argument is 2/4/8 accordingly.
 */
RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
/* one GEN_VEXT_VV per name declared above */
GEN_VEXT_VV(vwmul_vv_b, 2)
GEN_VEXT_VV(vwmul_vv_h, 4)
GEN_VEXT_VV(vwmul_vv_w, 8)
GEN_VEXT_VV(vwmulu_vv_b, 2)
GEN_VEXT_VV(vwmulu_vv_h, 4)
GEN_VEXT_VV(vwmulu_vv_w, 8)
GEN_VEXT_VV(vwmulsu_vv_b, 2)
GEN_VEXT_VV(vwmulsu_vv_h, 4)
GEN_VEXT_VV(vwmulsu_vv_w, 8)
1828 
/*
 * Vector-scalar widening multiplies: DO_MUL with the WOP_* type sets.
 * The first H macro of each line is one size wider than the second.
 */
RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
/* one GEN_VEXT_VX per name declared above */
GEN_VEXT_VX(vwmul_vx_b, 2)
GEN_VEXT_VX(vwmul_vx_h, 4)
GEN_VEXT_VX(vwmul_vx_w, 8)
GEN_VEXT_VX(vwmulu_vx_b, 2)
GEN_VEXT_VX(vwmulu_vx_h, 4)
GEN_VEXT_VX(vwmulu_vx_w, 8)
GEN_VEXT_VX(vwmulsu_vx_b, 2)
GEN_VEXT_VX(vwmulsu_vx_h, 4)
GEN_VEXT_VX(vwmulsu_vx_w, 8)
1847 
1848 /* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * Define do_<NAME>(), the per-element body of a three-operand
 * vector-vector operation: element i of vs1 (read as T1, widened to TX1),
 * element i of vs2 (read as T2, widened to TX2) and the current
 * destination element (type TD) are combined as OP(s2, s1, d) and the
 * result is written back to the destination element.  HD, HS1 and HS2
 * map the index i for the destination and the two sources.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TD *dest = (TD *)vd + HD(i);                                   \
    TX2 src2 = ((T2 *)vs2)[HS2(i)];                                \
    TX1 src1 = ((T1 *)vs1)[HS1(i)];                                \
    TD acc = *dest;                                                \
                                                                   \
    *dest = OP(src2, src1, acc);                                   \
}
1857 
/*
 * Multiply-add operators, invoked as OP(N, M, D) where D is the current
 * destination element:
 *   DO_MACC  =   M * N  + D
 *   DO_NMSAC = -(M * N) + D
 *   DO_MADD  =   M * D  + N
 *   DO_NMSUB = -(M * D) + N
 * Every argument is parenthesized so that compound expressions expand
 * as a unit.
 */
#define DO_MACC(N, M, D) ((M) * (N) + (D))
#define DO_NMSAC(N, M, D) (-((M) * (N)) + (D))
#define DO_MADD(N, M, D) ((M) * (D) + (N))
#define DO_NMSUB(N, M, D) (-((M) * (D)) + (N))
/*
 * Vector-vector single-width multiply-add: vmacc, vnmsac, vmadd and
 * vnmsub bind DO_MACC, DO_NMSAC, DO_MADD and DO_NMSUB respectively, all
 * on the OP_SSS_* type sets, through OPIVV3.
 */
RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
/* one GEN_VEXT_VV per name declared above */
GEN_VEXT_VV(vmacc_vv_b, 1)
GEN_VEXT_VV(vmacc_vv_h, 2)
GEN_VEXT_VV(vmacc_vv_w, 4)
GEN_VEXT_VV(vmacc_vv_d, 8)
GEN_VEXT_VV(vnmsac_vv_b, 1)
GEN_VEXT_VV(vnmsac_vv_h, 2)
GEN_VEXT_VV(vnmsac_vv_w, 4)
GEN_VEXT_VV(vnmsac_vv_d, 8)
GEN_VEXT_VV(vmadd_vv_b, 1)
GEN_VEXT_VV(vmadd_vv_h, 2)
GEN_VEXT_VV(vmadd_vv_w, 4)
GEN_VEXT_VV(vmadd_vv_d, 8)
GEN_VEXT_VV(vnmsub_vv_b, 1)
GEN_VEXT_VV(vnmsub_vv_h, 2)
GEN_VEXT_VV(vnmsub_vv_w, 4)
GEN_VEXT_VV(vnmsub_vv_d, 8)
1894 
/*
 * Define do_<NAME>(), the per-element body of a three-operand
 * vector-scalar operation: the scalar s1 is converted to T1 and then to
 * TX1, element i of vs2 is read as T2 and widened to TX2, and both are
 * combined with the current destination element (type TD) as
 * OP(s2, s1, d); the result is written back to the destination element.
 * HD and HS2 map the index i for the destination and the vector source.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
    TX1 scalar = (TX1)(T1)s1;                                       \
    TX2 elem = ((T2 *)vs2)[HS2(i)];                                 \
    TD acc = *dest;                                                 \
                                                                    \
    *dest = OP(elem, scalar, acc);                                  \
}
1902 
/*
 * Vector-scalar single-width multiply-add: same four operators as the
 * vector-vector forms, instantiated through OPIVX3/GEN_VEXT_VX.
 */
RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
/* one GEN_VEXT_VX per name declared above */
GEN_VEXT_VX(vmacc_vx_b, 1)
GEN_VEXT_VX(vmacc_vx_h, 2)
GEN_VEXT_VX(vmacc_vx_w, 4)
GEN_VEXT_VX(vmacc_vx_d, 8)
GEN_VEXT_VX(vnmsac_vx_b, 1)
GEN_VEXT_VX(vnmsac_vx_h, 2)
GEN_VEXT_VX(vnmsac_vx_w, 4)
GEN_VEXT_VX(vnmsac_vx_d, 8)
GEN_VEXT_VX(vmadd_vx_b, 1)
GEN_VEXT_VX(vmadd_vx_h, 2)
GEN_VEXT_VX(vmadd_vx_w, 4)
GEN_VEXT_VX(vmadd_vx_d, 8)
GEN_VEXT_VX(vnmsub_vx_b, 1)
GEN_VEXT_VX(vnmsub_vx_h, 2)
GEN_VEXT_VX(vnmsub_vx_w, 4)
GEN_VEXT_VX(vnmsub_vx_d, 8)
1935 
/*
 * Vector Widening Integer Multiply-Add Instructions
 *
 * Vector-vector forms, all built from DO_MACC through OPIVV3 with the
 * WOP_* type sets.  The destination index macro (first H argument) is
 * one size wider than the two source index macros, and the GEN_VEXT_VV
 * size argument is 2/4/8 accordingly.
 */
RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
/* one GEN_VEXT_VV per name declared above */
GEN_VEXT_VV(vwmaccu_vv_b, 2)
GEN_VEXT_VV(vwmaccu_vv_h, 4)
GEN_VEXT_VV(vwmaccu_vv_w, 8)
GEN_VEXT_VV(vwmacc_vv_b, 2)
GEN_VEXT_VV(vwmacc_vv_h, 4)
GEN_VEXT_VV(vwmacc_vv_w, 8)
GEN_VEXT_VV(vwmaccsu_vv_b, 2)
GEN_VEXT_VV(vwmaccsu_vv_h, 4)
GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1955 
/*
 * Vector-scalar widening multiply-add, built from DO_MACC through
 * OPIVX3.  Besides vwmaccu/vwmacc/vwmaccsu this group also instantiates
 * vwmaccus (WOP_SUS_* type sets).
 */
RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
/* one GEN_VEXT_VX per name declared above */
GEN_VEXT_VX(vwmaccu_vx_b, 2)
GEN_VEXT_VX(vwmaccu_vx_h, 4)
GEN_VEXT_VX(vwmaccu_vx_w, 8)
GEN_VEXT_VX(vwmacc_vx_b, 2)
GEN_VEXT_VX(vwmacc_vx_h, 4)
GEN_VEXT_VX(vwmacc_vx_w, 8)
GEN_VEXT_VX(vwmaccsu_vx_b, 2)
GEN_VEXT_VX(vwmaccsu_vx_h, 4)
GEN_VEXT_VX(vwmaccsu_vx_w, 8)
GEN_VEXT_VX(vwmaccus_vx_b, 2)
GEN_VEXT_VX(vwmaccus_vx_h, 4)
GEN_VEXT_VX(vwmaccus_vx_w, 8)
1980 
1981 /* Vector Integer Merge and Move Instructions */
/*
 * Generate the helper that copies elements [vstart, vl) of vs1 into vd.
 *
 *   NAME   - helper name handed to HELPER()
 *   ETYPE  - element type
 *   H      - index-mapping macro applied to the element index
 *
 * No mask is consulted.  vstart is reset to 0 afterwards and the byte
 * range [vl * esz, total_elems * esz) of vd is passed to
 * vext_set_elems_1s() together with the vta setting.
 */
#define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
                  uint32_t desc)                                     \
{                                                                    \
    const uint32_t esz = sizeof(ETYPE);                              \
    const uint32_t vl = env->vl;                                     \
    const uint32_t nelems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                             \
    const ETYPE *src = vs1;                                          \
    ETYPE *dst = vd;                                                 \
    uint32_t i = env->vstart;                                        \
                                                                     \
    while (i < vl) {                                                 \
        dst[H(i)] = src[H(i)];                                       \
        i++;                                                         \
    }                                                                \
    env->vstart = 0;                                                 \
    /* let vext_set_elems_1s() handle the tail past vl */            \
    vext_set_elems_1s(vd, vta, vl * esz, nelems * esz);              \
}
2000 
/* vmv.v.v helpers for 8/16/32/64-bit elements */
GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2005 
/*
 * Generate the helper that writes the scalar s1, converted to ETYPE,
 * into elements [vstart, vl) of vd.
 *
 *   NAME   - helper name handed to HELPER()
 *   ETYPE  - element type
 *   H      - index-mapping macro applied to the element index
 *
 * No mask is consulted.  vstart is reset to 0 afterwards and the byte
 * range [vl * esz, total_elems * esz) of vd is passed to
 * vext_set_elems_1s() together with the vta setting.
 */
#define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
                  uint32_t desc)                                     \
{                                                                    \
    const uint32_t esz = sizeof(ETYPE);                              \
    const uint32_t vl = env->vl;                                     \
    const uint32_t nelems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                             \
    const ETYPE value = (ETYPE)s1;                                   \
    ETYPE *dst = vd;                                                 \
    uint32_t i = env->vstart;                                        \
                                                                     \
    while (i < vl) {                                                 \
        dst[H(i)] = value;                                           \
        i++;                                                         \
    }                                                                \
    env->vstart = 0;                                                 \
    /* let vext_set_elems_1s() handle the tail past vl */            \
    vext_set_elems_1s(vd, vta, vl * esz, nelems * esz);              \
}
2023 
/* vmv.v.x helpers for 8/16/32/64-bit elements */
GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2028 
/*
 * Generate the helper that merges vs1 and vs2 under the mask v0: for
 * each element in [vstart, vl) the destination takes the vs1 element
 * when the v0 bit is set and the vs2 element otherwise.
 *
 *   NAME   - helper name handed to HELPER()
 *   ETYPE  - element type
 *   H      - index-mapping macro applied to the element index
 *
 * vstart is reset to 0 afterwards and the byte range
 * [vl * esz, total_elems * esz) of vd is passed to vext_set_elems_1s()
 * together with the vta setting.
 */
#define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
                  CPURISCVState *env, uint32_t desc)                 \
{                                                                    \
    const uint32_t esz = sizeof(ETYPE);                              \
    const uint32_t vl = env->vl;                                     \
    const uint32_t nelems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                             \
    ETYPE *dst = vd;                                                 \
    uint32_t i = env->vstart;                                        \
                                                                     \
    while (i < vl) {                                                 \
        /* v0 bit set: take vs1, otherwise take vs2 */               \
        const ETYPE *src = vext_elem_mask(v0, i) ? vs1 : vs2;        \
        dst[H(i)] = src[H(i)];                                       \
        i++;                                                         \
    }                                                                \
    env->vstart = 0;                                                 \
    /* let vext_set_elems_1s() handle the tail past vl */            \
    vext_set_elems_1s(vd, vta, vl * esz, nelems * esz);              \
}
2047 
/* vmerge.vvm helpers for 8/16/32/64-bit elements */
GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2052 
/*
 * Generate the helper that merges the scalar s1 and vs2 under the mask
 * v0: for each element in [vstart, vl) the destination takes s1
 * (converted via target_long to ETYPE) when the v0 bit is set and the
 * vs2 element otherwise.
 *
 *   NAME   - helper name handed to HELPER()
 *   ETYPE  - element type
 *   H      - index-mapping macro applied to the element index
 *
 * vstart is reset to 0 afterwards and the byte range
 * [vl * esz, total_elems * esz) of vd is passed to vext_set_elems_1s()
 * together with the vta setting.
 */
#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
                  void *vs2, CPURISCVState *env, uint32_t desc)      \
{                                                                    \
    const uint32_t esz = sizeof(ETYPE);                              \
    const uint32_t vl = env->vl;                                     \
    const uint32_t nelems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                             \
    const ETYPE scalar = (ETYPE)(target_long)s1;                     \
    const ETYPE *src2 = vs2;                                         \
    ETYPE *dst = vd;                                                 \
    uint32_t i = env->vstart;                                        \
                                                                     \
    while (i < vl) {                                                 \
        if (vext_elem_mask(v0, i)) {                                 \
            dst[H(i)] = scalar;                                      \
        } else {                                                     \
            dst[H(i)] = src2[H(i)];                                  \
        }                                                            \
        i++;                                                         \
    }                                                                \
    env->vstart = 0;                                                 \
    /* let vext_set_elems_1s() handle the tail past vl */            \
    vext_set_elems_1s(vd, vta, vl * esz, nelems * esz);              \
}
2073 
/* vmerge.vxm helpers for 8/16/32/64-bit elements */
GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2078 
2079 /*
2080  *** Vector Fixed-Point Arithmetic Instructions
2081  */
2082 
2083 /* Vector Single-Width Saturating Add and Subtract */
2084 
/*
 * Fixed-point instructions may take a rounding mode and may saturate,
 * so the macros common to them are defined here.
 */
/*
 * Per-element callback for the fixed-point vector-vector helpers: it is
 * called with the element index i, the CPU state and the rounding mode vxrm,
 * and operates on the vd/vs1/vs2 register pointers.
 */
typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2091 
/*
 * Define do_NAME(): load element i of vs1 (as T1, widened to TX1) and of
 * vs2 (as T2, widened to TX2), call OP with the vs2 operand first, and
 * store the result into element i of vd (as TD).
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TD *dst = (TD *)vd + HD(i);                                     \
    TX1 op1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 op2 = *((T2 *)vs2 + HS2(i));                                \
                                                                    \
    *dst = OP(env, vxrm, op2, op1);                                 \
}
2101 
2102 static inline void
2103 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2104              CPURISCVState *env,
2105              uint32_t vl, uint32_t vm, int vxrm,
2106              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2107 {
2108     for (uint32_t i = env->vstart; i < vl; i++) {
2109         if (!vm && !vext_elem_mask(v0, i)) {
2110             /* set masked-off elements to 1s */
2111             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2112             continue;
2113         }
2114         fn(vd, vs1, vs2, i, env, vxrm);
2115     }
2116     env->vstart = 0;
2117 }
2118 
2119 static inline void
2120 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2121              CPURISCVState *env,
2122              uint32_t desc,
2123              opivv2_rm_fn *fn, uint32_t esz)
2124 {
2125     uint32_t vm = vext_vm(desc);
2126     uint32_t vl = env->vl;
2127     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2128     uint32_t vta = vext_vta(desc);
2129     uint32_t vma = vext_vma(desc);
2130 
2131     switch (env->vxrm) {
2132     case 0: /* rnu */
2133         vext_vv_rm_1(vd, v0, vs1, vs2,
2134                      env, vl, vm, 0, fn, vma, esz);
2135         break;
2136     case 1: /* rne */
2137         vext_vv_rm_1(vd, v0, vs1, vs2,
2138                      env, vl, vm, 1, fn, vma, esz);
2139         break;
2140     case 2: /* rdn */
2141         vext_vv_rm_1(vd, v0, vs1, vs2,
2142                      env, vl, vm, 2, fn, vma, esz);
2143         break;
2144     default: /* rod */
2145         vext_vv_rm_1(vd, v0, vs1, vs2,
2146                      env, vl, vm, 3, fn, vma, esz);
2147         break;
2148     }
2149     /* set tail elements to 1s */
2150     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2151 }
2152 
/*
 * Generate helpers for fixed-point instructions with OPIVV format.
 * HELPER(NAME) forwards its arguments to vext_vv_rm_2() with do_NAME as
 * the per-element function and ESZ as the element size in bytes.
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
                 do_##NAME, ESZ);                               \
}
2161 
2162 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2163 {
2164     uint8_t res = a + b;
2165     if (res < a) {
2166         res = UINT8_MAX;
2167         env->vxsat = 0x1;
2168     }
2169     return res;
2170 }
2171 
2172 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2173                                uint16_t b)
2174 {
2175     uint16_t res = a + b;
2176     if (res < a) {
2177         res = UINT16_MAX;
2178         env->vxsat = 0x1;
2179     }
2180     return res;
2181 }
2182 
2183 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2184                                uint32_t b)
2185 {
2186     uint32_t res = a + b;
2187     if (res < a) {
2188         res = UINT32_MAX;
2189         env->vxsat = 0x1;
2190     }
2191     return res;
2192 }
2193 
2194 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2195                                uint64_t b)
2196 {
2197     uint64_t res = a + b;
2198     if (res < a) {
2199         res = UINT64_MAX;
2200         env->vxsat = 0x1;
2201     }
2202     return res;
2203 }
2204 
/*
 * vsaddu_vv_{b,h,w,d}: per-element functions built from OPIVV2_RM on top
 * of saddu8/16/32/64, followed by their helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2213 
/*
 * Per-element callback for the fixed-point vector-scalar helpers: like
 * opivv2_rm_fn, but the first source operand is the scalar s1 instead of a
 * vector register pointer.
 */
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2216 
/*
 * Define do_NAME(): load element i of vs2 (as T2, widened to TX2), convert
 * the scalar s1 through T1 to TX1, call OP with the vs2 operand first, and
 * store the result into element i of vd (as TD).
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TD *dst = (TD *)vd + HD(i);                                     \
    TX1 scalar = (TX1)(T1)s1;                                       \
    TX2 elem = *((T2 *)vs2 + HS2(i));                               \
                                                                    \
    *dst = OP(env, vxrm, elem, scalar);                             \
}
2225 
2226 static inline void
2227 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2228              CPURISCVState *env,
2229              uint32_t vl, uint32_t vm, int vxrm,
2230              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2231 {
2232     for (uint32_t i = env->vstart; i < vl; i++) {
2233         if (!vm && !vext_elem_mask(v0, i)) {
2234             /* set masked-off elements to 1s */
2235             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2236             continue;
2237         }
2238         fn(vd, s1, vs2, i, env, vxrm);
2239     }
2240     env->vstart = 0;
2241 }
2242 
2243 static inline void
2244 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2245              CPURISCVState *env,
2246              uint32_t desc,
2247              opivx2_rm_fn *fn, uint32_t esz)
2248 {
2249     uint32_t vm = vext_vm(desc);
2250     uint32_t vl = env->vl;
2251     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2252     uint32_t vta = vext_vta(desc);
2253     uint32_t vma = vext_vma(desc);
2254 
2255     switch (env->vxrm) {
2256     case 0: /* rnu */
2257         vext_vx_rm_1(vd, v0, s1, vs2,
2258                      env, vl, vm, 0, fn, vma, esz);
2259         break;
2260     case 1: /* rne */
2261         vext_vx_rm_1(vd, v0, s1, vs2,
2262                      env, vl, vm, 1, fn, vma, esz);
2263         break;
2264     case 2: /* rdn */
2265         vext_vx_rm_1(vd, v0, s1, vs2,
2266                      env, vl, vm, 2, fn, vma, esz);
2267         break;
2268     default: /* rod */
2269         vext_vx_rm_1(vd, v0, s1, vs2,
2270                      env, vl, vm, 3, fn, vma, esz);
2271         break;
2272     }
2273     /* set tail elements to 1s */
2274     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2275 }
2276 
/*
 * Generate helpers for fixed-point instructions with OPIVX format.
 * HELPER(NAME) forwards its arguments to vext_vx_rm_2() with do_NAME as
 * the per-element function and ESZ as the element size in bytes. The
 * scalar arrives as target_ulong and is converted to the target_long
 * parameter of vext_vx_rm_2() at the call.
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
        void *vs2, CPURISCVState *env, uint32_t desc)     \
{                                                         \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
                 do_##NAME, ESZ);                         \
}
2285 
/*
 * vsaddu_vx_{b,h,w,d}: per-element functions built from OPIVX2_RM on top
 * of saddu8/16/32/64, followed by their helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2294 
2295 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2296 {
2297     int8_t res = a + b;
2298     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2299         res = a > 0 ? INT8_MAX : INT8_MIN;
2300         env->vxsat = 0x1;
2301     }
2302     return res;
2303 }
2304 
2305 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2306 {
2307     int16_t res = a + b;
2308     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2309         res = a > 0 ? INT16_MAX : INT16_MIN;
2310         env->vxsat = 0x1;
2311     }
2312     return res;
2313 }
2314 
2315 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2316 {
2317     int32_t res = a + b;
2318     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2319         res = a > 0 ? INT32_MAX : INT32_MIN;
2320         env->vxsat = 0x1;
2321     }
2322     return res;
2323 }
2324 
2325 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2326 {
2327     int64_t res = a + b;
2328     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2329         res = a > 0 ? INT64_MAX : INT64_MIN;
2330         env->vxsat = 0x1;
2331     }
2332     return res;
2333 }
2334 
/*
 * vsadd_vv_* and vsadd_vx_*: per-element functions built on
 * sadd8/16/32/64, each followed by its helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
GEN_VEXT_VV_RM(vsadd_vv_b, 1)
GEN_VEXT_VV_RM(vsadd_vv_h, 2)
GEN_VEXT_VV_RM(vsadd_vv_w, 4)
GEN_VEXT_VV_RM(vsadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
GEN_VEXT_VX_RM(vsadd_vx_b, 1)
GEN_VEXT_VX_RM(vsadd_vx_h, 2)
GEN_VEXT_VX_RM(vsadd_vx_w, 4)
GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2352 
2353 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2354 {
2355     uint8_t res = a - b;
2356     if (res > a) {
2357         res = 0;
2358         env->vxsat = 0x1;
2359     }
2360     return res;
2361 }
2362 
2363 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2364                                uint16_t b)
2365 {
2366     uint16_t res = a - b;
2367     if (res > a) {
2368         res = 0;
2369         env->vxsat = 0x1;
2370     }
2371     return res;
2372 }
2373 
2374 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2375                                uint32_t b)
2376 {
2377     uint32_t res = a - b;
2378     if (res > a) {
2379         res = 0;
2380         env->vxsat = 0x1;
2381     }
2382     return res;
2383 }
2384 
2385 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2386                                uint64_t b)
2387 {
2388     uint64_t res = a - b;
2389     if (res > a) {
2390         res = 0;
2391         env->vxsat = 0x1;
2392     }
2393     return res;
2394 }
2395 
/*
 * vssubu_vv_* and vssubu_vx_*: per-element functions built on
 * ssubu8/16/32/64, each followed by its helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
GEN_VEXT_VV_RM(vssubu_vv_b, 1)
GEN_VEXT_VV_RM(vssubu_vv_h, 2)
GEN_VEXT_VV_RM(vssubu_vv_w, 4)
GEN_VEXT_VV_RM(vssubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
GEN_VEXT_VX_RM(vssubu_vx_b, 1)
GEN_VEXT_VX_RM(vssubu_vx_h, 2)
GEN_VEXT_VX_RM(vssubu_vx_w, 4)
GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2413 
2414 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2415 {
2416     int8_t res = a - b;
2417     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2418         res = a >= 0 ? INT8_MAX : INT8_MIN;
2419         env->vxsat = 0x1;
2420     }
2421     return res;
2422 }
2423 
2424 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2425 {
2426     int16_t res = a - b;
2427     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2428         res = a >= 0 ? INT16_MAX : INT16_MIN;
2429         env->vxsat = 0x1;
2430     }
2431     return res;
2432 }
2433 
2434 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2435 {
2436     int32_t res = a - b;
2437     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2438         res = a >= 0 ? INT32_MAX : INT32_MIN;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2445 {
2446     int64_t res = a - b;
2447     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2448         res = a >= 0 ? INT64_MAX : INT64_MIN;
2449         env->vxsat = 0x1;
2450     }
2451     return res;
2452 }
2453 
/*
 * vssub_vv_* and vssub_vx_*: per-element functions built on
 * ssub8/16/32/64, each followed by its helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
GEN_VEXT_VV_RM(vssub_vv_b, 1)
GEN_VEXT_VV_RM(vssub_vv_h, 2)
GEN_VEXT_VV_RM(vssub_vv_w, 4)
GEN_VEXT_VV_RM(vssub_vv_d, 8)

RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
GEN_VEXT_VX_RM(vssub_vx_b, 1)
GEN_VEXT_VX_RM(vssub_vx_h, 2)
GEN_VEXT_VX_RM(vssub_vx_w, 4)
GEN_VEXT_VX_RM(vssub_vx_d, 8)
2471 
2472 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Compute the rounding increment (0 or 1) to add to (v >> shift).
 *
 * @vxrm:  rounding mode: 0 round-to-nearest-up, 1 round-to-nearest-even,
 *         3 round-to-odd; any other value truncates (increment 0).
 * @v:     value being shifted right.
 * @shift: number of low bits discarded by the shift.
 *
 * Returns 0 when shift is 0 or greater than 64.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d, d1;
    uint64_t D1, D2;

    /*
     * Validate shift before touching v: extract64() asserts that the
     * requested field lies inside the 64-bit value, so extracting bit
     * 'shift' ahead of this check would abort for shift >= 64.
     */
    if (shift == 0 || shift > 64) {
        return 0;
    }

    /* d: least-significant bit that survives the shift (none at 64) */
    d = shift < 64 ? extract64(v, shift, 1) : 0;
    /* d1: most-significant discarded bit; D1: all discarded bits */
    d1 = extract64(v, shift - 1, 1);
    D1 = extract64(v, 0, shift);
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            /* D2: discarded bits below d1 */
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2499 
2500 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2501 {
2502     int64_t res = (int64_t)a + b;
2503     uint8_t round = get_round(vxrm, res, 1);
2504 
2505     return (res >> 1) + round;
2506 }
2507 
2508 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2509 {
2510     int64_t res = a + b;
2511     uint8_t round = get_round(vxrm, res, 1);
2512     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2513 
2514     /* With signed overflow, bit 64 is inverse of bit 63. */
2515     return ((res >> 1) ^ over) + round;
2516 }
2517 
/*
 * vaadd_vv_* and vaadd_vx_*: the b/h/w variants all use aadd32, the d
 * variant uses aadd64; each group is followed by its helpers.
 */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
GEN_VEXT_VV_RM(vaadd_vv_b, 1)
GEN_VEXT_VV_RM(vaadd_vv_h, 2)
GEN_VEXT_VV_RM(vaadd_vv_w, 4)
GEN_VEXT_VV_RM(vaadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
GEN_VEXT_VX_RM(vaadd_vx_b, 1)
GEN_VEXT_VX_RM(vaadd_vx_h, 2)
GEN_VEXT_VX_RM(vaadd_vx_w, 4)
GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2535 
2536 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2537                                uint32_t a, uint32_t b)
2538 {
2539     uint64_t res = (uint64_t)a + b;
2540     uint8_t round = get_round(vxrm, res, 1);
2541 
2542     return (res >> 1) + round;
2543 }
2544 
2545 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2546                                uint64_t a, uint64_t b)
2547 {
2548     uint64_t res = a + b;
2549     uint8_t round = get_round(vxrm, res, 1);
2550     uint64_t over = (uint64_t)(res < a) << 63;
2551 
2552     return ((res >> 1) | over) + round;
2553 }
2554 
/*
 * vaaddu_vv_* and vaaddu_vx_*: the b/h/w variants all use aaddu32, the d
 * variant uses aaddu64; each group is followed by its helpers.
 */
RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
GEN_VEXT_VV_RM(vaaddu_vv_d, 8)

RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2572 
2573 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2574 {
2575     int64_t res = (int64_t)a - b;
2576     uint8_t round = get_round(vxrm, res, 1);
2577 
2578     return (res >> 1) + round;
2579 }
2580 
2581 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2582 {
2583     int64_t res = (int64_t)a - b;
2584     uint8_t round = get_round(vxrm, res, 1);
2585     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2586 
2587     /* With signed overflow, bit 64 is inverse of bit 63. */
2588     return ((res >> 1) ^ over) + round;
2589 }
2590 
/*
 * vasub_vv_* and vasub_vx_*: the b/h/w variants all use asub32, the d
 * variant uses asub64; each group is followed by its helpers.
 */
RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
GEN_VEXT_VV_RM(vasub_vv_b, 1)
GEN_VEXT_VV_RM(vasub_vv_h, 2)
GEN_VEXT_VV_RM(vasub_vv_w, 4)
GEN_VEXT_VV_RM(vasub_vv_d, 8)

RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
GEN_VEXT_VX_RM(vasub_vx_b, 1)
GEN_VEXT_VX_RM(vasub_vx_h, 2)
GEN_VEXT_VX_RM(vasub_vx_w, 4)
GEN_VEXT_VX_RM(vasub_vx_d, 8)
2608 
2609 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2610                                uint32_t a, uint32_t b)
2611 {
2612     int64_t res = (int64_t)a - b;
2613     uint8_t round = get_round(vxrm, res, 1);
2614 
2615     return (res >> 1) + round;
2616 }
2617 
2618 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2619                                uint64_t a, uint64_t b)
2620 {
2621     uint64_t res = (uint64_t)a - b;
2622     uint8_t round = get_round(vxrm, res, 1);
2623     uint64_t over = (uint64_t)(res > a) << 63;
2624 
2625     return ((res >> 1) | over) + round;
2626 }
2627 
/*
 * vasubu_vv_* and vasubu_vx_*: the b/h/w variants all use asubu32, the d
 * variant uses asubu64; each group is followed by its helpers.
 */
RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
GEN_VEXT_VV_RM(vasubu_vv_b, 1)
GEN_VEXT_VV_RM(vasubu_vv_h, 2)
GEN_VEXT_VV_RM(vasubu_vv_w, 4)
GEN_VEXT_VV_RM(vasubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
GEN_VEXT_VX_RM(vasubu_vx_b, 1)
GEN_VEXT_VX_RM(vasubu_vx_h, 2)
GEN_VEXT_VX_RM(vasubu_vx_w, 4)
GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2645 
2646 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2647 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2648 {
2649     uint8_t round;
2650     int16_t res;
2651 
2652     res = (int16_t)a * (int16_t)b;
2653     round = get_round(vxrm, res, 7);
2654     res   = (res >> 7) + round;
2655 
2656     if (res > INT8_MAX) {
2657         env->vxsat = 0x1;
2658         return INT8_MAX;
2659     } else if (res < INT8_MIN) {
2660         env->vxsat = 0x1;
2661         return INT8_MIN;
2662     } else {
2663         return res;
2664     }
2665 }
2666 
2667 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2668 {
2669     uint8_t round;
2670     int32_t res;
2671 
2672     res = (int32_t)a * (int32_t)b;
2673     round = get_round(vxrm, res, 15);
2674     res   = (res >> 15) + round;
2675 
2676     if (res > INT16_MAX) {
2677         env->vxsat = 0x1;
2678         return INT16_MAX;
2679     } else if (res < INT16_MIN) {
2680         env->vxsat = 0x1;
2681         return INT16_MIN;
2682     } else {
2683         return res;
2684     }
2685 }
2686 
2687 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2688 {
2689     uint8_t round;
2690     int64_t res;
2691 
2692     res = (int64_t)a * (int64_t)b;
2693     round = get_round(vxrm, res, 31);
2694     res   = (res >> 31) + round;
2695 
2696     if (res > INT32_MAX) {
2697         env->vxsat = 0x1;
2698         return INT32_MAX;
2699     } else if (res < INT32_MIN) {
2700         env->vxsat = 0x1;
2701         return INT32_MIN;
2702     } else {
2703         return res;
2704     }
2705 }
2706 
2707 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2708 {
2709     uint8_t round;
2710     uint64_t hi_64, lo_64;
2711     int64_t res;
2712 
2713     if (a == INT64_MIN && b == INT64_MIN) {
2714         env->vxsat = 1;
2715         return INT64_MAX;
2716     }
2717 
2718     muls64(&lo_64, &hi_64, a, b);
2719     round = get_round(vxrm, lo_64, 63);
2720     /*
2721      * Cannot overflow, as there are always
2722      * 2 sign bits after multiply.
2723      */
2724     res = (hi_64 << 1) | (lo_64 >> 63);
2725     if (round) {
2726         if (res == INT64_MAX) {
2727             env->vxsat = 1;
2728         } else {
2729             res += 1;
2730         }
2731     }
2732     return res;
2733 }
2734 
/*
 * vsmul_vv_* and vsmul_vx_*: per-element functions built on
 * vsmul8/16/32/64, each followed by its helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
GEN_VEXT_VV_RM(vsmul_vv_b, 1)
GEN_VEXT_VV_RM(vsmul_vv_h, 2)
GEN_VEXT_VV_RM(vsmul_vv_w, 4)
GEN_VEXT_VV_RM(vsmul_vv_d, 8)

RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
GEN_VEXT_VX_RM(vsmul_vx_b, 1)
GEN_VEXT_VX_RM(vsmul_vx_h, 2)
GEN_VEXT_VX_RM(vsmul_vx_w, 4)
GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2752 
2753 /* Vector Single-Width Scaling Shift Instructions */
2754 static inline uint8_t
2755 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2756 {
2757     uint8_t round, shift = b & 0x7;
2758     uint8_t res;
2759 
2760     round = get_round(vxrm, a, shift);
2761     res   = (a >> shift)  + round;
2762     return res;
2763 }
2764 static inline uint16_t
2765 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2766 {
2767     uint8_t round, shift = b & 0xf;
2768 
2769     round = get_round(vxrm, a, shift);
2770     return (a >> shift) + round;
2771 }
2772 static inline uint32_t
2773 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2774 {
2775     uint8_t round, shift = b & 0x1f;
2776 
2777     round = get_round(vxrm, a, shift);
2778     return (a >> shift) + round;
2779 }
2780 static inline uint64_t
2781 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2782 {
2783     uint8_t round, shift = b & 0x3f;
2784 
2785     round = get_round(vxrm, a, shift);
2786     return (a >> shift) + round;
2787 }
/*
 * vssrl_vv_* and vssrl_vx_*: per-element functions built on
 * vssrl8/16/32/64, each followed by its helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
GEN_VEXT_VV_RM(vssrl_vv_b, 1)
GEN_VEXT_VV_RM(vssrl_vv_h, 2)
GEN_VEXT_VV_RM(vssrl_vv_w, 4)
GEN_VEXT_VV_RM(vssrl_vv_d, 8)

RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
GEN_VEXT_VX_RM(vssrl_vx_b, 1)
GEN_VEXT_VX_RM(vssrl_vx_h, 2)
GEN_VEXT_VX_RM(vssrl_vx_w, 4)
GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2805 
2806 static inline int8_t
2807 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2808 {
2809     uint8_t round, shift = b & 0x7;
2810 
2811     round = get_round(vxrm, a, shift);
2812     return (a >> shift) + round;
2813 }
2814 static inline int16_t
2815 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2816 {
2817     uint8_t round, shift = b & 0xf;
2818 
2819     round = get_round(vxrm, a, shift);
2820     return (a >> shift) + round;
2821 }
2822 static inline int32_t
2823 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2824 {
2825     uint8_t round, shift = b & 0x1f;
2826 
2827     round = get_round(vxrm, a, shift);
2828     return (a >> shift) + round;
2829 }
2830 static inline int64_t
2831 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2832 {
2833     uint8_t round, shift = b & 0x3f;
2834 
2835     round = get_round(vxrm, a, shift);
2836     return (a >> shift) + round;
2837 }
2838 
/*
 * vssra_vv_* and vssra_vx_*: per-element functions built on
 * vssra8/16/32/64, each followed by its helpers (element size 1/2/4/8).
 */
RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
GEN_VEXT_VV_RM(vssra_vv_b, 1)
GEN_VEXT_VV_RM(vssra_vv_h, 2)
GEN_VEXT_VV_RM(vssra_vv_w, 4)
GEN_VEXT_VV_RM(vssra_vv_d, 8)

RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
GEN_VEXT_VX_RM(vssra_vx_b, 1)
GEN_VEXT_VX_RM(vssra_vx_h, 2)
GEN_VEXT_VX_RM(vssra_vx_w, 4)
GEN_VEXT_VX_RM(vssra_vx_d, 8)
2856 
2857 /* Vector Narrowing Fixed-Point Clip Instructions */
2858 static inline int8_t
2859 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2860 {
2861     uint8_t round, shift = b & 0xf;
2862     int16_t res;
2863 
2864     round = get_round(vxrm, a, shift);
2865     res   = (a >> shift)  + round;
2866     if (res > INT8_MAX) {
2867         env->vxsat = 0x1;
2868         return INT8_MAX;
2869     } else if (res < INT8_MIN) {
2870         env->vxsat = 0x1;
2871         return INT8_MIN;
2872     } else {
2873         return res;
2874     }
2875 }
2876 
2877 static inline int16_t
2878 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2879 {
2880     uint8_t round, shift = b & 0x1f;
2881     int32_t res;
2882 
2883     round = get_round(vxrm, a, shift);
2884     res   = (a >> shift)  + round;
2885     if (res > INT16_MAX) {
2886         env->vxsat = 0x1;
2887         return INT16_MAX;
2888     } else if (res < INT16_MIN) {
2889         env->vxsat = 0x1;
2890         return INT16_MIN;
2891     } else {
2892         return res;
2893     }
2894 }
2895 
2896 static inline int32_t
2897 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2898 {
2899     uint8_t round, shift = b & 0x3f;
2900     int64_t res;
2901 
2902     round = get_round(vxrm, a, shift);
2903     res   = (a >> shift)  + round;
2904     if (res > INT32_MAX) {
2905         env->vxsat = 0x1;
2906         return INT32_MAX;
2907     } else if (res < INT32_MIN) {
2908         env->vxsat = 0x1;
2909         return INT32_MIN;
2910     } else {
2911         return res;
2912     }
2913 }
2914 
/*
 * vnclip_wv_* and vnclip_wx_*: per-element functions built on
 * vnclip8/16/32. The vs2 index macro (H2/H4/H8) is one size larger than the
 * destination's (H1/H2/H4); helpers use the destination size 1/2/4.
 */
RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
GEN_VEXT_VV_RM(vnclip_wv_b, 1)
GEN_VEXT_VV_RM(vnclip_wv_h, 2)
GEN_VEXT_VV_RM(vnclip_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
GEN_VEXT_VX_RM(vnclip_wx_b, 1)
GEN_VEXT_VX_RM(vnclip_wx_h, 2)
GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2928 
2929 static inline uint8_t
2930 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2931 {
2932     uint8_t round, shift = b & 0xf;
2933     uint16_t res;
2934 
2935     round = get_round(vxrm, a, shift);
2936     res   = (a >> shift)  + round;
2937     if (res > UINT8_MAX) {
2938         env->vxsat = 0x1;
2939         return UINT8_MAX;
2940     } else {
2941         return res;
2942     }
2943 }
2944 
2945 static inline uint16_t
2946 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2947 {
2948     uint8_t round, shift = b & 0x1f;
2949     uint32_t res;
2950 
2951     round = get_round(vxrm, a, shift);
2952     res   = (a >> shift)  + round;
2953     if (res > UINT16_MAX) {
2954         env->vxsat = 0x1;
2955         return UINT16_MAX;
2956     } else {
2957         return res;
2958     }
2959 }
2960 
2961 static inline uint32_t
2962 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2963 {
2964     uint8_t round, shift = b & 0x3f;
2965     uint64_t res;
2966 
2967     round = get_round(vxrm, a, shift);
2968     res   = (a >> shift)  + round;
2969     if (res > UINT32_MAX) {
2970         env->vxsat = 0x1;
2971         return UINT32_MAX;
2972     } else {
2973         return res;
2974     }
2975 }
2976 
/*
 * vnclipu_wv_* and vnclipu_wx_*: per-element functions built on
 * vnclipu8/16/32. The vs2 index macro (H2/H4/H8) is one size larger than
 * the destination's (H1/H2/H4); helpers use the destination size 1/2/4.
 */
RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
GEN_VEXT_VV_RM(vnclipu_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2990 
2991 /*
 *** Vector Floating-Point Arithmetic Instructions
2993  */
2994 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * Define do_NAME(): load element i of vs1 (as T1, widened to TX1) and of
 * vs2 (as T2, widened to TX2), call OP with the vs2 operand first and
 * &env->fp_status last, and store the result into element i of vd (as TD).
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    TD *dst = (TD *)vd + HD(i);                                \
    TX1 op1 = *((T1 *)vs1 + HS1(i));                           \
    TX2 op2 = *((T2 *)vs2 + HS2(i));                           \
                                                               \
    *dst = OP(op2, op1, &env->fp_status);                      \
}
3003 
/*
 * Define HELPER(NAME) for a vector-vector operation whose per-element body
 * do_NAME() needs env.  Elements [env->vstart, env->vl) are processed;
 * inactive (masked-off) elements are passed to vext_set_elems_1s() with the
 * vma setting, env->vstart is cleared, and the tail is passed to
 * vext_set_elems_1s() with the vta setting.  ESZ is the destination element
 * size in bytes.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (vm || vext_elem_mask(v0, i)) {                \
            /* active element */                          \
            do_##NAME(vd, vs1, vs2, i, env);              \
        } else {                                          \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
        }                                                 \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3031 
3032 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3033 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3034 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3035 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3036 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3037 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3038 
/*
 * Define do_NAME(), the per-element body of a two-operand vector-scalar
 * floating-point operation: the 64-bit scalar s1 is narrowed through T1 and
 * converted to TX1, element i of vs2 is loaded (index mapped through HS2),
 * and OP(vs2[i], scalar, &env->fp_status) is stored in element i of vd.
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TD *dst = (TD *)vd + HD(i);                                \
    TX1 scalar = (TX1)(T1)s1;                                  \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
                                                               \
    *dst = OP(s2, scalar, &env->fp_status);                    \
}
3046 
/*
 * Define HELPER(NAME) for a vector-scalar operation whose per-element body
 * do_NAME() needs env.  Elements [env->vstart, env->vl) are processed;
 * inactive (masked-off) elements are passed to vext_set_elems_1s() with the
 * vma setting, env->vstart is cleared, and the tail is passed to
 * vext_set_elems_1s() with the vta setting.  ESZ is the destination element
 * size in bytes.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (vm || vext_elem_mask(v0, i)) {                \
            /* active element */                          \
            do_##NAME(vd, s1, vs2, i, env);               \
        } else {                                          \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
        }                                                 \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3074 
3075 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3076 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3077 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3078 GEN_VEXT_VF(vfadd_vf_h, 2)
3079 GEN_VEXT_VF(vfadd_vf_w, 4)
3080 GEN_VEXT_VF(vfadd_vf_d, 8)
3081 
3082 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3083 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3084 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3085 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3086 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3087 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3088 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3089 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3090 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3091 GEN_VEXT_VF(vfsub_vf_h, 2)
3092 GEN_VEXT_VF(vfsub_vf_w, 4)
3093 GEN_VEXT_VF(vfsub_vf_d, 8)
3094 
3095 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3096 {
3097     return float16_sub(b, a, s);
3098 }
3099 
3100 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3101 {
3102     return float32_sub(b, a, s);
3103 }
3104 
3105 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3106 {
3107     return float64_sub(b, a, s);
3108 }
3109 
3110 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3111 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3112 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3113 GEN_VEXT_VF(vfrsub_vf_h, 2)
3114 GEN_VEXT_VF(vfrsub_vf_w, 4)
3115 GEN_VEXT_VF(vfrsub_vf_d, 8)
3116 
3117 /* Vector Widening Floating-Point Add/Subtract Instructions */
3118 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3119 {
3120     return float32_add(float16_to_float32(a, true, s),
3121             float16_to_float32(b, true, s), s);
3122 }
3123 
3124 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3125 {
3126     return float64_add(float32_to_float64(a, s),
3127             float32_to_float64(b, s), s);
3128 
3129 }
3130 
3131 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3132 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3133 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3134 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3135 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3136 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3137 GEN_VEXT_VF(vfwadd_vf_h, 4)
3138 GEN_VEXT_VF(vfwadd_vf_w, 8)
3139 
3140 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3141 {
3142     return float32_sub(float16_to_float32(a, true, s),
3143             float16_to_float32(b, true, s), s);
3144 }
3145 
3146 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3147 {
3148     return float64_sub(float32_to_float64(a, s),
3149             float32_to_float64(b, s), s);
3150 
3151 }
3152 
3153 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3154 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3155 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3156 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3157 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3158 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3159 GEN_VEXT_VF(vfwsub_vf_h, 4)
3160 GEN_VEXT_VF(vfwsub_vf_w, 8)
3161 
3162 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3163 {
3164     return float32_add(a, float16_to_float32(b, true, s), s);
3165 }
3166 
3167 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3168 {
3169     return float64_add(a, float32_to_float64(b, s), s);
3170 }
3171 
3172 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3173 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3174 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3175 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3176 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3177 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3178 GEN_VEXT_VF(vfwadd_wf_h, 4)
3179 GEN_VEXT_VF(vfwadd_wf_w, 8)
3180 
3181 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3182 {
3183     return float32_sub(a, float16_to_float32(b, true, s), s);
3184 }
3185 
3186 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3187 {
3188     return float64_sub(a, float32_to_float64(b, s), s);
3189 }
3190 
3191 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3192 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3193 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3194 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3195 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3196 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3197 GEN_VEXT_VF(vfwsub_wf_h, 4)
3198 GEN_VEXT_VF(vfwsub_wf_w, 8)
3199 
3200 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3201 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3202 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3203 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3204 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3205 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3206 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3207 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3208 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3209 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3210 GEN_VEXT_VF(vfmul_vf_h, 2)
3211 GEN_VEXT_VF(vfmul_vf_w, 4)
3212 GEN_VEXT_VF(vfmul_vf_d, 8)
3213 
3214 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3215 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3216 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3217 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3218 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3219 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3220 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3221 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3222 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3223 GEN_VEXT_VF(vfdiv_vf_h, 2)
3224 GEN_VEXT_VF(vfdiv_vf_w, 4)
3225 GEN_VEXT_VF(vfdiv_vf_d, 8)
3226 
3227 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3228 {
3229     return float16_div(b, a, s);
3230 }
3231 
3232 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3233 {
3234     return float32_div(b, a, s);
3235 }
3236 
3237 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3238 {
3239     return float64_div(b, a, s);
3240 }
3241 
3242 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3243 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3244 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3245 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3246 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3247 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3248 
3249 /* Vector Widening Floating-Point Multiply */
3250 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3251 {
3252     return float32_mul(float16_to_float32(a, true, s),
3253             float16_to_float32(b, true, s), s);
3254 }
3255 
3256 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3257 {
3258     return float64_mul(float32_to_float64(a, s),
3259             float32_to_float64(b, s), s);
3260 
3261 }
3262 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3263 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3264 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3265 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3266 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3267 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3268 GEN_VEXT_VF(vfwmul_vf_h, 4)
3269 GEN_VEXT_VF(vfwmul_vf_w, 8)
3270 
3271 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * Define do_NAME(), the per-element body of a three-operand vector-vector
 * floating-point operation.  The current value of vd[i] is read and passed
 * as the third operand: vd[i] = OP(vs2[i], vs1[i], vd[i], &env->fp_status).
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
        CPURISCVState *env)                                        \
{                                                                  \
    TD *dst = (TD *)vd + HD(i);                                    \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD acc = *dst;                                                 \
                                                                   \
    *dst = OP(s2, s1, acc, &env->fp_status);                       \
}
3281 
3282 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3283 {
3284     return float16_muladd(a, b, d, 0, s);
3285 }
3286 
3287 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3288 {
3289     return float32_muladd(a, b, d, 0, s);
3290 }
3291 
3292 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3293 {
3294     return float64_muladd(a, b, d, 0, s);
3295 }
3296 
3297 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3298 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3299 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3300 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3301 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3302 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3303 
/*
 * Define do_NAME(), the per-element body of a three-operand vector-scalar
 * floating-point operation.  The current value of vd[i] is read and passed
 * as the third operand: vd[i] = OP(vs2[i], scalar, vd[i], &env->fp_status),
 * where the scalar is s1 narrowed through T1 and converted to TX1.
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
        CPURISCVState *env)                                       \
{                                                                 \
    TD *dst = (TD *)vd + HD(i);                                   \
    TX1 scalar = (TX1)(T1)s1;                                     \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
    TD acc = *dst;                                                \
                                                                  \
    *dst = OP(s2, scalar, acc, &env->fp_status);                  \
}
3312 
3313 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3314 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3315 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3316 GEN_VEXT_VF(vfmacc_vf_h, 2)
3317 GEN_VEXT_VF(vfmacc_vf_w, 4)
3318 GEN_VEXT_VF(vfmacc_vf_d, 8)
3319 
3320 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3321 {
3322     return float16_muladd(a, b, d,
3323             float_muladd_negate_c | float_muladd_negate_product, s);
3324 }
3325 
3326 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3327 {
3328     return float32_muladd(a, b, d,
3329             float_muladd_negate_c | float_muladd_negate_product, s);
3330 }
3331 
3332 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3333 {
3334     return float64_muladd(a, b, d,
3335             float_muladd_negate_c | float_muladd_negate_product, s);
3336 }
3337 
3338 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3339 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3340 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3341 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3342 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3343 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3344 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3345 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3346 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3347 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3348 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3349 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3350 
3351 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3352 {
3353     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3354 }
3355 
3356 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3357 {
3358     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3359 }
3360 
3361 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3362 {
3363     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3364 }
3365 
3366 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3367 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3368 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3369 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3370 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3371 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3372 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3373 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3374 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3375 GEN_VEXT_VF(vfmsac_vf_h, 2)
3376 GEN_VEXT_VF(vfmsac_vf_w, 4)
3377 GEN_VEXT_VF(vfmsac_vf_d, 8)
3378 
3379 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3380 {
3381     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3382 }
3383 
3384 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3385 {
3386     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3387 }
3388 
3389 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3390 {
3391     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3392 }
3393 
3394 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3395 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3396 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3397 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3398 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3399 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3400 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3401 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3402 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3403 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3404 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3405 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3406 
3407 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3408 {
3409     return float16_muladd(d, b, a, 0, s);
3410 }
3411 
3412 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3413 {
3414     return float32_muladd(d, b, a, 0, s);
3415 }
3416 
3417 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3418 {
3419     return float64_muladd(d, b, a, 0, s);
3420 }
3421 
3422 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3423 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3424 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3425 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3426 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3427 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3428 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3429 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3430 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3431 GEN_VEXT_VF(vfmadd_vf_h, 2)
3432 GEN_VEXT_VF(vfmadd_vf_w, 4)
3433 GEN_VEXT_VF(vfmadd_vf_d, 8)
3434 
3435 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3436 {
3437     return float16_muladd(d, b, a,
3438             float_muladd_negate_c | float_muladd_negate_product, s);
3439 }
3440 
3441 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3442 {
3443     return float32_muladd(d, b, a,
3444             float_muladd_negate_c | float_muladd_negate_product, s);
3445 }
3446 
3447 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3448 {
3449     return float64_muladd(d, b, a,
3450             float_muladd_negate_c | float_muladd_negate_product, s);
3451 }
3452 
3453 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3454 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3455 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3456 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3457 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3458 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3459 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3460 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3461 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3462 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3463 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3464 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3465 
3466 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3467 {
3468     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3469 }
3470 
3471 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3472 {
3473     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3474 }
3475 
3476 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3477 {
3478     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3479 }
3480 
3481 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3482 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3483 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3484 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3485 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3486 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3487 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3488 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3489 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3490 GEN_VEXT_VF(vfmsub_vf_h, 2)
3491 GEN_VEXT_VF(vfmsub_vf_w, 4)
3492 GEN_VEXT_VF(vfmsub_vf_d, 8)
3493 
3494 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3495 {
3496     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3497 }
3498 
3499 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3500 {
3501     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3502 }
3503 
3504 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3505 {
3506     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3507 }
3508 
3509 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3510 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3511 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3512 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3513 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3514 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3515 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3516 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3517 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3518 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3519 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3520 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3521 
3522 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3523 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3524 {
3525     return float32_muladd(float16_to_float32(a, true, s),
3526                         float16_to_float32(b, true, s), d, 0, s);
3527 }
3528 
3529 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3530 {
3531     return float64_muladd(float32_to_float64(a, s),
3532                         float32_to_float64(b, s), d, 0, s);
3533 }
3534 
3535 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3536 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3537 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3538 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3539 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3540 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3541 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3542 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3543 
3544 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3545 {
3546     return float32_muladd(float16_to_float32(a, true, s),
3547                         float16_to_float32(b, true, s), d,
3548                         float_muladd_negate_c | float_muladd_negate_product, s);
3549 }
3550 
3551 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3552 {
3553     return float64_muladd(float32_to_float64(a, s),
3554                         float32_to_float64(b, s), d,
3555                         float_muladd_negate_c | float_muladd_negate_product, s);
3556 }
3557 
3558 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3559 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3560 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3561 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3562 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3563 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3564 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3565 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3566 
3567 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3568 {
3569     return float32_muladd(float16_to_float32(a, true, s),
3570                         float16_to_float32(b, true, s), d,
3571                         float_muladd_negate_c, s);
3572 }
3573 
3574 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3575 {
3576     return float64_muladd(float32_to_float64(a, s),
3577                         float32_to_float64(b, s), d,
3578                         float_muladd_negate_c, s);
3579 }
3580 
3581 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3582 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3583 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3584 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3585 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3586 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3587 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3588 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3589 
3590 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3591 {
3592     return float32_muladd(float16_to_float32(a, true, s),
3593                         float16_to_float32(b, true, s), d,
3594                         float_muladd_negate_product, s);
3595 }
3596 
3597 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3598 {
3599     return float64_muladd(float32_to_float64(a, s),
3600                         float32_to_float64(b, s), d,
3601                         float_muladd_negate_product, s);
3602 }
3603 
3604 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3605 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3606 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3607 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3608 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3609 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3610 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3611 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3612 
3613 /* Vector Floating-Point Square-Root Instruction */
/*
 * Type tuples for single-source operations, in the order (TD, T2, TX2):
 * destination type, source element type, type the source is converted to.
 */
#define OP_UU_H     uint16_t, uint16_t, uint16_t
#define OP_UU_W     uint32_t, uint32_t, uint32_t
#define OP_UU_D     uint64_t, uint64_t, uint64_t
3618 
/*
 * Define do_NAME(), the per-element body of a single-source floating-point
 * operation: vd[i] = OP(vs2[i], &env->fp_status), with element indices
 * mapped through HD/HS2.
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, void *vs2, int i,      \
        CPURISCVState *env)                            \
{                                                      \
    TD *dst = (TD *)vd + HD(i);                        \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
                                                       \
    *dst = OP(s2, &env->fp_status);                    \
}
3626 
/*
 * Define HELPER(NAME) for a single-source vector operation whose per-element
 * body do_NAME() needs env.  Elements [env->vstart, env->vl) are processed;
 * inactive (masked-off) elements are passed to vext_set_elems_1s() with the
 * vma setting and the tail is passed to vext_set_elems_1s() with the vta
 * setting.  ESZ is the destination element size in bytes.
 *
 * When vl is zero no element is touched, but env->vstart is still cleared so
 * that every exit path leaves vstart at zero, as the other helper generators
 * in this file do.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
        CPURISCVState *env, uint32_t desc)             \
{                                                      \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t vma = vext_vma(desc);                     \
    uint32_t i;                                        \
                                                       \
    if (vl == 0) {                                     \
        /* no elements: only reset vstart */           \
        env->vstart = 0;                               \
        return;                                        \
    }                                                  \
    for (i = env->vstart; i < vl; i++) {               \
        if (!vm && !vext_elem_mask(v0, i)) {           \
            /* set masked-off elements to 1s */        \
            vext_set_elems_1s(vd, vma, i * ESZ,        \
                              (i + 1) * ESZ);          \
            continue;                                  \
        }                                              \
        do_##NAME(vd, vs2, i, env);                    \
    }                                                  \
    env->vstart = 0;                                   \
    /* set tail elements to 1s */                      \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
3655 
3656 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3657 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3658 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3659 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3660 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3661 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3662 
/*
 * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
 *
 * Adapted from riscv-v-spec recip.c:
 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
 *
 * frsqrt7() works on the raw bit pattern 'f' of a floating-point value whose
 * layout is given by exp_size and frac_size (sign bit on top).  It returns
 * the bit pattern of the estimate in the same layout.  Callers are expected
 * to have filtered out NaNs, infinities, zeros and negative inputs.
 */
static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
{
    /*
     * 128-entry table indexed by the exponent's low bit followed by the six
     * most significant fraction bits; each entry holds the seven most
     * significant fraction bits of the result.  Kept static so it is not
     * rebuilt on every call.
     */
    static const uint8_t lookup_table[] = {
        52, 51, 50, 48, 47, 46, 44, 43,
        42, 41, 40, 39, 38, 36, 35, 34,
        33, 32, 31, 30, 30, 29, 28, 27,
        26, 25, 24, 23, 23, 22, 21, 20,
        19, 19, 18, 17, 16, 16, 15, 14,
        14, 13, 12, 12, 11, 10, 10, 9,
        9, 8, 7, 7, 6, 6, 5, 4,
        4, 3, 3, 2, 2, 1, 1, 0,
        127, 125, 123, 121, 119, 118, 116, 114,
        113, 111, 109, 108, 106, 105, 103, 102,
        100, 99, 97, 96, 95, 93, 92, 91,
        90, 88, 87, 86, 85, 84, 83, 82,
        80, 79, 78, 77, 76, 75, 74, 73,
        72, 71, 70, 70, 69, 68, 67, 66,
        65, 64, 63, 63, 62, 61, 60, 59,
        59, 58, 57, 56, 56, 55, 54, 53
    };
    const int precision = 7;

    uint64_t sign = extract64(f, frac_size + exp_size, 1);
    uint64_t exp = extract64(f, frac_size, exp_size);
    uint64_t frac = extract64(f, 0, frac_size);

    if (exp == 0 && frac != 0) { /* subnormal */
        /*
         * Normalize the subnormal.  exp is unsigned and deliberately wraps
         * below zero here; the wrapped value feeds the ~exp term below.
         */
        while (extract64(frac, frac_size - 1, 1) == 0) {
            exp--;
            frac <<= 1;
        }

        /* Drop the now-explicit leading one. */
        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
    }

    int idx = ((exp & 1) << (precision - 1)) |
                (frac >> (frac_size - precision + 1));
    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
                            (frac_size - precision);
    /* (3 * bias - 1 - exp) / 2, with bias = 2^(exp_size - 1) - 1 */
    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;

    uint64_t val = 0;
    val = deposit64(val, 0, frac_size, out_frac);
    val = deposit64(val, frac_size, exp_size, out_exp);
    val = deposit64(val, frac_size + exp_size, 1, sign);
    return val;
}
3717 
3718 static float16 frsqrt7_h(float16 f, float_status *s)
3719 {
3720     int exp_size = 5, frac_size = 10;
3721     bool sign = float16_is_neg(f);
3722 
3723     /*
3724      * frsqrt7(sNaN) = canonical NaN
3725      * frsqrt7(-inf) = canonical NaN
3726      * frsqrt7(-normal) = canonical NaN
3727      * frsqrt7(-subnormal) = canonical NaN
3728      */
3729     if (float16_is_signaling_nan(f, s) ||
3730             (float16_is_infinity(f) && sign) ||
3731             (float16_is_normal(f) && sign) ||
3732             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3733         s->float_exception_flags |= float_flag_invalid;
3734         return float16_default_nan(s);
3735     }
3736 
3737     /* frsqrt7(qNaN) = canonical NaN */
3738     if (float16_is_quiet_nan(f, s)) {
3739         return float16_default_nan(s);
3740     }
3741 
3742     /* frsqrt7(+-0) = +-inf */
3743     if (float16_is_zero(f)) {
3744         s->float_exception_flags |= float_flag_divbyzero;
3745         return float16_set_sign(float16_infinity, sign);
3746     }
3747 
3748     /* frsqrt7(+inf) = +0 */
3749     if (float16_is_infinity(f) && !sign) {
3750         return float16_set_sign(float16_zero, sign);
3751     }
3752 
3753     /* +normal, +subnormal */
3754     uint64_t val = frsqrt7(f, exp_size, frac_size);
3755     return make_float16(val);
3756 }
3757 
3758 static float32 frsqrt7_s(float32 f, float_status *s)
3759 {
3760     int exp_size = 8, frac_size = 23;
3761     bool sign = float32_is_neg(f);
3762 
3763     /*
3764      * frsqrt7(sNaN) = canonical NaN
3765      * frsqrt7(-inf) = canonical NaN
3766      * frsqrt7(-normal) = canonical NaN
3767      * frsqrt7(-subnormal) = canonical NaN
3768      */
3769     if (float32_is_signaling_nan(f, s) ||
3770             (float32_is_infinity(f) && sign) ||
3771             (float32_is_normal(f) && sign) ||
3772             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3773         s->float_exception_flags |= float_flag_invalid;
3774         return float32_default_nan(s);
3775     }
3776 
3777     /* frsqrt7(qNaN) = canonical NaN */
3778     if (float32_is_quiet_nan(f, s)) {
3779         return float32_default_nan(s);
3780     }
3781 
3782     /* frsqrt7(+-0) = +-inf */
3783     if (float32_is_zero(f)) {
3784         s->float_exception_flags |= float_flag_divbyzero;
3785         return float32_set_sign(float32_infinity, sign);
3786     }
3787 
3788     /* frsqrt7(+inf) = +0 */
3789     if (float32_is_infinity(f) && !sign) {
3790         return float32_set_sign(float32_zero, sign);
3791     }
3792 
3793     /* +normal, +subnormal */
3794     uint64_t val = frsqrt7(f, exp_size, frac_size);
3795     return make_float32(val);
3796 }
3797 
3798 static float64 frsqrt7_d(float64 f, float_status *s)
3799 {
3800     int exp_size = 11, frac_size = 52;
3801     bool sign = float64_is_neg(f);
3802 
3803     /*
3804      * frsqrt7(sNaN) = canonical NaN
3805      * frsqrt7(-inf) = canonical NaN
3806      * frsqrt7(-normal) = canonical NaN
3807      * frsqrt7(-subnormal) = canonical NaN
3808      */
3809     if (float64_is_signaling_nan(f, s) ||
3810             (float64_is_infinity(f) && sign) ||
3811             (float64_is_normal(f) && sign) ||
3812             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3813         s->float_exception_flags |= float_flag_invalid;
3814         return float64_default_nan(s);
3815     }
3816 
3817     /* frsqrt7(qNaN) = canonical NaN */
3818     if (float64_is_quiet_nan(f, s)) {
3819         return float64_default_nan(s);
3820     }
3821 
3822     /* frsqrt7(+-0) = +-inf */
3823     if (float64_is_zero(f)) {
3824         s->float_exception_flags |= float_flag_divbyzero;
3825         return float64_set_sign(float64_infinity, sign);
3826     }
3827 
3828     /* frsqrt7(+inf) = +0 */
3829     if (float64_is_infinity(f) && !sign) {
3830         return float64_set_sign(float64_zero, sign);
3831     }
3832 
3833     /* +normal, +subnormal */
3834     uint64_t val = frsqrt7(f, exp_size, frac_size);
3835     return make_float64(val);
3836 }
3837 
3838 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3839 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3840 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3841 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3842 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3843 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3844 
3845 /*
3846  * Vector Floating-Point Reciprocal Estimate Instruction
3847  *
3848  * Adapted from riscv-v-spec recip.c:
3849  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3850  */
3851 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3852                       float_status *s)
3853 {
3854     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3855     uint64_t exp = extract64(f, frac_size, exp_size);
3856     uint64_t frac = extract64(f, 0, frac_size);
3857 
3858     const uint8_t lookup_table[] = {
3859         127, 125, 123, 121, 119, 117, 116, 114,
3860         112, 110, 109, 107, 105, 104, 102, 100,
3861         99, 97, 96, 94, 93, 91, 90, 88,
3862         87, 85, 84, 83, 81, 80, 79, 77,
3863         76, 75, 74, 72, 71, 70, 69, 68,
3864         66, 65, 64, 63, 62, 61, 60, 59,
3865         58, 57, 56, 55, 54, 53, 52, 51,
3866         50, 49, 48, 47, 46, 45, 44, 43,
3867         42, 41, 40, 40, 39, 38, 37, 36,
3868         35, 35, 34, 33, 32, 31, 31, 30,
3869         29, 28, 28, 27, 26, 25, 25, 24,
3870         23, 23, 22, 21, 21, 20, 19, 19,
3871         18, 17, 17, 16, 15, 15, 14, 14,
3872         13, 12, 12, 11, 11, 10, 9, 9,
3873         8, 8, 7, 7, 6, 5, 5, 4,
3874         4, 3, 3, 2, 2, 1, 1, 0
3875     };
3876     const int precision = 7;
3877 
3878     if (exp == 0 && frac != 0) { /* subnormal */
3879         /* Normalize the subnormal. */
3880         while (extract64(frac, frac_size - 1, 1) == 0) {
3881             exp--;
3882             frac <<= 1;
3883         }
3884 
3885         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3886 
3887         if (exp != 0 && exp != UINT64_MAX) {
3888             /*
3889              * Overflow to inf or max value of same sign,
3890              * depending on sign and rounding mode.
3891              */
3892             s->float_exception_flags |= (float_flag_inexact |
3893                                          float_flag_overflow);
3894 
3895             if ((s->float_rounding_mode == float_round_to_zero) ||
3896                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3897                 ((s->float_rounding_mode == float_round_up) && sign)) {
3898                 /* Return greatest/negative finite value. */
3899                 return (sign << (exp_size + frac_size)) |
3900                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3901             } else {
3902                 /* Return +-inf. */
3903                 return (sign << (exp_size + frac_size)) |
3904                     MAKE_64BIT_MASK(frac_size, exp_size);
3905             }
3906         }
3907     }
3908 
3909     int idx = frac >> (frac_size - precision);
3910     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3911                             (frac_size - precision);
3912     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3913 
3914     if (out_exp == 0 || out_exp == UINT64_MAX) {
3915         /*
3916          * The result is subnormal, but don't raise the underflow exception,
3917          * because there's no additional loss of precision.
3918          */
3919         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3920         if (out_exp == UINT64_MAX) {
3921             out_frac >>= 1;
3922             out_exp = 0;
3923         }
3924     }
3925 
3926     uint64_t val = 0;
3927     val = deposit64(val, 0, frac_size, out_frac);
3928     val = deposit64(val, frac_size, exp_size, out_exp);
3929     val = deposit64(val, frac_size + exp_size, 1, sign);
3930     return val;
3931 }
3932 
3933 static float16 frec7_h(float16 f, float_status *s)
3934 {
3935     int exp_size = 5, frac_size = 10;
3936     bool sign = float16_is_neg(f);
3937 
3938     /* frec7(+-inf) = +-0 */
3939     if (float16_is_infinity(f)) {
3940         return float16_set_sign(float16_zero, sign);
3941     }
3942 
3943     /* frec7(+-0) = +-inf */
3944     if (float16_is_zero(f)) {
3945         s->float_exception_flags |= float_flag_divbyzero;
3946         return float16_set_sign(float16_infinity, sign);
3947     }
3948 
3949     /* frec7(sNaN) = canonical NaN */
3950     if (float16_is_signaling_nan(f, s)) {
3951         s->float_exception_flags |= float_flag_invalid;
3952         return float16_default_nan(s);
3953     }
3954 
3955     /* frec7(qNaN) = canonical NaN */
3956     if (float16_is_quiet_nan(f, s)) {
3957         return float16_default_nan(s);
3958     }
3959 
3960     /* +-normal, +-subnormal */
3961     uint64_t val = frec7(f, exp_size, frac_size, s);
3962     return make_float16(val);
3963 }
3964 
3965 static float32 frec7_s(float32 f, float_status *s)
3966 {
3967     int exp_size = 8, frac_size = 23;
3968     bool sign = float32_is_neg(f);
3969 
3970     /* frec7(+-inf) = +-0 */
3971     if (float32_is_infinity(f)) {
3972         return float32_set_sign(float32_zero, sign);
3973     }
3974 
3975     /* frec7(+-0) = +-inf */
3976     if (float32_is_zero(f)) {
3977         s->float_exception_flags |= float_flag_divbyzero;
3978         return float32_set_sign(float32_infinity, sign);
3979     }
3980 
3981     /* frec7(sNaN) = canonical NaN */
3982     if (float32_is_signaling_nan(f, s)) {
3983         s->float_exception_flags |= float_flag_invalid;
3984         return float32_default_nan(s);
3985     }
3986 
3987     /* frec7(qNaN) = canonical NaN */
3988     if (float32_is_quiet_nan(f, s)) {
3989         return float32_default_nan(s);
3990     }
3991 
3992     /* +-normal, +-subnormal */
3993     uint64_t val = frec7(f, exp_size, frac_size, s);
3994     return make_float32(val);
3995 }
3996 
3997 static float64 frec7_d(float64 f, float_status *s)
3998 {
3999     int exp_size = 11, frac_size = 52;
4000     bool sign = float64_is_neg(f);
4001 
4002     /* frec7(+-inf) = +-0 */
4003     if (float64_is_infinity(f)) {
4004         return float64_set_sign(float64_zero, sign);
4005     }
4006 
4007     /* frec7(+-0) = +-inf */
4008     if (float64_is_zero(f)) {
4009         s->float_exception_flags |= float_flag_divbyzero;
4010         return float64_set_sign(float64_infinity, sign);
4011     }
4012 
4013     /* frec7(sNaN) = canonical NaN */
4014     if (float64_is_signaling_nan(f, s)) {
4015         s->float_exception_flags |= float_flag_invalid;
4016         return float64_default_nan(s);
4017     }
4018 
4019     /* frec7(qNaN) = canonical NaN */
4020     if (float64_is_quiet_nan(f, s)) {
4021         return float64_default_nan(s);
4022     }
4023 
4024     /* +-normal, +-subnormal */
4025     uint64_t val = frec7(f, exp_size, frac_size, s);
4026     return make_float64(val);
4027 }
4028 
4029 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4030 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4031 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4032 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4033 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4034 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4035 
4036 /* Vector Floating-Point MIN/MAX Instructions */
4037 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4038 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4039 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4040 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4041 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4042 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4043 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4044 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4045 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4046 GEN_VEXT_VF(vfmin_vf_h, 2)
4047 GEN_VEXT_VF(vfmin_vf_w, 4)
4048 GEN_VEXT_VF(vfmin_vf_d, 8)
4049 
4050 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4051 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4052 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4053 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4054 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4055 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4056 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4057 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4058 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4059 GEN_VEXT_VF(vfmax_vf_h, 2)
4060 GEN_VEXT_VF(vfmax_vf_w, 4)
4061 GEN_VEXT_VF(vfmax_vf_d, 8)
4062 
4063 /* Vector Floating-Point Sign-Injection Instructions */
4064 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4065 {
4066     return deposit64(b, 0, 15, a);
4067 }
4068 
4069 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4070 {
4071     return deposit64(b, 0, 31, a);
4072 }
4073 
4074 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4075 {
4076     return deposit64(b, 0, 63, a);
4077 }
4078 
4079 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4080 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4081 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4082 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4083 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4084 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4085 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4086 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4087 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4088 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4089 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4090 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4091 
4092 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4093 {
4094     return deposit64(~b, 0, 15, a);
4095 }
4096 
4097 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4098 {
4099     return deposit64(~b, 0, 31, a);
4100 }
4101 
4102 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4103 {
4104     return deposit64(~b, 0, 63, a);
4105 }
4106 
4107 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4108 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4109 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4110 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4111 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4112 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4113 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4114 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4115 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4116 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4117 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4118 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4119 
4120 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4121 {
4122     return deposit64(b ^ a, 0, 15, a);
4123 }
4124 
4125 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4126 {
4127     return deposit64(b ^ a, 0, 31, a);
4128 }
4129 
4130 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4131 {
4132     return deposit64(b ^ a, 0, 63, a);
4133 }
4134 
4135 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4136 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4137 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4138 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4139 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4140 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4141 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4142 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4143 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4144 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4145 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4146 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4147 
4148 /* Vector Floating-Point Compare Instructions */
4149 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4150 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4151                   CPURISCVState *env, uint32_t desc)          \
4152 {                                                             \
4153     uint32_t vm = vext_vm(desc);                              \
4154     uint32_t vl = env->vl;                                    \
4155     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
4156     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4157     uint32_t vma = vext_vma(desc);                            \
4158     uint32_t i;                                               \
4159                                                               \
4160     for (i = env->vstart; i < vl; i++) {                      \
4161         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4162         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4163         if (!vm && !vext_elem_mask(v0, i)) {                  \
4164             /* set masked-off elements to 1s */               \
4165             if (vma) {                                        \
4166                 vext_set_elem_mask(vd, i, 1);                 \
4167             }                                                 \
4168             continue;                                         \
4169         }                                                     \
4170         vext_set_elem_mask(vd, i,                             \
4171                            DO_OP(s2, s1, &env->fp_status));   \
4172     }                                                         \
4173     env->vstart = 0;                                          \
4174     /* mask destination register are always tail-agnostic */  \
4175     /* set tail elements to 1s */                             \
4176     if (vta_all_1s) {                                         \
4177         for (; i < total_elems; i++) {                        \
4178             vext_set_elem_mask(vd, i, 1);                     \
4179         }                                                     \
4180     }                                                         \
4181 }
4182 
4183 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4184 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4185 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4186 
4187 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4188 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4189                   CPURISCVState *env, uint32_t desc)                \
4190 {                                                                   \
4191     uint32_t vm = vext_vm(desc);                                    \
4192     uint32_t vl = env->vl;                                          \
4193     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4194     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4195     uint32_t vma = vext_vma(desc);                                  \
4196     uint32_t i;                                                     \
4197                                                                     \
4198     for (i = env->vstart; i < vl; i++) {                            \
4199         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4200         if (!vm && !vext_elem_mask(v0, i)) {                        \
4201             /* set masked-off elements to 1s */                     \
4202             if (vma) {                                              \
4203                 vext_set_elem_mask(vd, i, 1);                       \
4204             }                                                       \
4205             continue;                                               \
4206         }                                                           \
4207         vext_set_elem_mask(vd, i,                                   \
4208                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4209     }                                                               \
4210     env->vstart = 0;                                                \
4211     /* mask destination register are always tail-agnostic */        \
4212     /* set tail elements to 1s */                                   \
4213     if (vta_all_1s) {                                               \
4214         for (; i < total_elems; i++) {                              \
4215             vext_set_elem_mask(vd, i, 1);                           \
4216         }                                                           \
4217     }                                                               \
4218 }
4219 
4220 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4221 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4222 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4223 
4224 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4225 {
4226     FloatRelation compare = float16_compare_quiet(a, b, s);
4227     return compare != float_relation_equal;
4228 }
4229 
4230 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4231 {
4232     FloatRelation compare = float32_compare_quiet(a, b, s);
4233     return compare != float_relation_equal;
4234 }
4235 
4236 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4237 {
4238     FloatRelation compare = float64_compare_quiet(a, b, s);
4239     return compare != float_relation_equal;
4240 }
4241 
4242 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4243 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4244 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4245 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4246 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4247 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4248 
4249 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4250 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4251 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4252 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4253 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4254 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4255 
4256 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4257 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4258 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4259 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4260 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4261 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4262 
4263 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4264 {
4265     FloatRelation compare = float16_compare(a, b, s);
4266     return compare == float_relation_greater;
4267 }
4268 
4269 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4270 {
4271     FloatRelation compare = float32_compare(a, b, s);
4272     return compare == float_relation_greater;
4273 }
4274 
4275 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4276 {
4277     FloatRelation compare = float64_compare(a, b, s);
4278     return compare == float_relation_greater;
4279 }
4280 
4281 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4282 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4283 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4284 
4285 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4286 {
4287     FloatRelation compare = float16_compare(a, b, s);
4288     return compare == float_relation_greater ||
4289            compare == float_relation_equal;
4290 }
4291 
4292 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4293 {
4294     FloatRelation compare = float32_compare(a, b, s);
4295     return compare == float_relation_greater ||
4296            compare == float_relation_equal;
4297 }
4298 
4299 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4300 {
4301     FloatRelation compare = float64_compare(a, b, s);
4302     return compare == float_relation_greater ||
4303            compare == float_relation_equal;
4304 }
4305 
4306 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4307 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4308 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4309 
/* Vector Floating-Point Classify Instruction */

/*
 * OPIVV1 - define do_<NAME>(), the per-element step of a unary operation
 * that needs no environment: element i of vs2 is read as T2, held in a
 * TX2 temporary, passed to OP, and the result is stored as TD into
 * element i of vd.  HS2 and HD adjust the source and destination index.
 */
#define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i)      \
{                                                      \
    const T2 *src = vs2;                               \
    TD *dst = vd;                                      \
    TX2 operand = src[HS2(i)];                         \
                                                       \
    dst[HD(i)] = OP(operand);                          \
}
4317 
/*
 * GEN_VEXT_V - define HELPER(NAME), the loop around do_<NAME>() for a
 * unary operation with ESZ-byte destination elements.
 *
 * Elements from env->vstart up to env->vl are visited.  do_<NAME>() runs
 * for an element when vm is set or its bit in v0 is set; otherwise the
 * element's bytes are handed to vext_set_elems_1s() together with vma.
 * env->vstart is reset to 0 afterwards and the bytes from vl * ESZ up to
 * total_elems * ESZ are handed to vext_set_elems_1s() together with vta.
 */
#define GEN_VEXT_V(NAME, ESZ)                              \
void HELPER(NAME)(void *vd, void *v0, void *vs2,           \
                  CPURISCVState *env, uint32_t desc)       \
{                                                          \
    const uint32_t unmasked = vext_vm(desc);               \
    const uint32_t active_len = env->vl;                   \
    const uint32_t elem_count =                            \
        vext_get_total_elems(env, desc, ESZ);              \
    const uint32_t tail_agnostic = vext_vta(desc);         \
    const uint32_t mask_agnostic = vext_vma(desc);         \
    uint32_t idx;                                          \
                                                           \
    for (idx = env->vstart; idx < active_len; idx++) {     \
        if (unmasked || vext_elem_mask(v0, idx)) {         \
            do_##NAME(vd, vs2, idx);                       \
        } else {                                           \
            /* set masked-off elements to 1s */            \
            vext_set_elems_1s(vd, mask_agnostic,           \
                              idx * ESZ,                   \
                              (idx + 1) * ESZ);            \
        }                                                  \
    }                                                      \
    env->vstart = 0;                                       \
    /* set tail elements to 1s */                          \
    vext_set_elems_1s(vd, tail_agnostic,                   \
                      active_len * ESZ,                    \
                      elem_count * ESZ);                   \
}
4344 
4345 target_ulong fclass_h(uint64_t frs1)
4346 {
4347     float16 f = frs1;
4348     bool sign = float16_is_neg(f);
4349 
4350     if (float16_is_infinity(f)) {
4351         return sign ? 1 << 0 : 1 << 7;
4352     } else if (float16_is_zero(f)) {
4353         return sign ? 1 << 3 : 1 << 4;
4354     } else if (float16_is_zero_or_denormal(f)) {
4355         return sign ? 1 << 2 : 1 << 5;
4356     } else if (float16_is_any_nan(f)) {
4357         float_status s = { }; /* for snan_bit_is_one */
4358         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4359     } else {
4360         return sign ? 1 << 1 : 1 << 6;
4361     }
4362 }
4363 
4364 target_ulong fclass_s(uint64_t frs1)
4365 {
4366     float32 f = frs1;
4367     bool sign = float32_is_neg(f);
4368 
4369     if (float32_is_infinity(f)) {
4370         return sign ? 1 << 0 : 1 << 7;
4371     } else if (float32_is_zero(f)) {
4372         return sign ? 1 << 3 : 1 << 4;
4373     } else if (float32_is_zero_or_denormal(f)) {
4374         return sign ? 1 << 2 : 1 << 5;
4375     } else if (float32_is_any_nan(f)) {
4376         float_status s = { }; /* for snan_bit_is_one */
4377         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4378     } else {
4379         return sign ? 1 << 1 : 1 << 6;
4380     }
4381 }
4382 
4383 target_ulong fclass_d(uint64_t frs1)
4384 {
4385     float64 f = frs1;
4386     bool sign = float64_is_neg(f);
4387 
4388     if (float64_is_infinity(f)) {
4389         return sign ? 1 << 0 : 1 << 7;
4390     } else if (float64_is_zero(f)) {
4391         return sign ? 1 << 3 : 1 << 4;
4392     } else if (float64_is_zero_or_denormal(f)) {
4393         return sign ? 1 << 2 : 1 << 5;
4394     } else if (float64_is_any_nan(f)) {
4395         float_status s = { }; /* for snan_bit_is_one */
4396         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4397     } else {
4398         return sign ? 1 << 1 : 1 << 6;
4399     }
4400 }
4401 
4402 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4403 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4404 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4405 GEN_VEXT_V(vfclass_v_h, 2)
4406 GEN_VEXT_V(vfclass_v_w, 4)
4407 GEN_VEXT_V(vfclass_v_d, 8)
4408 
4409 /* Vector Floating-Point Merge Instruction */
4410 
4411 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4412 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4413                   CPURISCVState *env, uint32_t desc)          \
4414 {                                                             \
4415     uint32_t vm = vext_vm(desc);                              \
4416     uint32_t vl = env->vl;                                    \
4417     uint32_t esz = sizeof(ETYPE);                             \
4418     uint32_t total_elems =                                    \
4419         vext_get_total_elems(env, desc, esz);                 \
4420     uint32_t vta = vext_vta(desc);                            \
4421     uint32_t i;                                               \
4422                                                               \
4423     for (i = env->vstart; i < vl; i++) {                      \
4424         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4425         *((ETYPE *)vd + H(i))                                 \
4426           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4427     }                                                         \
4428     env->vstart = 0;                                          \
4429     /* set tail elements to 1s */                             \
4430     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4431 }
4432 
4433 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4434 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4435 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4436 
4437 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4438 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4439 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4440 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4441 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4442 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4443 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4444 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4445 
4446 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4447 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4448 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4449 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4450 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4451 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4452 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4453 
4454 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4455 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4456 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4457 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4458 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4459 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4460 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4461 
4462 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4463 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4464 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4465 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4466 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4467 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4468 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4469 
4470 /* Widening Floating-Point/Integer Type-Convert Instructions */
4471 /* (TD, T2, TX2) */
4472 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4473 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4474 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4475 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4476 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4477 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4478 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4479 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4480 
4481 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4482 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4483 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4484 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4485 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4486 
4487 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4488 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4489 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4490 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4491 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4492 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4493 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4494 
4495 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4496 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4497 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4498 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4499 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4500 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4501 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4502 
4503 /*
4504  * vfwcvt.f.f.v vd, vs2, vm
4505  * Convert single-width float to double-width float.
4506  */
4507 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4508 {
4509     return float16_to_float32(a, true, s);
4510 }
4511 
4512 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4513 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4514 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4515 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4516 
4517 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4518 /* (TD, T2, TX2) */
4519 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4520 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4521 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4522 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4523 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4524 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4525 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4526 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4527 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4528 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4529 
4530 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4531 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4532 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4533 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4534 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4535 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4536 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4537 
4538 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4539 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4540 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4541 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4542 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4543 
4544 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4545 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4546 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4547 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4548 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4549 
4550 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4551 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4552 {
4553     return float32_to_float16(a, true, s);
4554 }
4555 
4556 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4557 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4558 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4559 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4560 
4561 /*
4562  *** Vector Reduction Operations
4563  */
4564 /* Vector Single-Width Integer Reduction Instructions */
/*
 * Generate the helper for one integer reduction.
 *
 *   NAME     helper name
 *   TD/HD    accumulator (destination) type and its element-index macro
 *   TS2/HS2  vs2 element type and its element-index macro
 *   OP       two-operand macro combining the accumulator with one element
 *
 * The accumulator starts as element 0 of vs1.  Every element of vs2 in
 * [vstart, vl) that is active (vm set, or its v0 mask bit set) is cast to
 * TD and folded in with OP.  The result is stored to element 0 of vd,
 * vstart is cleared, and the bytes of vd from sizeof(TD) up to
 * simd_maxsz(desc) are handed to vext_set_elems_1s() under control of vta.
 */
#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
        void *vs2, CPURISCVState *env, uint32_t desc)     \
{                                                         \
    const uint32_t vm = vext_vm(desc);                    \
    const uint32_t vl = env->vl;                          \
    const uint32_t acc_bytes = sizeof(TD);                \
    const uint32_t reg_bytes = simd_maxsz(desc);          \
    const uint32_t vta = vext_vta(desc);                  \
    TD acc = *((TD *)vs1 + HD(0));                        \
    uint32_t idx = env->vstart;                           \
                                                          \
    while (idx < vl) {                                    \
        TS2 elem = *((TS2 *)vs2 + HS2(idx));              \
        if (vm || vext_elem_mask(v0, idx)) {              \
            acc = OP(acc, (TD)elem);                      \
        }                                                 \
        idx++;                                            \
    }                                                     \
    *((TD *)vd + HD(0)) = acc;                            \
    env->vstart = 0;                                      \
    /* everything past element 0 of vd is tail */         \
    vext_set_elems_1s(vd, vta, acc_bytes, reg_bytes);     \
}
4589 
4590 /* vd[0] = sum(vs1[0], vs2[*]) */
4591 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4592 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4593 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4594 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4595 
4596 /* vd[0] = maxu(vs1[0], vs2[*]) */
4597 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4598 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4599 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4600 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4601 
4602 /* vd[0] = max(vs1[0], vs2[*]) */
4603 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4604 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4605 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4606 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4607 
4608 /* vd[0] = minu(vs1[0], vs2[*]) */
4609 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4610 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4611 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4612 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4613 
4614 /* vd[0] = min(vs1[0], vs2[*]) */
4615 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4616 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4617 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4618 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4619 
4620 /* vd[0] = and(vs1[0], vs2[*]) */
4621 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4622 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4623 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4624 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4625 
4626 /* vd[0] = or(vs1[0], vs2[*]) */
4627 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4628 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4629 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4630 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4631 
4632 /* vd[0] = xor(vs1[0], vs2[*]) */
4633 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4634 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4635 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4636 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4637 
4638 /* Vector Widening Integer Reduction Instructions */
4639 /* signed sum reduction into double-width accumulator */
4640 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4641 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4642 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4643 
4644 /* Unsigned sum reduction into double-width accumulator */
4645 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4646 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4647 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4648 
4649 /* Vector Single-Width Floating-Point Reduction Instructions */
/*
 * Generate the helper for one floating-point reduction.
 *
 * Same structure as the integer reduction generator, except that OP is
 * called as OP(accumulator, element, &env->fp_status).  The accumulator
 * starts as element 0 of vs1, active elements of vs2 in [vstart, vl) are
 * folded in strictly in index order, and the result goes to element 0 of
 * vd.  vstart is cleared and the rest of vd is handed to
 * vext_set_elems_1s() under control of vta.
 */
#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
                  void *vs2, CPURISCVState *env,           \
                  uint32_t desc)                           \
{                                                          \
    const uint32_t vm = vext_vm(desc);                     \
    const uint32_t vl = env->vl;                           \
    const uint32_t acc_bytes = sizeof(TD);                 \
    const uint32_t reg_bytes = simd_maxsz(desc);           \
    const uint32_t vta = vext_vta(desc);                   \
    TD acc = *((TD *)vs1 + HD(0));                         \
    uint32_t idx = env->vstart;                            \
                                                           \
    while (idx < vl) {                                     \
        TS2 elem = *((TS2 *)vs2 + HS2(idx));               \
        if (vm || vext_elem_mask(v0, idx)) {               \
            acc = OP(acc, (TD)elem, &env->fp_status);      \
        }                                                  \
        idx++;                                             \
    }                                                      \
    *((TD *)vd + HD(0)) = acc;                             \
    env->vstart = 0;                                       \
    /* everything past element 0 of vd is tail */          \
    vext_set_elems_1s(vd, vta, acc_bytes, reg_bytes);      \
}
4675 
4676 /* Unordered sum */
4677 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4678 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4679 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4680 
4681 /* Ordered sum */
4682 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4683 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4684 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4685 
4686 /* Maximum value */
4687 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4688 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4689 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4690 
4691 /* Minimum value */
4692 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4693 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4694 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4695 
4696 /* Vector Widening Floating-Point Add Instructions */
4697 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4698 {
4699     return float32_add(a, float16_to_float32(b, true, s), s);
4700 }
4701 
4702 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4703 {
4704     return float64_add(a, float32_to_float64(b, s), s);
4705 }
4706 
4707 /* Vector Widening Floating-Point Reduction Instructions */
4708 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4709 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4710 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4711 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4712 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4713 
4714 /*
4715  *** Vector Mask Operations
4716  */
4717 /* Vector Mask-Register Logical Instructions */
/*
 * Generate the helper for one mask-register logical instruction.
 *
 * For every index in [vstart, vl) the destination mask bit is
 * OP(vs2 bit, vs1 bit); note that vs2 is the first operand.  v0 is not
 * consulted.  vstart is cleared afterwards.  When vext_vta_all_1s(desc)
 * is set, the remaining bits up to riscv_cpu_cfg(env)->vlen are set to 1.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    const uint32_t vl = env->vl;                          \
    const uint32_t total_elems = riscv_cpu_cfg(env)->vlen;\
    const uint32_t vta_all_1s = vext_vta_all_1s(desc);    \
    uint32_t idx = env->vstart;                           \
    int src1, src2;                                       \
                                                          \
    while (idx < vl) {                                    \
        src1 = vext_elem_mask(vs1, idx);                  \
        src2 = vext_elem_mask(vs2, idx);                  \
        vext_set_elem_mask(vd, idx, OP(src2, src1));      \
        idx++;                                            \
    }                                                     \
    env->vstart = 0;                                      \
    /*                                                    \
     * A mask destination register is always treated as   \
     * tail-agnostic: fill the tail with 1s if requested.  \
     */                                                   \
    while (vta_all_1s && idx < total_elems) {             \
        vext_set_elem_mask(vd, idx, 1);                   \
        idx++;                                            \
    }                                                     \
}
4745 
/*
 * Single-bit logical operators for the mask-register instructions.
 *
 * Negation uses logical NOT (!), so the results are only meaningful when
 * the operands are 0 or 1.  Every argument is parenthesized so that a
 * compound expression (e.g. "a | b") is combined as a unit instead of
 * being split up by operator precedence.
 */
#define DO_NAND(N, M)  (!((N) & (M)))
#define DO_ANDNOT(N, M)  ((N) & !(M))
#define DO_NOR(N, M)  (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)  (!((N) ^ (M)))
4751 
4752 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4753 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4754 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4755 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4756 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4757 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4758 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4759 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4760 
4761 /* Vector count population in mask vcpop */
4762 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4763                              uint32_t desc)
4764 {
4765     target_ulong cnt = 0;
4766     uint32_t vm = vext_vm(desc);
4767     uint32_t vl = env->vl;
4768     int i;
4769 
4770     for (i = env->vstart; i < vl; i++) {
4771         if (vm || vext_elem_mask(v0, i)) {
4772             if (vext_elem_mask(vs2, i)) {
4773                 cnt++;
4774             }
4775         }
4776     }
4777     env->vstart = 0;
4778     return cnt;
4779 }
4780 
4781 /* vfirst find-first-set mask bit*/
4782 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4783                               uint32_t desc)
4784 {
4785     uint32_t vm = vext_vm(desc);
4786     uint32_t vl = env->vl;
4787     int i;
4788 
4789     for (i = env->vstart; i < vl; i++) {
4790         if (vm || vext_elem_mask(v0, i)) {
4791             if (vext_elem_mask(vs2, i)) {
4792                 return i;
4793             }
4794         }
4795     }
4796     env->vstart = 0;
4797     return -1LL;
4798 }
4799 
/*
 * Mode selector for vmsetm(): which destination bits are set relative to
 * the first set bit found in the source mask.
 */
enum set_mask_type {
    ONLY_FIRST    = 1,  /* only the position of the first set bit */
    INCLUDE_FIRST = 2,  /* everything up to and including that position */
    BEFORE_FIRST  = 3,  /* everything strictly before that position */
};
4805 
4806 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4807                    uint32_t desc, enum set_mask_type type)
4808 {
4809     uint32_t vm = vext_vm(desc);
4810     uint32_t vl = env->vl;
4811     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4812     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4813     uint32_t vma = vext_vma(desc);
4814     int i;
4815     bool first_mask_bit = false;
4816 
4817     for (i = env->vstart; i < vl; i++) {
4818         if (!vm && !vext_elem_mask(v0, i)) {
4819             /* set masked-off elements to 1s */
4820             if (vma) {
4821                 vext_set_elem_mask(vd, i, 1);
4822             }
4823             continue;
4824         }
4825         /* write a zero to all following active elements */
4826         if (first_mask_bit) {
4827             vext_set_elem_mask(vd, i, 0);
4828             continue;
4829         }
4830         if (vext_elem_mask(vs2, i)) {
4831             first_mask_bit = true;
4832             if (type == BEFORE_FIRST) {
4833                 vext_set_elem_mask(vd, i, 0);
4834             } else {
4835                 vext_set_elem_mask(vd, i, 1);
4836             }
4837         } else {
4838             if (type == ONLY_FIRST) {
4839                 vext_set_elem_mask(vd, i, 0);
4840             } else {
4841                 vext_set_elem_mask(vd, i, 1);
4842             }
4843         }
4844     }
4845     env->vstart = 0;
4846     /* mask destination register are always tail-agnostic */
4847     /* set tail elements to 1s */
4848     if (vta_all_1s) {
4849         for (; i < total_elems; i++) {
4850             vext_set_elem_mask(vd, i, 1);
4851         }
4852     }
4853 }
4854 
4855 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4856                      uint32_t desc)
4857 {
4858     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4859 }
4860 
4861 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4862                      uint32_t desc)
4863 {
4864     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4865 }
4866 
4867 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4868                      uint32_t desc)
4869 {
4870     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4871 }
4872 
4873 /* Vector Iota Instruction */
4874 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4875 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4876                   uint32_t desc)                                          \
4877 {                                                                         \
4878     uint32_t vm = vext_vm(desc);                                          \
4879     uint32_t vl = env->vl;                                                \
4880     uint32_t esz = sizeof(ETYPE);                                         \
4881     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4882     uint32_t vta = vext_vta(desc);                                        \
4883     uint32_t vma = vext_vma(desc);                                        \
4884     uint32_t sum = 0;                                                     \
4885     int i;                                                                \
4886                                                                           \
4887     for (i = env->vstart; i < vl; i++) {                                  \
4888         if (!vm && !vext_elem_mask(v0, i)) {                              \
4889             /* set masked-off elements to 1s */                           \
4890             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4891             continue;                                                     \
4892         }                                                                 \
4893         *((ETYPE *)vd + H(i)) = sum;                                      \
4894         if (vext_elem_mask(vs2, i)) {                                     \
4895             sum++;                                                        \
4896         }                                                                 \
4897     }                                                                     \
4898     env->vstart = 0;                                                      \
4899     /* set tail elements to 1s */                                         \
4900     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4901 }
4902 
4903 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4904 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4905 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4906 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4907 
4908 /* Vector Element Index Instruction */
4909 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4910 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4911 {                                                                         \
4912     uint32_t vm = vext_vm(desc);                                          \
4913     uint32_t vl = env->vl;                                                \
4914     uint32_t esz = sizeof(ETYPE);                                         \
4915     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4916     uint32_t vta = vext_vta(desc);                                        \
4917     uint32_t vma = vext_vma(desc);                                        \
4918     int i;                                                                \
4919                                                                           \
4920     for (i = env->vstart; i < vl; i++) {                                  \
4921         if (!vm && !vext_elem_mask(v0, i)) {                              \
4922             /* set masked-off elements to 1s */                           \
4923             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4924             continue;                                                     \
4925         }                                                                 \
4926         *((ETYPE *)vd + H(i)) = i;                                        \
4927     }                                                                     \
4928     env->vstart = 0;                                                      \
4929     /* set tail elements to 1s */                                         \
4930     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4931 }
4932 
4933 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4934 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4935 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4936 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4937 
4938 /*
4939  *** Vector Permutation Instructions
4940  */
4941 
4942 /* Vector Slide Instructions */
4943 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4944 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4945                   CPURISCVState *env, uint32_t desc)                      \
4946 {                                                                         \
4947     uint32_t vm = vext_vm(desc);                                          \
4948     uint32_t vl = env->vl;                                                \
4949     uint32_t esz = sizeof(ETYPE);                                         \
4950     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4951     uint32_t vta = vext_vta(desc);                                        \
4952     uint32_t vma = vext_vma(desc);                                        \
4953     target_ulong offset = s1, i_min, i;                                   \
4954                                                                           \
4955     i_min = MAX(env->vstart, offset);                                     \
4956     for (i = i_min; i < vl; i++) {                                        \
4957         if (!vm && !vext_elem_mask(v0, i)) {                              \
4958             /* set masked-off elements to 1s */                           \
4959             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4960             continue;                                                     \
4961         }                                                                 \
4962         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4963     }                                                                     \
4964     /* set tail elements to 1s */                                         \
4965     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4966 }
4967 
4968 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4969 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4970 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4971 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4972 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4973 
4974 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4975 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4976                   CPURISCVState *env, uint32_t desc)                      \
4977 {                                                                         \
4978     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4979     uint32_t vm = vext_vm(desc);                                          \
4980     uint32_t vl = env->vl;                                                \
4981     uint32_t esz = sizeof(ETYPE);                                         \
4982     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4983     uint32_t vta = vext_vta(desc);                                        \
4984     uint32_t vma = vext_vma(desc);                                        \
4985     target_ulong i_max, i;                                                \
4986                                                                           \
4987     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4988     for (i = env->vstart; i < i_max; ++i) {                               \
4989         if (!vm && !vext_elem_mask(v0, i)) {                              \
4990             /* set masked-off elements to 1s */                           \
4991             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4992             continue;                                                     \
4993         }                                                                 \
4994         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
4995     }                                                                     \
4996                                                                           \
4997     for (i = i_max; i < vl; ++i) {                                        \
4998         if (vm || vext_elem_mask(v0, i)) {                                \
4999             *((ETYPE *)vd + H(i)) = 0;                                    \
5000         }                                                                 \
5001     }                                                                     \
5002                                                                           \
5003     env->vstart = 0;                                                      \
5004     /* set tail elements to 1s */                                         \
5005     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5006 }
5007 
5008 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5009 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5010 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5011 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5012 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5013 
5014 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5015 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5016                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5017 {                                                                           \
5018     typedef uint##BITWIDTH##_t ETYPE;                                       \
5019     uint32_t vm = vext_vm(desc);                                            \
5020     uint32_t vl = env->vl;                                                  \
5021     uint32_t esz = sizeof(ETYPE);                                           \
5022     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5023     uint32_t vta = vext_vta(desc);                                          \
5024     uint32_t vma = vext_vma(desc);                                          \
5025     uint32_t i;                                                             \
5026                                                                             \
5027     for (i = env->vstart; i < vl; i++) {                                    \
5028         if (!vm && !vext_elem_mask(v0, i)) {                                \
5029             /* set masked-off elements to 1s */                             \
5030             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5031             continue;                                                       \
5032         }                                                                   \
5033         if (i == 0) {                                                       \
5034             *((ETYPE *)vd + H(i)) = s1;                                     \
5035         } else {                                                            \
5036             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5037         }                                                                   \
5038     }                                                                       \
5039     env->vstart = 0;                                                        \
5040     /* set tail elements to 1s */                                           \
5041     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5042 }
5043 
5044 GEN_VEXT_VSLIE1UP(8,  H1)
5045 GEN_VEXT_VSLIE1UP(16, H2)
5046 GEN_VEXT_VSLIE1UP(32, H4)
5047 GEN_VEXT_VSLIE1UP(64, H8)
5048 
5049 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5050 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5051                   CPURISCVState *env, uint32_t desc)              \
5052 {                                                                 \
5053     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5054 }
5055 
5056 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5057 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5058 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5059 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5060 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5061 
5062 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5063 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5064                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5065 {                                                                             \
5066     typedef uint##BITWIDTH##_t ETYPE;                                         \
5067     uint32_t vm = vext_vm(desc);                                              \
5068     uint32_t vl = env->vl;                                                    \
5069     uint32_t esz = sizeof(ETYPE);                                             \
5070     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5071     uint32_t vta = vext_vta(desc);                                            \
5072     uint32_t vma = vext_vma(desc);                                            \
5073     uint32_t i;                                                               \
5074                                                                               \
5075     for (i = env->vstart; i < vl; i++) {                                      \
5076         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5077             /* set masked-off elements to 1s */                               \
5078             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5079             continue;                                                         \
5080         }                                                                     \
5081         if (i == vl - 1) {                                                    \
5082             *((ETYPE *)vd + H(i)) = s1;                                       \
5083         } else {                                                              \
5084             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5085         }                                                                     \
5086     }                                                                         \
5087     env->vstart = 0;                                                          \
5088     /* set tail elements to 1s */                                             \
5089     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5090 }
5091 
5092 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5093 GEN_VEXT_VSLIDE1DOWN(16, H2)
5094 GEN_VEXT_VSLIDE1DOWN(32, H4)
5095 GEN_VEXT_VSLIDE1DOWN(64, H8)
5096 
5097 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5098 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5099                   CPURISCVState *env, uint32_t desc)              \
5100 {                                                                 \
5101     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5102 }
5103 
5104 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5105 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5106 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5107 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5108 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5109 
5110 /* Vector Floating-Point Slide Instructions */
5111 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5112 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5113                   CPURISCVState *env, uint32_t desc)          \
5114 {                                                             \
5115     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5116 }
5117 
5118 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5119 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5120 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5121 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5122 
5123 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5124 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5125                   CPURISCVState *env, uint32_t desc)          \
5126 {                                                             \
5127     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5128 }
5129 
5130 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5131 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5132 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5133 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5134 
5135 /* Vector Register Gather Instruction */
5136 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5137 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5138                   CPURISCVState *env, uint32_t desc)                      \
5139 {                                                                         \
5140     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5141     uint32_t vm = vext_vm(desc);                                          \
5142     uint32_t vl = env->vl;                                                \
5143     uint32_t esz = sizeof(TS2);                                           \
5144     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5145     uint32_t vta = vext_vta(desc);                                        \
5146     uint32_t vma = vext_vma(desc);                                        \
5147     uint64_t index;                                                       \
5148     uint32_t i;                                                           \
5149                                                                           \
5150     for (i = env->vstart; i < vl; i++) {                                  \
5151         if (!vm && !vext_elem_mask(v0, i)) {                              \
5152             /* set masked-off elements to 1s */                           \
5153             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5154             continue;                                                     \
5155         }                                                                 \
5156         index = *((TS1 *)vs1 + HS1(i));                                   \
5157         if (index >= vlmax) {                                             \
5158             *((TS2 *)vd + HS2(i)) = 0;                                    \
5159         } else {                                                          \
5160             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5161         }                                                                 \
5162     }                                                                     \
5163     env->vstart = 0;                                                      \
5164     /* set tail elements to 1s */                                         \
5165     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5166 }
5167 
5168 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5169 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5170 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5171 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5172 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5173 
5174 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5175 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5176 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5177 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5178 
5179 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5180 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5181                   CPURISCVState *env, uint32_t desc)                      \
5182 {                                                                         \
5183     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5184     uint32_t vm = vext_vm(desc);                                          \
5185     uint32_t vl = env->vl;                                                \
5186     uint32_t esz = sizeof(ETYPE);                                         \
5187     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5188     uint32_t vta = vext_vta(desc);                                        \
5189     uint32_t vma = vext_vma(desc);                                        \
5190     uint64_t index = s1;                                                  \
5191     uint32_t i;                                                           \
5192                                                                           \
5193     for (i = env->vstart; i < vl; i++) {                                  \
5194         if (!vm && !vext_elem_mask(v0, i)) {                              \
5195             /* set masked-off elements to 1s */                           \
5196             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5197             continue;                                                     \
5198         }                                                                 \
5199         if (index >= vlmax) {                                             \
5200             *((ETYPE *)vd + H(i)) = 0;                                    \
5201         } else {                                                          \
5202             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5203         }                                                                 \
5204     }                                                                     \
5205     env->vstart = 0;                                                      \
5206     /* set tail elements to 1s */                                         \
5207     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5208 }
5209 
5210 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5211 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5212 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5213 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5214 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5215 
5216 /* Vector Compress Instruction */
5217 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5218 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5219                   CPURISCVState *env, uint32_t desc)                      \
5220 {                                                                         \
5221     uint32_t vl = env->vl;                                                \
5222     uint32_t esz = sizeof(ETYPE);                                         \
5223     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5224     uint32_t vta = vext_vta(desc);                                        \
5225     uint32_t num = 0, i;                                                  \
5226                                                                           \
5227     for (i = env->vstart; i < vl; i++) {                                  \
5228         if (!vext_elem_mask(vs1, i)) {                                    \
5229             continue;                                                     \
5230         }                                                                 \
5231         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5232         num++;                                                            \
5233     }                                                                     \
5234     env->vstart = 0;                                                      \
5235     /* set tail elements to 1s */                                         \
5236     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5237 }
5238 
5239 /* Compress into vd elements of vs2 where vs1 is enabled */
5240 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5241 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5242 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5243 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5244 
5245 /* Vector Whole Register Move */
5246 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5247 {
5248     /* EEW = SEW */
5249     uint32_t maxsz = simd_maxsz(desc);
5250     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5251     uint32_t startb = env->vstart * sewb;
5252     uint32_t i = startb;
5253 
5254     memcpy((uint8_t *)vd + H1(i),
5255            (uint8_t *)vs2 + H1(i),
5256            maxsz - startb);
5257 
5258     env->vstart = 0;
5259 }
5260 
5261 /* Vector Integer Extension */
5262 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5263 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5264                   CPURISCVState *env, uint32_t desc)             \
5265 {                                                                \
5266     uint32_t vl = env->vl;                                       \
5267     uint32_t vm = vext_vm(desc);                                 \
5268     uint32_t esz = sizeof(ETYPE);                                \
5269     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5270     uint32_t vta = vext_vta(desc);                               \
5271     uint32_t vma = vext_vma(desc);                               \
5272     uint32_t i;                                                  \
5273                                                                  \
5274     for (i = env->vstart; i < vl; i++) {                         \
5275         if (!vm && !vext_elem_mask(v0, i)) {                     \
5276             /* set masked-off elements to 1s */                  \
5277             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5278             continue;                                            \
5279         }                                                        \
5280         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5281     }                                                            \
5282     env->vstart = 0;                                             \
5283     /* set tail elements to 1s */                                \
5284     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5285 }
5286 
5287 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5288 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5289 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5290 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5291 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5292 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5293 
5294 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5295 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5296 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5297 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5298 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5299 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5300