xref: /openbmc/qemu/target/riscv/vector_helper.c (revision f1eed927)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
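
/*
 * A hedged worked example of the helper above (the numbers are purely
 * illustrative): assume VLEN = 128 and a vtype value with VSEW = 2
 * (SEW = 32) and VLMUL = 1 (LMUL = 2).  vext_get_vlmax() then yields
 * VLMAX = LMUL * VLEN / SEW = 2 * 128 / 32 = 8, so a requested AVL of
 * s1 = 13 is clamped to vl = 8, while s1 = 5 is granted in full.  If
 * SEW exceeded ELEN instead, only vill would be set and vl/vtype would
 * be cleared, as in the early-return path above.
 */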
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that needs a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
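
/*
 * Illustrative example of the fixup above, assuming a big-endian host:
 * guest byte element 0 is the least-significant byte of the first
 * 64-bit chunk, which such a host stores at byte offset 7, so
 * H1(0) == 7 and
 *     *((uint8_t *)vd + H1(0)) = val;
 * writes host byte offset 7.  On a little-endian host every H macro is
 * the identity and the same expression writes offset 0.
 */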
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as following:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
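
/*
 * For instance (illustrative values): the fractional setting
 * vlmul = 0b110 (LMUL = 1/4) is sign-extended by sextract32() to -2,
 * while vlmul = 0b011 (LMUL = 8) stays 3, matching the table above.
 */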
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 /*
131  * Get the maximum number of elements that can be operated on.
132  *
133  * log2_esz: log2 of element size in bytes.
134  */
135 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
136 {
137     /*
138      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
139      * so vlen in bytes (vlenb) is encoded as maxsz.
140      */
141     uint32_t vlenb = simd_maxsz(desc);
142 
143     /* Return VLMAX */
144     int scale = vext_lmul(desc) - log2_esz;
145     return scale < 0 ? vlenb >> -scale : vlenb << scale;
146 }
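
/*
 * Hedged example for the computation above, assuming VLEN = 128
 * (vlenb = 16): for 32-bit elements (log2_esz = 2) and LMUL = 2
 * (lmul = 1), scale = 1 - 2 = -1 and VLMAX = 16 >> 1 = 8, matching
 * LMUL * VLEN / SEW = 2 * 128 / 32.  With LMUL = 1/2 (lmul = -1) the
 * same element size gives scale = -3 and VLMAX = 16 >> 3 = 2.
 */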
147 
148 /*
149  * Get the total number of elements, including prestart, body and tail elements.
150  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
151  * are held in the same vector register.
152  */
153 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
154                                             uint32_t esz)
155 {
156     uint32_t vlenb = simd_maxsz(desc);
157     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
158     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
159                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
160     return (vlenb << emul) / esz;
161 }
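
/*
 * Illustrative example, assuming VLEN = 128 (vlenb = 16): with
 * SEW = 32 (sew = 4 bytes), LMUL = 1/2 and a same-width operand
 * (esz = 4), the EMUL term ctzl(4) - ctzl(4) + (-1) = -1 is clamped
 * to 0, so the total is (16 << 0) / 4 = 4 elements even though VLMAX
 * is only 2; the two elements past VLMAX are tail, as described in the
 * comment above.
 */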
162 
163 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
164 {
165     return (addr & env->cur_pmmask) | env->cur_pmbase;
166 }
167 
168 /*
169  * This function checks the watchpoint before the real load operation.
170  *
171  * In softmmu mode, the TLB API probe_access is enough for the watchpoint
172  * check. In user mode, there is no watchpoint support for now.
173  *
174  * It will trigger an exception if there is no mapping in the TLB and the
175  * page table walk can't fill the TLB entry. Then the guest software can
176  * return here after processing the exception, or never return.
177  */
178 static void probe_pages(CPURISCVState *env, target_ulong addr,
179                         target_ulong len, uintptr_t ra,
180                         MMUAccessType access_type)
181 {
182     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
183     target_ulong curlen = MIN(pagelen, len);
184 
185     probe_access(env, adjust_addr(env, addr), curlen, access_type,
186                  cpu_mmu_index(env, false), ra);
187     if (len > curlen) {
188         addr += curlen;
189         curlen = len - curlen;
190         probe_access(env, adjust_addr(env, addr), curlen, access_type,
191                      cpu_mmu_index(env, false), ra);
192     }
193 }
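
/*
 * Sketch of the split above (illustrative numbers, 4 KiB pages,
 * pointer masking ignored): probing 16 bytes starting at 0x1ff8 first
 * checks the 8 bytes left on the current page (pagelen = 8), then
 * re-probes the remaining 8 bytes at 0x2000 on the following page, so
 * both pages are validated before any element is actually accessed.
 */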
194 
195 /* set agnostic elements to 1s */
196 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
197                               uint32_t tot)
198 {
199     if (is_agnostic == 0) {
200         /* policy undisturbed */
201         return;
202     }
203     if (tot - cnt == 0) {
204         return;
205     }
206     memset(base + cnt, -1, tot - cnt);
207 }
208 
209 static inline void vext_set_elem_mask(void *v0, int index,
210                                       uint8_t value)
211 {
212     int idx = index / 64;
213     int pos = index % 64;
214     uint64_t old = ((uint64_t *)v0)[idx];
215     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
216 }
217 
218 /*
219  * Earlier designs (pre-0.9) had a varying number of bits
220  * per mask value (MLEN). In the 0.9 design, MLEN=1.
221  * (Section 4.5)
222  */
223 static inline int vext_elem_mask(void *v0, int index)
224 {
225     int idx = index / 64;
226     int pos = index  % 64;
227     return (((uint64_t *)v0)[idx] >> pos) & 1;
228 }
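
/*
 * Example of the MLEN=1 layout (illustrative): the mask bit for
 * element 70 is bit 6 of the second uint64_t word of v0, i.e.
 *     vext_elem_mask(v0, 70) == ((((uint64_t *)v0)[1] >> 6) & 1)
 */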
229 
230 /* element operations for load and store */
231 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
232                                uint32_t idx, void *vd, uintptr_t retaddr);
233 
234 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
235 static void NAME(CPURISCVState *env, abi_ptr addr,         \
236                  uint32_t idx, void *vd, uintptr_t retaddr)\
237 {                                                          \
238     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
239     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
240 }                                                          \
241 
242 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
243 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
244 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
245 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
246 
247 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
248 static void NAME(CPURISCVState *env, abi_ptr addr,         \
249                  uint32_t idx, void *vd, uintptr_t retaddr)\
250 {                                                          \
251     ETYPE data = *((ETYPE *)vd + H(idx));                  \
252     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
253 }
254 
255 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
256 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
257 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
258 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
259 
260 /*
261  *** stride: access vector element from strided memory
262  */
263 static void
264 vext_ldst_stride(void *vd, void *v0, target_ulong base,
265                  target_ulong stride, CPURISCVState *env,
266                  uint32_t desc, uint32_t vm,
267                  vext_ldst_elem_fn *ldst_elem,
268                  uint32_t log2_esz, uintptr_t ra)
269 {
270     uint32_t i, k;
271     uint32_t nf = vext_nf(desc);
272     uint32_t max_elems = vext_max_elems(desc, log2_esz);
273 
274     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
275         if (!vm && !vext_elem_mask(v0, i)) {
276             continue;
277         }
278 
279         k = 0;
280         while (k < nf) {
281             target_ulong addr = base + stride * i + (k << log2_esz);
282             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
283             k++;
284         }
285     }
286     env->vstart = 0;
287 }
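
/*
 * Illustrative layout for the loop above: with nf = 2, 32-bit elements
 * (log2_esz = 2) and a byte stride of 64, element i of segment field k
 * is accessed at base + 64 * i + 4 * k and mapped to vd element
 * i + k * max_elems, i.e. field 0 fills the first register group and
 * field 1 the next one.
 */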
288 
289 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
290 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
291                   target_ulong stride, CPURISCVState *env,              \
292                   uint32_t desc)                                        \
293 {                                                                       \
294     uint32_t vm = vext_vm(desc);                                        \
295     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
296                      ctzl(sizeof(ETYPE)), GETPC());                     \
297 }
298 
299 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
300 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
301 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
302 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
303 
304 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
305 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
306                   target_ulong stride, CPURISCVState *env,              \
307                   uint32_t desc)                                        \
308 {                                                                       \
309     uint32_t vm = vext_vm(desc);                                        \
310     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
311                      ctzl(sizeof(ETYPE)), GETPC());                     \
312 }
313 
314 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
315 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
316 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
317 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
318 
319 /*
320  *** unit-stride: access elements stored contiguously in memory
321  */
322 
323 /* unmasked unit-stride load and store operation */
324 static void
325 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
326              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
327              uintptr_t ra)
328 {
329     uint32_t i, k;
330     uint32_t nf = vext_nf(desc);
331     uint32_t max_elems = vext_max_elems(desc, log2_esz);
332 
333     /* load or store elements in guest memory */
334     for (i = env->vstart; i < evl; i++, env->vstart++) {
335         k = 0;
336         while (k < nf) {
337             target_ulong addr = base + ((i * nf + k) << log2_esz);
338             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
339             k++;
340         }
341     }
342     env->vstart = 0;
343 }
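
/*
 * Illustrative address pattern for the unit-stride case above: with
 * nf = 3 and 16-bit elements (log2_esz = 1), element i of field k sits
 * at base + (i * 3 + k) * 2, so the fields of one segment are
 * contiguous in memory while each field lands in its own register
 * group of vd.
 */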
344 
345 /*
346  * A masked unit-stride load or store operation is handled as a special case
347  * of the strided forms, with stride = NF * sizeof(ETYPE).
348  */
349 
350 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
351 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
352                          CPURISCVState *env, uint32_t desc)             \
353 {                                                                       \
354     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
355     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
356                      ctzl(sizeof(ETYPE)), GETPC());                     \
357 }                                                                       \
358                                                                         \
359 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
360                   CPURISCVState *env, uint32_t desc)                    \
361 {                                                                       \
362     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
363                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
364 }
365 
366 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
367 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
368 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
369 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
370 
371 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
372 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
373                          CPURISCVState *env, uint32_t desc)              \
374 {                                                                        \
375     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
376     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
377                      ctzl(sizeof(ETYPE)), GETPC());                      \
378 }                                                                        \
379                                                                          \
380 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
381                   CPURISCVState *env, uint32_t desc)                     \
382 {                                                                        \
383     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
384                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
385 }
386 
387 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
388 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
389 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
390 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
391 
392 /*
393  *** unit stride mask load and store, EEW = 1
394  */
395 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
396                     CPURISCVState *env, uint32_t desc)
397 {
398     /* evl = ceil(vl/8) */
399     uint8_t evl = (env->vl + 7) >> 3;
400     vext_ldst_us(vd, base, env, desc, lde_b,
401                  0, evl, GETPC());
402 }
403 
404 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
405                     CPURISCVState *env, uint32_t desc)
406 {
407     /* evl = ceil(vl/8) */
408     uint8_t evl = (env->vl + 7) >> 3;
409     vext_ldst_us(vd, base, env, desc, ste_b,
410                  0, evl, GETPC());
411 }
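
/*
 * Hedged example for the mask load/store pair above: with vl = 17 the
 * effective length is evl = (17 + 7) >> 3 = 3, so exactly three bytes
 * of mask data are transferred.
 */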
412 
413 /*
414  *** index: access vector element from indexed memory
415  */
416 typedef target_ulong vext_get_index_addr(target_ulong base,
417         uint32_t idx, void *vs2);
418 
419 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
420 static target_ulong NAME(target_ulong base,            \
421                          uint32_t idx, void *vs2)      \
422 {                                                      \
423     return (base + *((ETYPE *)vs2 + H(idx)));          \
424 }
425 
426 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
427 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
428 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
429 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
430 
431 static inline void
432 vext_ldst_index(void *vd, void *v0, target_ulong base,
433                 void *vs2, CPURISCVState *env, uint32_t desc,
434                 vext_get_index_addr get_index_addr,
435                 vext_ldst_elem_fn *ldst_elem,
436                 uint32_t log2_esz, uintptr_t ra)
437 {
438     uint32_t i, k;
439     uint32_t nf = vext_nf(desc);
440     uint32_t vm = vext_vm(desc);
441     uint32_t max_elems = vext_max_elems(desc, log2_esz);
442 
443     /* load or store elements in guest memory */
444     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
445         if (!vm && !vext_elem_mask(v0, i)) {
446             continue;
447         }
448 
449         k = 0;
450         while (k < nf) {
451             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
452             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
453             k++;
454         }
455     }
456     env->vstart = 0;
457 }
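
/*
 * Illustrative example for the indexed form: with 16-bit indices
 * (get_index_addr = idx_h), 32-bit data elements and nf = 1, element i
 * is accessed at base + the i-th uint16_t of vs2, i.e. the index
 * vector supplies unsigned byte offsets that are simply added to the
 * scalar base.
 */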
458 
459 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
460 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
461                   void *vs2, CPURISCVState *env, uint32_t desc)            \
462 {                                                                          \
463     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
464                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
465 }
466 
467 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
468 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
469 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
470 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
471 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
472 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
473 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
474 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
475 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
476 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
477 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
478 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
479 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
480 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
481 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
482 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
483 
484 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
485 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
486                   void *vs2, CPURISCVState *env, uint32_t desc)  \
487 {                                                                \
488     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
489                     STORE_FN, ctzl(sizeof(ETYPE)),               \
490                     GETPC());                                    \
491 }
492 
493 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
494 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
495 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
496 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
497 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
498 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
499 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
500 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
501 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
502 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
503 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
504 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
505 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
506 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
507 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
508 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
509 
510 /*
511  *** unit-stride fault-only-first load instructions
512  */
513 static inline void
514 vext_ldff(void *vd, void *v0, target_ulong base,
515           CPURISCVState *env, uint32_t desc,
516           vext_ldst_elem_fn *ldst_elem,
517           uint32_t log2_esz, uintptr_t ra)
518 {
519     void *host;
520     uint32_t i, k, vl = 0;
521     uint32_t nf = vext_nf(desc);
522     uint32_t vm = vext_vm(desc);
523     uint32_t max_elems = vext_max_elems(desc, log2_esz);
524     target_ulong addr, offset, remain;
525 
526     /* probe every access */
527     for (i = env->vstart; i < env->vl; i++) {
528         if (!vm && !vext_elem_mask(v0, i)) {
529             continue;
530         }
531         addr = adjust_addr(env, base + i * (nf << log2_esz));
532         if (i == 0) {
533             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
534         } else {
535             /* if it triggers an exception, no need to check watchpoint */
536             remain = nf << log2_esz;
537             while (remain > 0) {
538                 offset = -(addr | TARGET_PAGE_MASK);
539                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
540                                          cpu_mmu_index(env, false));
541                 if (host) {
542 #ifdef CONFIG_USER_ONLY
543                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
544                         vl = i;
545                         goto ProbeSuccess;
546                     }
547 #else
548                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
549 #endif
550                 } else {
551                     vl = i;
552                     goto ProbeSuccess;
553                 }
554                 if (remain <= offset) {
555                     break;
556                 }
557                 remain -= offset;
558                 addr = adjust_addr(env, addr + offset);
559             }
560         }
561     }
562 ProbeSuccess:
563     /* load bytes from guest memory */
564     if (vl != 0) {
565         env->vl = vl;
566     }
567     for (i = env->vstart; i < env->vl; i++) {
568         k = 0;
569         if (!vm && !vext_elem_mask(v0, i)) {
570             continue;
571         }
572         while (k < nf) {
573             target_ulong addr = base + ((i * nf + k) << log2_esz);
574             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
575             k++;
576         }
577     }
578     env->vstart = 0;
579 }
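
/*
 * Hedged sketch of the fault-only-first behaviour implemented above
 * (illustrative numbers): if vl = 8 and the probe loop finds element 5
 * to be the first one whose page cannot be mapped, vl is trimmed to 5,
 * the unmasked elements before it are loaded normally and no exception
 * is raised; a fault on element 0 would instead trap as usual.
 */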
580 
581 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
582 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
583                   CPURISCVState *env, uint32_t desc)      \
584 {                                                         \
585     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
586               ctzl(sizeof(ETYPE)), GETPC());              \
587 }
588 
589 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
590 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
591 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
592 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
593 
594 #define DO_SWAP(N, M) (M)
595 #define DO_AND(N, M)  (N & M)
596 #define DO_XOR(N, M)  (N ^ M)
597 #define DO_OR(N, M)   (N | M)
598 #define DO_ADD(N, M)  (N + M)
599 
600 /* Signed min/max */
601 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
602 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
603 
604 /* Unsigned min/max */
605 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
606 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
607 
608 /*
609  *** load and store whole register instructions
610  */
611 static void
612 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
613                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
614 {
615     uint32_t i, k, off, pos;
616     uint32_t nf = vext_nf(desc);
617     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
618     uint32_t max_elems = vlenb >> log2_esz;
619 
620     k = env->vstart / max_elems;
621     off = env->vstart % max_elems;
622 
623     if (off) {
624         /* load/store rest of elements of the segment pointed to by vstart */
625         for (pos = off; pos < max_elems; pos++, env->vstart++) {
626             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
627             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
628         }
629         k++;
630     }
631 
632     /* load/store elements for rest of segments */
633     for (; k < nf; k++) {
634         for (i = 0; i < max_elems; i++, env->vstart++) {
635             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
636             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
637         }
638     }
639 
640     env->vstart = 0;
641 }
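
/*
 * Illustrative example, assuming VLEN = 128 (vlenb = 16): a vl4re32.v
 * style access has nf = 4 and log2_esz = 2, so max_elems = 4 and the
 * helper moves 4 * 16 = 64 bytes starting at base, resuming cleanly
 * from any element index recorded in vstart.
 */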
642 
643 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
644 void HELPER(NAME)(void *vd, target_ulong base,       \
645                   CPURISCVState *env, uint32_t desc) \
646 {                                                    \
647     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
648                     ctzl(sizeof(ETYPE)), GETPC());   \
649 }
650 
651 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
652 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
653 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
654 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
655 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
656 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
657 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
658 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
659 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
660 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
661 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
662 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
663 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
664 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
665 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
666 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
667 
668 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
669 void HELPER(NAME)(void *vd, target_ulong base,       \
670                   CPURISCVState *env, uint32_t desc) \
671 {                                                    \
672     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
673                     ctzl(sizeof(ETYPE)), GETPC());   \
674 }
675 
676 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
677 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
678 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
679 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
680 
681 /*
682  *** Vector Integer Arithmetic Instructions
683  */
684 
685 /* expand macro args before macro */
686 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
687 
688 /* (TD, T1, T2, TX1, TX2) */
689 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
690 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
691 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
692 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
693 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
694 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
695 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
696 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
697 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
698 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
699 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
700 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
701 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
702 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
703 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
704 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
705 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
706 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
707 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
708 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
709 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
710 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
711 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
712 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
713 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
714 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
715 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
716 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
717 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
718 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
719 
720 /* operation of two vector elements */
721 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
722 
723 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
724 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
725 {                                                               \
726     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
727     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
728     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
729 }
730 #define DO_SUB(N, M) (N - M)
731 #define DO_RSUB(N, M) (M - N)
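
/*
 * For reference, a hedged expansion of one instance below:
 * RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD) becomes
 *
 *     static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *     {
 *         int8_t s1 = *((int8_t *)vs1 + H1(i));
 *         int8_t s2 = *((int8_t *)vs2 + H1(i));
 *         *((int8_t *)vd + H1(i)) = (s2 + s1);
 *     }
 */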
732 
733 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
734 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
735 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
736 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
737 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
738 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
739 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
740 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
741 
742 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
743                        CPURISCVState *env, uint32_t desc,
744                        opivv2_fn *fn, uint32_t esz)
745 {
746     uint32_t vm = vext_vm(desc);
747     uint32_t vl = env->vl;
748     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
749     uint32_t vta = vext_vta(desc);
750     uint32_t i;
751 
752     for (i = env->vstart; i < vl; i++) {
753         if (!vm && !vext_elem_mask(v0, i)) {
754             continue;
755         }
756         fn(vd, vs1, vs2, i);
757     }
758     env->vstart = 0;
759     /* set tail elements to 1s */
760     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
761 }
762 
763 /* generate the helpers for OPIVV */
764 #define GEN_VEXT_VV(NAME, ESZ)                            \
765 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
766                   void *vs2, CPURISCVState *env,          \
767                   uint32_t desc)                          \
768 {                                                         \
769     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
770                do_##NAME, ESZ);                           \
771 }
772 
773 GEN_VEXT_VV(vadd_vv_b, 1)
774 GEN_VEXT_VV(vadd_vv_h, 2)
775 GEN_VEXT_VV(vadd_vv_w, 4)
776 GEN_VEXT_VV(vadd_vv_d, 8)
777 GEN_VEXT_VV(vsub_vv_b, 1)
778 GEN_VEXT_VV(vsub_vv_h, 2)
779 GEN_VEXT_VV(vsub_vv_w, 4)
780 GEN_VEXT_VV(vsub_vv_d, 8)
781 
782 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
783 
784 /*
785  * (T1)s1 gives the real operand type.
786  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
787  */
788 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
789 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
790 {                                                                   \
791     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
792     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
793 }
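
/*
 * Hedged example of the cast chain above: for vwadd_vx_b (instantiated
 * further below) the types come from WOP_SSS_B, so (TX1)(T1)s1 is
 * (int16_t)(int8_t)s1 -- the scalar is first truncated to the 8-bit
 * operand type and then sign-extended to the 16-bit widened type
 * before DO_ADD is applied.
 */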
794 
795 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
796 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
797 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
798 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
799 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
800 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
801 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
802 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
803 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
804 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
805 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
806 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
807 
808 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
809                        CPURISCVState *env, uint32_t desc,
810                        opivx2_fn fn)
811 {
812     uint32_t vm = vext_vm(desc);
813     uint32_t vl = env->vl;
814     uint32_t i;
815 
816     for (i = env->vstart; i < vl; i++) {
817         if (!vm && !vext_elem_mask(v0, i)) {
818             continue;
819         }
820         fn(vd, s1, vs2, i);
821     }
822     env->vstart = 0;
823 }
824 
825 /* generate the helpers for OPIVX */
826 #define GEN_VEXT_VX(NAME)                                 \
827 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
828                   void *vs2, CPURISCVState *env,          \
829                   uint32_t desc)                          \
830 {                                                         \
831     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
832                do_##NAME);                                \
833 }
834 
835 GEN_VEXT_VX(vadd_vx_b)
836 GEN_VEXT_VX(vadd_vx_h)
837 GEN_VEXT_VX(vadd_vx_w)
838 GEN_VEXT_VX(vadd_vx_d)
839 GEN_VEXT_VX(vsub_vx_b)
840 GEN_VEXT_VX(vsub_vx_h)
841 GEN_VEXT_VX(vsub_vx_w)
842 GEN_VEXT_VX(vsub_vx_d)
843 GEN_VEXT_VX(vrsub_vx_b)
844 GEN_VEXT_VX(vrsub_vx_h)
845 GEN_VEXT_VX(vrsub_vx_w)
846 GEN_VEXT_VX(vrsub_vx_d)
847 
848 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
849 {
850     intptr_t oprsz = simd_oprsz(desc);
851     intptr_t i;
852 
853     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
854         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
855     }
856 }
857 
858 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
859 {
860     intptr_t oprsz = simd_oprsz(desc);
861     intptr_t i;
862 
863     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
864         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
865     }
866 }
867 
868 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
869 {
870     intptr_t oprsz = simd_oprsz(desc);
871     intptr_t i;
872 
873     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
874         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
875     }
876 }
877 
878 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
879 {
880     intptr_t oprsz = simd_oprsz(desc);
881     intptr_t i;
882 
883     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
884         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
885     }
886 }
887 
888 /* Vector Widening Integer Add/Subtract */
889 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
890 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
891 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
892 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
893 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
894 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
895 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
896 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
897 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
898 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
899 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
900 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
901 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
902 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
903 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
904 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
905 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
906 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
907 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
908 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
909 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
910 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
911 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
912 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
913 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
914 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
915 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
916 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
917 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
918 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
919 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
920 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
921 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
922 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
923 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
924 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
925 GEN_VEXT_VV(vwaddu_vv_b, 2)
926 GEN_VEXT_VV(vwaddu_vv_h, 4)
927 GEN_VEXT_VV(vwaddu_vv_w, 8)
928 GEN_VEXT_VV(vwsubu_vv_b, 2)
929 GEN_VEXT_VV(vwsubu_vv_h, 4)
930 GEN_VEXT_VV(vwsubu_vv_w, 8)
931 GEN_VEXT_VV(vwadd_vv_b, 2)
932 GEN_VEXT_VV(vwadd_vv_h, 4)
933 GEN_VEXT_VV(vwadd_vv_w, 8)
934 GEN_VEXT_VV(vwsub_vv_b, 2)
935 GEN_VEXT_VV(vwsub_vv_h, 4)
936 GEN_VEXT_VV(vwsub_vv_w, 8)
937 GEN_VEXT_VV(vwaddu_wv_b, 2)
938 GEN_VEXT_VV(vwaddu_wv_h, 4)
939 GEN_VEXT_VV(vwaddu_wv_w, 8)
940 GEN_VEXT_VV(vwsubu_wv_b, 2)
941 GEN_VEXT_VV(vwsubu_wv_h, 4)
942 GEN_VEXT_VV(vwsubu_wv_w, 8)
943 GEN_VEXT_VV(vwadd_wv_b, 2)
944 GEN_VEXT_VV(vwadd_wv_h, 4)
945 GEN_VEXT_VV(vwadd_wv_w, 8)
946 GEN_VEXT_VV(vwsub_wv_b, 2)
947 GEN_VEXT_VV(vwsub_wv_h, 4)
948 GEN_VEXT_VV(vwsub_wv_w, 8)
949 
950 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
951 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
952 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
953 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
954 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
955 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
956 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
957 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
958 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
959 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
960 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
961 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
962 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
963 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
964 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
965 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
966 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
967 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
968 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
969 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
970 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
971 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
972 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
973 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
974 GEN_VEXT_VX(vwaddu_vx_b)
975 GEN_VEXT_VX(vwaddu_vx_h)
976 GEN_VEXT_VX(vwaddu_vx_w)
977 GEN_VEXT_VX(vwsubu_vx_b)
978 GEN_VEXT_VX(vwsubu_vx_h)
979 GEN_VEXT_VX(vwsubu_vx_w)
980 GEN_VEXT_VX(vwadd_vx_b)
981 GEN_VEXT_VX(vwadd_vx_h)
982 GEN_VEXT_VX(vwadd_vx_w)
983 GEN_VEXT_VX(vwsub_vx_b)
984 GEN_VEXT_VX(vwsub_vx_h)
985 GEN_VEXT_VX(vwsub_vx_w)
986 GEN_VEXT_VX(vwaddu_wx_b)
987 GEN_VEXT_VX(vwaddu_wx_h)
988 GEN_VEXT_VX(vwaddu_wx_w)
989 GEN_VEXT_VX(vwsubu_wx_b)
990 GEN_VEXT_VX(vwsubu_wx_h)
991 GEN_VEXT_VX(vwsubu_wx_w)
992 GEN_VEXT_VX(vwadd_wx_b)
993 GEN_VEXT_VX(vwadd_wx_h)
994 GEN_VEXT_VX(vwadd_wx_w)
995 GEN_VEXT_VX(vwsub_wx_b)
996 GEN_VEXT_VX(vwsub_wx_h)
997 GEN_VEXT_VX(vwsub_wx_w)
998 
999 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1000 #define DO_VADC(N, M, C) (N + M + C)
1001 #define DO_VSBC(N, M, C) (N - M - C)
1002 
1003 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1004 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1005                   CPURISCVState *env, uint32_t desc)          \
1006 {                                                             \
1007     uint32_t vl = env->vl;                                    \
1008     uint32_t i;                                               \
1009                                                               \
1010     for (i = env->vstart; i < vl; i++) {                      \
1011         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1012         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1013         ETYPE carry = vext_elem_mask(v0, i);                  \
1014                                                               \
1015         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1016     }                                                         \
1017     env->vstart = 0;                                          \
1018 }
1019 
1020 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1021 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1022 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1023 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1024 
1025 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1026 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1027 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1028 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1029 
1030 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1031 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1032                   CPURISCVState *env, uint32_t desc)                     \
1033 {                                                                        \
1034     uint32_t vl = env->vl;                                               \
1035     uint32_t i;                                                          \
1036                                                                          \
1037     for (i = env->vstart; i < vl; i++) {                                 \
1038         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1039         ETYPE carry = vext_elem_mask(v0, i);                             \
1040                                                                          \
1041         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1042     }                                                                    \
1043     env->vstart = 0;                                          \
1044 }
1045 
1046 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1047 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1048 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1049 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1050 
1051 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1052 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1053 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1054 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1055 
1056 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1057                           (__typeof(N))(N + M) < N)
1058 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
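
/*
 * Illustrative check of DO_MADC above with uint8_t operands: N = 200,
 * M = 100, C = 0 gives (uint8_t)(N + M) == 44 < 200, so the carry-out
 * is 1; with N = 200, M = 55, C = 1 the carry branch tests
 * (uint8_t)(200 + 55 + 1) == 0 <= 200, which also reports a carry.
 */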
1059 
1060 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1061 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1062                   CPURISCVState *env, uint32_t desc)          \
1063 {                                                             \
1064     uint32_t vl = env->vl;                                    \
1065     uint32_t vm = vext_vm(desc);                              \
1066     uint32_t i;                                               \
1067                                                               \
1068     for (i = env->vstart; i < vl; i++) {                      \
1069         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1070         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1071         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1072         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1073     }                                                         \
1074     env->vstart = 0;                                          \
1075 }
1076 
1077 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1078 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1079 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1080 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1081 
1082 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1083 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1084 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1085 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1086 
1087 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1088 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1089                   void *vs2, CPURISCVState *env, uint32_t desc) \
1090 {                                                               \
1091     uint32_t vl = env->vl;                                      \
1092     uint32_t vm = vext_vm(desc);                                \
1093     uint32_t i;                                                 \
1094                                                                 \
1095     for (i = env->vstart; i < vl; i++) {                        \
1096         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1097         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1098         vext_set_elem_mask(vd, i,                               \
1099                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1100     }                                                           \
1101     env->vstart = 0;                                            \
1102 }
1103 
1104 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1105 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1106 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1107 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1108 
1109 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1110 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1111 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1112 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1113 
1114 /* Vector Bitwise Logical Instructions */
1115 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1116 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1117 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1118 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1119 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1120 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1121 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1122 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1123 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1124 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1125 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1126 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1127 GEN_VEXT_VV(vand_vv_b, 1)
1128 GEN_VEXT_VV(vand_vv_h, 2)
1129 GEN_VEXT_VV(vand_vv_w, 4)
1130 GEN_VEXT_VV(vand_vv_d, 8)
1131 GEN_VEXT_VV(vor_vv_b, 1)
1132 GEN_VEXT_VV(vor_vv_h, 2)
1133 GEN_VEXT_VV(vor_vv_w, 4)
1134 GEN_VEXT_VV(vor_vv_d, 8)
1135 GEN_VEXT_VV(vxor_vv_b, 1)
1136 GEN_VEXT_VV(vxor_vv_h, 2)
1137 GEN_VEXT_VV(vxor_vv_w, 4)
1138 GEN_VEXT_VV(vxor_vv_d, 8)
1139 
1140 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1141 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1142 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1143 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1144 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1145 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1146 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1147 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1148 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1149 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1150 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1151 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1152 GEN_VEXT_VX(vand_vx_b)
1153 GEN_VEXT_VX(vand_vx_h)
1154 GEN_VEXT_VX(vand_vx_w)
1155 GEN_VEXT_VX(vand_vx_d)
1156 GEN_VEXT_VX(vor_vx_b)
1157 GEN_VEXT_VX(vor_vx_h)
1158 GEN_VEXT_VX(vor_vx_w)
1159 GEN_VEXT_VX(vor_vx_d)
1160 GEN_VEXT_VX(vxor_vx_b)
1161 GEN_VEXT_VX(vxor_vx_h)
1162 GEN_VEXT_VX(vxor_vx_w)
1163 GEN_VEXT_VX(vxor_vx_d)
1164 
1165 /* Vector Single-Width Bit Shift Instructions */
1166 #define DO_SLL(N, M)  (N << (M))
1167 #define DO_SRL(N, M)  (N >> (M))
1168 
1169 /* generate the helpers for shift instructions with two vector operands */
1170 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1171 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1172                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1173 {                                                                         \
1174     uint32_t vm = vext_vm(desc);                                          \
1175     uint32_t vl = env->vl;                                                \
1176     uint32_t i;                                                           \
1177                                                                           \
1178     for (i = env->vstart; i < vl; i++) {                                  \
1179         if (!vm && !vext_elem_mask(v0, i)) {                              \
1180             continue;                                                     \
1181         }                                                                 \
1182         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1183         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1184         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1185     }                                                                     \
1186     env->vstart = 0;                                                      \
1187 }
1188 
1189 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1190 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1191 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1192 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1193 
1194 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1195 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1196 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1197 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1198 
1199 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1200 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1201 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1202 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1203 
1204 /* generate the helpers for shift instructions with one vector and one scalar */
1205 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1206 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1207         void *vs2, CPURISCVState *env, uint32_t desc)       \
1208 {                                                           \
1209     uint32_t vm = vext_vm(desc);                            \
1210     uint32_t vl = env->vl;                                  \
1211     uint32_t i;                                             \
1212                                                             \
1213     for (i = env->vstart; i < vl; i++) {                    \
1214         if (!vm && !vext_elem_mask(v0, i)) {                \
1215             continue;                                       \
1216         }                                                   \
1217         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1218         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1219     }                                                       \
1220     env->vstart = 0;                                        \
1221 }
1222 
1223 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1224 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1225 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1226 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1227 
1228 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1229 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1230 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1231 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1232 
1233 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1234 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1235 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1236 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1237 
1238 /* Vector Narrowing Integer Right Shift Instructions */
1239 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1240 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1241 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1242 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1243 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1244 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1245 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1246 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1247 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1248 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1249 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1250 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
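
/*
 * Hedged note on the MASK argument used above: the single-width shifts
 * use only log2(SEW) bits of the shift amount (0x7 for SEW = 8 up to
 * 0x3f for SEW = 64), while the narrowing forms shift a 2*SEW-wide
 * source and therefore mask with 2*SEW - 1, e.g. vnsrl_wv_b masks the
 * shift amount with 0xf when producing 8-bit results from 16-bit
 * sources.
 */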
1251 
1252 /* Vector Integer Comparison Instructions */
1253 #define DO_MSEQ(N, M) (N == M)
1254 #define DO_MSNE(N, M) (N != M)
1255 #define DO_MSLT(N, M) (N < M)
1256 #define DO_MSLE(N, M) (N <= M)
1257 #define DO_MSGT(N, M) (N > M)
1258 
1259 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1260 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1261                   CPURISCVState *env, uint32_t desc)          \
1262 {                                                             \
1263     uint32_t vm = vext_vm(desc);                              \
1264     uint32_t vl = env->vl;                                    \
1265     uint32_t i;                                               \
1266                                                               \
1267     for (i = env->vstart; i < vl; i++) {                      \
1268         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1269         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1270         if (!vm && !vext_elem_mask(v0, i)) {                  \
1271             continue;                                         \
1272         }                                                     \
1273         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1274     }                                                         \
1275     env->vstart = 0;                                          \
1276 }
1277 
1278 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1279 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1280 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1281 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1282 
1283 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1284 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1285 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1286 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1287 
1288 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1289 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1290 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1291 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1292 
1293 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1294 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1295 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1296 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1297 
1298 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1299 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1300 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1301 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1302 
1303 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1304 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1305 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1306 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1307 
1308 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1309 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1310                   CPURISCVState *env, uint32_t desc)                \
1311 {                                                                   \
1312     uint32_t vm = vext_vm(desc);                                    \
1313     uint32_t vl = env->vl;                                          \
1314     uint32_t i;                                                     \
1315                                                                     \
1316     for (i = env->vstart; i < vl; i++) {                            \
1317         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1318         if (!vm && !vext_elem_mask(v0, i)) {                        \
1319             continue;                                               \
1320         }                                                           \
1321         vext_set_elem_mask(vd, i,                                   \
1322                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1323     }                                                               \
1324     env->vstart = 0;                                                \
1325 }
1326 
1327 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1328 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1329 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1330 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1331 
1332 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1333 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1334 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1335 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1336 
1337 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1338 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1339 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1340 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1341 
1342 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1343 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1344 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1345 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1346 
1347 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1348 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1349 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1350 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1351 
1352 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1353 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1354 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1355 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1356 
1357 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1358 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1359 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1360 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1361 
1362 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1363 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1364 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1365 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1366 
1367 /* Vector Integer Min/Max Instructions */
1368 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1369 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1370 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1371 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1372 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1373 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1374 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1375 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1376 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1377 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1378 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1379 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1380 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1381 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1382 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1383 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1384 GEN_VEXT_VV(vminu_vv_b, 1)
1385 GEN_VEXT_VV(vminu_vv_h, 2)
1386 GEN_VEXT_VV(vminu_vv_w, 4)
1387 GEN_VEXT_VV(vminu_vv_d, 8)
1388 GEN_VEXT_VV(vmin_vv_b, 1)
1389 GEN_VEXT_VV(vmin_vv_h, 2)
1390 GEN_VEXT_VV(vmin_vv_w, 4)
1391 GEN_VEXT_VV(vmin_vv_d, 8)
1392 GEN_VEXT_VV(vmaxu_vv_b, 1)
1393 GEN_VEXT_VV(vmaxu_vv_h, 2)
1394 GEN_VEXT_VV(vmaxu_vv_w, 4)
1395 GEN_VEXT_VV(vmaxu_vv_d, 8)
1396 GEN_VEXT_VV(vmax_vv_b, 1)
1397 GEN_VEXT_VV(vmax_vv_h, 2)
1398 GEN_VEXT_VV(vmax_vv_w, 4)
1399 GEN_VEXT_VV(vmax_vv_d, 8)
1400 
1401 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1402 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1403 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1404 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1405 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1406 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1407 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1408 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1409 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1410 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1411 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1412 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1413 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1414 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1415 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1416 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1417 GEN_VEXT_VX(vminu_vx_b)
1418 GEN_VEXT_VX(vminu_vx_h)
1419 GEN_VEXT_VX(vminu_vx_w)
1420 GEN_VEXT_VX(vminu_vx_d)
1421 GEN_VEXT_VX(vmin_vx_b)
1422 GEN_VEXT_VX(vmin_vx_h)
1423 GEN_VEXT_VX(vmin_vx_w)
1424 GEN_VEXT_VX(vmin_vx_d)
1425 GEN_VEXT_VX(vmaxu_vx_b)
1426 GEN_VEXT_VX(vmaxu_vx_h)
1427 GEN_VEXT_VX(vmaxu_vx_w)
1428 GEN_VEXT_VX(vmaxu_vx_d)
1429 GEN_VEXT_VX(vmax_vx_b)
1430 GEN_VEXT_VX(vmax_vx_h)
1431 GEN_VEXT_VX(vmax_vx_w)
1432 GEN_VEXT_VX(vmax_vx_d)
1433 
1434 /* Vector Single-Width Integer Multiply Instructions */
1435 #define DO_MUL(N, M) (N * M)
1436 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1437 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1438 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1439 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1440 GEN_VEXT_VV(vmul_vv_b, 1)
1441 GEN_VEXT_VV(vmul_vv_h, 2)
1442 GEN_VEXT_VV(vmul_vv_w, 4)
1443 GEN_VEXT_VV(vmul_vv_d, 8)
1444 
1445 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1446 {
1447     return (int16_t)s2 * (int16_t)s1 >> 8;
1448 }
1449 
1450 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1451 {
1452     return (int32_t)s2 * (int32_t)s1 >> 16;
1453 }
1454 
1455 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1456 {
1457     return (int64_t)s2 * (int64_t)s1 >> 32;
1458 }
1459 
1460 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1461 {
1462     uint64_t hi_64, lo_64;
1463 
1464     muls64(&lo_64, &hi_64, s1, s2);
1465     return hi_64;
1466 }
1467 
1468 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1469 {
1470     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1471 }
1472 
1473 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1474 {
1475     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1476 }
1477 
1478 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1479 {
1480     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1481 }
1482 
1483 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1484 {
1485     uint64_t hi_64, lo_64;
1486 
1487     mulu64(&lo_64, &hi_64, s2, s1);
1488     return hi_64;
1489 }
1490 
1491 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1492 {
1493     return (int16_t)s2 * (uint16_t)s1 >> 8;
1494 }
1495 
1496 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1497 {
1498     return (int32_t)s2 * (uint32_t)s1 >> 16;
1499 }
1500 
1501 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1502 {
1503     return (int64_t)s2 * (uint64_t)s1 >> 32;
1504 }
1505 
1506 /*
1507  * Compute the signed * unsigned high half from an unsigned multiply.
1508  *
1509  * Let  A = signed operand,
1510  *      B = unsigned operand,
1511  *      P = mulu64(A, B), the unsigned product of (A mod 2 ** 64) and B,
1512  *      SP = the signed product A * B
1513  * THEN
1514  *      IF A >= 0
1515  *          (A mod 2 ** 64) = A, so SP = P
1516  *      IF A < 0
1517  *          (A mod 2 ** 64) = A + 2 ** 64, so
1518  *          P  = (A + 2 ** 64) * B
1519  *             = SP + 2 ** 64 * B, i.e.
1520  *          SP = P - 2 ** 64 * B
1521  * and the high half of P needs the correction
1522  *      HI_P -= (A < 0 ? B : 0)
1523  */
1524 
1525 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1526 {
1527     uint64_t hi_64, lo_64;
1528 
1529     mulu64(&lo_64, &hi_64, s2, s1);
1530 
1531     hi_64 -= s2 < 0 ? s1 : 0;
1532     return hi_64;
1533 }
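/*
 * For example, with s2 = -1 and s1 = 2:
 *     mulu64 sees s2 as 2 ** 64 - 1, so hi_64 = 1 and lo_64 = 2 ** 64 - 2;
 *     the correction subtracts s1, giving hi_64 = -1, which together with
 *     lo_64 is the 128-bit representation of -1 * 2 = -2.
 */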
1534 
1535 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1536 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1537 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1538 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1539 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1540 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1541 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1542 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1543 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1544 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1545 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1546 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1547 GEN_VEXT_VV(vmulh_vv_b, 1)
1548 GEN_VEXT_VV(vmulh_vv_h, 2)
1549 GEN_VEXT_VV(vmulh_vv_w, 4)
1550 GEN_VEXT_VV(vmulh_vv_d, 8)
1551 GEN_VEXT_VV(vmulhu_vv_b, 1)
1552 GEN_VEXT_VV(vmulhu_vv_h, 2)
1553 GEN_VEXT_VV(vmulhu_vv_w, 4)
1554 GEN_VEXT_VV(vmulhu_vv_d, 8)
1555 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1556 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1557 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1558 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1559 
1560 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1561 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1562 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1563 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1564 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1565 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1566 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1567 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1568 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1569 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1570 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1571 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1572 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1573 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1574 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1575 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1576 GEN_VEXT_VX(vmul_vx_b)
1577 GEN_VEXT_VX(vmul_vx_h)
1578 GEN_VEXT_VX(vmul_vx_w)
1579 GEN_VEXT_VX(vmul_vx_d)
1580 GEN_VEXT_VX(vmulh_vx_b)
1581 GEN_VEXT_VX(vmulh_vx_h)
1582 GEN_VEXT_VX(vmulh_vx_w)
1583 GEN_VEXT_VX(vmulh_vx_d)
1584 GEN_VEXT_VX(vmulhu_vx_b)
1585 GEN_VEXT_VX(vmulhu_vx_h)
1586 GEN_VEXT_VX(vmulhu_vx_w)
1587 GEN_VEXT_VX(vmulhu_vx_d)
1588 GEN_VEXT_VX(vmulhsu_vx_b)
1589 GEN_VEXT_VX(vmulhsu_vx_h)
1590 GEN_VEXT_VX(vmulhsu_vx_w)
1591 GEN_VEXT_VX(vmulhsu_vx_d)
1592 
1593 /* Vector Integer Divide Instructions */
1594 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1595 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1596 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1597         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1598 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1599         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
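/*
 * These macros fold in the RISC-V special cases: division by zero yields
 * all ones (-1) for DIV/DIVU and leaves the dividend unchanged for
 * REM/REMU, and signed overflow (most negative dividend divided by -1)
 * yields quotient = dividend and remainder = 0.  The
 * (N == -N) && (M == -1) guard catches the overflowing INT64_MIN / -1
 * case so the host never executes that division, which can trap on
 * hosts such as x86.
 */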
1600 
1601 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1602 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1603 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1604 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1605 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1606 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1607 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1608 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1609 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1610 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1611 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1612 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1613 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1614 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1615 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1616 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1617 GEN_VEXT_VV(vdivu_vv_b, 1)
1618 GEN_VEXT_VV(vdivu_vv_h, 2)
1619 GEN_VEXT_VV(vdivu_vv_w, 4)
1620 GEN_VEXT_VV(vdivu_vv_d, 8)
1621 GEN_VEXT_VV(vdiv_vv_b, 1)
1622 GEN_VEXT_VV(vdiv_vv_h, 2)
1623 GEN_VEXT_VV(vdiv_vv_w, 4)
1624 GEN_VEXT_VV(vdiv_vv_d, 8)
1625 GEN_VEXT_VV(vremu_vv_b, 1)
1626 GEN_VEXT_VV(vremu_vv_h, 2)
1627 GEN_VEXT_VV(vremu_vv_w, 4)
1628 GEN_VEXT_VV(vremu_vv_d, 8)
1629 GEN_VEXT_VV(vrem_vv_b, 1)
1630 GEN_VEXT_VV(vrem_vv_h, 2)
1631 GEN_VEXT_VV(vrem_vv_w, 4)
1632 GEN_VEXT_VV(vrem_vv_d, 8)
1633 
1634 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1635 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1636 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1637 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1638 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1639 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1640 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1641 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1642 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1643 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1644 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1645 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1646 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1647 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1648 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1649 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1650 GEN_VEXT_VX(vdivu_vx_b)
1651 GEN_VEXT_VX(vdivu_vx_h)
1652 GEN_VEXT_VX(vdivu_vx_w)
1653 GEN_VEXT_VX(vdivu_vx_d)
1654 GEN_VEXT_VX(vdiv_vx_b)
1655 GEN_VEXT_VX(vdiv_vx_h)
1656 GEN_VEXT_VX(vdiv_vx_w)
1657 GEN_VEXT_VX(vdiv_vx_d)
1658 GEN_VEXT_VX(vremu_vx_b)
1659 GEN_VEXT_VX(vremu_vx_h)
1660 GEN_VEXT_VX(vremu_vx_w)
1661 GEN_VEXT_VX(vremu_vx_d)
1662 GEN_VEXT_VX(vrem_vx_b)
1663 GEN_VEXT_VX(vrem_vx_h)
1664 GEN_VEXT_VX(vrem_vx_w)
1665 GEN_VEXT_VX(vrem_vx_d)
1666 
1667 /* Vector Widening Integer Multiply Instructions */
1668 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1669 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1670 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1671 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1672 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1673 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1674 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1675 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1676 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1677 GEN_VEXT_VV(vwmul_vv_b, 2)
1678 GEN_VEXT_VV(vwmul_vv_h, 4)
1679 GEN_VEXT_VV(vwmul_vv_w, 8)
1680 GEN_VEXT_VV(vwmulu_vv_b, 2)
1681 GEN_VEXT_VV(vwmulu_vv_h, 4)
1682 GEN_VEXT_VV(vwmulu_vv_w, 8)
1683 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1684 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1685 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1686 
1687 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1688 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1689 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1690 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1691 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1692 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1693 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1694 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1695 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1696 GEN_VEXT_VX(vwmul_vx_b)
1697 GEN_VEXT_VX(vwmul_vx_h)
1698 GEN_VEXT_VX(vwmul_vx_w)
1699 GEN_VEXT_VX(vwmulu_vx_b)
1700 GEN_VEXT_VX(vwmulu_vx_h)
1701 GEN_VEXT_VX(vwmulu_vx_w)
1702 GEN_VEXT_VX(vwmulsu_vx_b)
1703 GEN_VEXT_VX(vwmulsu_vx_h)
1704 GEN_VEXT_VX(vwmulsu_vx_w)
1705 
1706 /* Vector Single-Width Integer Multiply-Add Instructions */
1707 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1708 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1709 {                                                                  \
1710     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1711     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1712     TD d = *((TD *)vd + HD(i));                                    \
1713     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1714 }
1715 
1716 #define DO_MACC(N, M, D) (M * N + D)
1717 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1718 #define DO_MADD(N, M, D) (M * D + N)
1719 #define DO_NMSUB(N, M, D) (-(M * D) + N)
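/*
 * OP is invoked as OP(s2, s1, d): with DO_MACC / DO_NMSAC the destination
 * register is the accumulator (vd[i] = +/-(vs1[i] * vs2[i]) + vd[i]),
 * while with DO_MADD / DO_NMSUB it is a multiplicand
 * (vd[i] = +/-(vs1[i] * vd[i]) + vs2[i]).
 */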
1720 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1721 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1722 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1723 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1724 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1725 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1726 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1727 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1728 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1729 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1730 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1731 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1732 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1733 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1734 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1735 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1736 GEN_VEXT_VV(vmacc_vv_b, 1)
1737 GEN_VEXT_VV(vmacc_vv_h, 2)
1738 GEN_VEXT_VV(vmacc_vv_w, 4)
1739 GEN_VEXT_VV(vmacc_vv_d, 8)
1740 GEN_VEXT_VV(vnmsac_vv_b, 1)
1741 GEN_VEXT_VV(vnmsac_vv_h, 2)
1742 GEN_VEXT_VV(vnmsac_vv_w, 4)
1743 GEN_VEXT_VV(vnmsac_vv_d, 8)
1744 GEN_VEXT_VV(vmadd_vv_b, 1)
1745 GEN_VEXT_VV(vmadd_vv_h, 2)
1746 GEN_VEXT_VV(vmadd_vv_w, 4)
1747 GEN_VEXT_VV(vmadd_vv_d, 8)
1748 GEN_VEXT_VV(vnmsub_vv_b, 1)
1749 GEN_VEXT_VV(vnmsub_vv_h, 2)
1750 GEN_VEXT_VV(vnmsub_vv_w, 4)
1751 GEN_VEXT_VV(vnmsub_vv_d, 8)
1752 
1753 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1754 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1755 {                                                                   \
1756     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1757     TD d = *((TD *)vd + HD(i));                                     \
1758     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1759 }
1760 
1761 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1762 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1763 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1764 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1765 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1766 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1767 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1768 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1769 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1770 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1771 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1772 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1773 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1774 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1775 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1776 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1777 GEN_VEXT_VX(vmacc_vx_b)
1778 GEN_VEXT_VX(vmacc_vx_h)
1779 GEN_VEXT_VX(vmacc_vx_w)
1780 GEN_VEXT_VX(vmacc_vx_d)
1781 GEN_VEXT_VX(vnmsac_vx_b)
1782 GEN_VEXT_VX(vnmsac_vx_h)
1783 GEN_VEXT_VX(vnmsac_vx_w)
1784 GEN_VEXT_VX(vnmsac_vx_d)
1785 GEN_VEXT_VX(vmadd_vx_b)
1786 GEN_VEXT_VX(vmadd_vx_h)
1787 GEN_VEXT_VX(vmadd_vx_w)
1788 GEN_VEXT_VX(vmadd_vx_d)
1789 GEN_VEXT_VX(vnmsub_vx_b)
1790 GEN_VEXT_VX(vnmsub_vx_h)
1791 GEN_VEXT_VX(vnmsub_vx_w)
1792 GEN_VEXT_VX(vnmsub_vx_d)
1793 
1794 /* Vector Widening Integer Multiply-Add Instructions */
1795 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1796 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1797 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1798 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1799 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1800 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1801 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1802 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1803 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1804 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1805 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1806 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1807 GEN_VEXT_VV(vwmacc_vv_b, 2)
1808 GEN_VEXT_VV(vwmacc_vv_h, 4)
1809 GEN_VEXT_VV(vwmacc_vv_w, 8)
1810 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1811 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1812 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1813 
1814 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1815 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1816 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1817 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1818 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1819 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1820 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1821 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1822 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1823 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1824 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1825 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1826 GEN_VEXT_VX(vwmaccu_vx_b)
1827 GEN_VEXT_VX(vwmaccu_vx_h)
1828 GEN_VEXT_VX(vwmaccu_vx_w)
1829 GEN_VEXT_VX(vwmacc_vx_b)
1830 GEN_VEXT_VX(vwmacc_vx_h)
1831 GEN_VEXT_VX(vwmacc_vx_w)
1832 GEN_VEXT_VX(vwmaccsu_vx_b)
1833 GEN_VEXT_VX(vwmaccsu_vx_h)
1834 GEN_VEXT_VX(vwmaccsu_vx_w)
1835 GEN_VEXT_VX(vwmaccus_vx_b)
1836 GEN_VEXT_VX(vwmaccus_vx_h)
1837 GEN_VEXT_VX(vwmaccus_vx_w)
1838 
1839 /* Vector Integer Merge and Move Instructions */
1840 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1841 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1842                   uint32_t desc)                                     \
1843 {                                                                    \
1844     uint32_t vl = env->vl;                                           \
1845     uint32_t i;                                                      \
1846                                                                      \
1847     for (i = env->vstart; i < vl; i++) {                             \
1848         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1849         *((ETYPE *)vd + H(i)) = s1;                                  \
1850     }                                                                \
1851     env->vstart = 0;                                                 \
1852 }
1853 
1854 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1855 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1856 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1857 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1858 
1859 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1860 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1861                   uint32_t desc)                                     \
1862 {                                                                    \
1863     uint32_t vl = env->vl;                                           \
1864     uint32_t i;                                                      \
1865                                                                      \
1866     for (i = env->vstart; i < vl; i++) {                             \
1867         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1868     }                                                                \
1869     env->vstart = 0;                                                 \
1870 }
1871 
1872 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1873 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1874 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1875 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1876 
1877 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1878 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1879                   CPURISCVState *env, uint32_t desc)                 \
1880 {                                                                    \
1881     uint32_t vl = env->vl;                                           \
1882     uint32_t i;                                                      \
1883                                                                      \
1884     for (i = env->vstart; i < vl; i++) {                             \
1885         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1886         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1887     }                                                                \
1888     env->vstart = 0;                                                 \
1889 }
1890 
1891 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1892 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1893 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1894 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1895 
1896 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1897 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1898                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1899 {                                                                    \
1900     uint32_t vl = env->vl;                                           \
1901     uint32_t i;                                                      \
1902                                                                      \
1903     for (i = env->vstart; i < vl; i++) {                             \
1904         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
1905         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
1906                    (ETYPE)(target_long)s1);                          \
1907         *((ETYPE *)vd + H(i)) = d;                                   \
1908     }                                                                \
1909     env->vstart = 0;                                                 \
1910 }
1911 
1912 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
1913 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
1914 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
1915 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1916 
1917 /*
1918  *** Vector Fixed-Point Arithmetic Instructions
1919  */
1920 
1921 /* Vector Single-Width Saturating Add and Subtract */
1922 
1923 /*
1924  * Fixed-point instructions use the vxrm rounding mode and the vxsat
1925  * saturation flag, so define the common macros for them here.
1926  */
1927 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
1928                           CPURISCVState *env, int vxrm);
1929 
1930 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
1931 static inline void                                                  \
1932 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
1933           CPURISCVState *env, int vxrm)                             \
1934 {                                                                   \
1935     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
1936     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1937     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
1938 }
1939 
1940 static inline void
1941 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1942              CPURISCVState *env,
1943              uint32_t vl, uint32_t vm, int vxrm,
1944              opivv2_rm_fn *fn)
1945 {
1946     for (uint32_t i = env->vstart; i < vl; i++) {
1947         if (!vm && !vext_elem_mask(v0, i)) {
1948             continue;
1949         }
1950         fn(vd, vs1, vs2, i, env, vxrm);
1951     }
1952     env->vstart = 0;
1953 }
1954 
1955 static inline void
1956 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
1957              CPURISCVState *env,
1958              uint32_t desc,
1959              opivv2_rm_fn *fn)
1960 {
1961     uint32_t vm = vext_vm(desc);
1962     uint32_t vl = env->vl;
1963 
1964     switch (env->vxrm) {
1965     case 0: /* rnu */
1966         vext_vv_rm_1(vd, v0, vs1, vs2,
1967                      env, vl, vm, 0, fn);
1968         break;
1969     case 1: /* rne */
1970         vext_vv_rm_1(vd, v0, vs1, vs2,
1971                      env, vl, vm, 1, fn);
1972         break;
1973     case 2: /* rdn */
1974         vext_vv_rm_1(vd, v0, vs1, vs2,
1975                      env, vl, vm, 2, fn);
1976         break;
1977     default: /* rod */
1978         vext_vv_rm_1(vd, v0, vs1, vs2,
1979                      env, vl, vm, 3, fn);
1980         break;
1981     }
1982 }
1983 
1984 /* generate helpers for fixed point instructions with OPIVV format */
1985 #define GEN_VEXT_VV_RM(NAME)                                    \
1986 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
1987                   CPURISCVState *env, uint32_t desc)            \
1988 {                                                               \
1989     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
1990                  do_##NAME);                                    \
1991 }
1992 
1993 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
1994 {
1995     uint8_t res = a + b;
1996     if (res < a) {
1997         res = UINT8_MAX;
1998         env->vxsat = 0x1;
1999     }
2000     return res;
2001 }
2002 
2003 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2004                                uint16_t b)
2005 {
2006     uint16_t res = a + b;
2007     if (res < a) {
2008         res = UINT16_MAX;
2009         env->vxsat = 0x1;
2010     }
2011     return res;
2012 }
2013 
2014 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2015                                uint32_t b)
2016 {
2017     uint32_t res = a + b;
2018     if (res < a) {
2019         res = UINT32_MAX;
2020         env->vxsat = 0x1;
2021     }
2022     return res;
2023 }
2024 
2025 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2026                                uint64_t b)
2027 {
2028     uint64_t res = a + b;
2029     if (res < a) {
2030         res = UINT64_MAX;
2031         env->vxsat = 0x1;
2032     }
2033     return res;
2034 }
2035 
2036 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2037 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2038 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2039 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2040 GEN_VEXT_VV_RM(vsaddu_vv_b)
2041 GEN_VEXT_VV_RM(vsaddu_vv_h)
2042 GEN_VEXT_VV_RM(vsaddu_vv_w)
2043 GEN_VEXT_VV_RM(vsaddu_vv_d)
2044 
2045 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2046                           CPURISCVState *env, int vxrm);
2047 
2048 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2049 static inline void                                                  \
2050 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2051           CPURISCVState *env, int vxrm)                             \
2052 {                                                                   \
2053     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2054     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2055 }
2056 
2057 static inline void
2058 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2059              CPURISCVState *env,
2060              uint32_t vl, uint32_t vm, int vxrm,
2061              opivx2_rm_fn *fn)
2062 {
2063     for (uint32_t i = env->vstart; i < vl; i++) {
2064         if (!vm && !vext_elem_mask(v0, i)) {
2065             continue;
2066         }
2067         fn(vd, s1, vs2, i, env, vxrm);
2068     }
2069     env->vstart = 0;
2070 }
2071 
2072 static inline void
2073 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2074              CPURISCVState *env,
2075              uint32_t desc,
2076              opivx2_rm_fn *fn)
2077 {
2078     uint32_t vm = vext_vm(desc);
2079     uint32_t vl = env->vl;
2080 
2081     switch (env->vxrm) {
2082     case 0: /* rnu */
2083         vext_vx_rm_1(vd, v0, s1, vs2,
2084                      env, vl, vm, 0, fn);
2085         break;
2086     case 1: /* rne */
2087         vext_vx_rm_1(vd, v0, s1, vs2,
2088                      env, vl, vm, 1, fn);
2089         break;
2090     case 2: /* rdn */
2091         vext_vx_rm_1(vd, v0, s1, vs2,
2092                      env, vl, vm, 2, fn);
2093         break;
2094     default: /* rod */
2095         vext_vx_rm_1(vd, v0, s1, vs2,
2096                      env, vl, vm, 3, fn);
2097         break;
2098     }
2099 }
2100 
2101 /* generate helpers for fixed point instructions with OPIVX format */
2102 #define GEN_VEXT_VX_RM(NAME)                              \
2103 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2104         void *vs2, CPURISCVState *env, uint32_t desc)     \
2105 {                                                         \
2106     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2107                  do_##NAME);                              \
2108 }
2109 
2110 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2111 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2112 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2113 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2114 GEN_VEXT_VX_RM(vsaddu_vx_b)
2115 GEN_VEXT_VX_RM(vsaddu_vx_h)
2116 GEN_VEXT_VX_RM(vsaddu_vx_w)
2117 GEN_VEXT_VX_RM(vsaddu_vx_d)
2118 
2119 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2120 {
2121     int8_t res = a + b;
2122     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2123         res = a > 0 ? INT8_MAX : INT8_MIN;
2124         env->vxsat = 0x1;
2125     }
2126     return res;
2127 }
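/*
 * (res ^ a) & (res ^ b) has the sign bit set exactly when the result's
 * sign differs from both operands' signs, i.e. on signed overflow.  For
 * example, adding 100 and 50 wraps res to -106, so the result is clamped
 * to INT8_MAX and vxsat is set.  The 16/32/64-bit variants below use the
 * same idiom.
 */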
2128 
2129 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2130 {
2131     int16_t res = a + b;
2132     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2133         res = a > 0 ? INT16_MAX : INT16_MIN;
2134         env->vxsat = 0x1;
2135     }
2136     return res;
2137 }
2138 
2139 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2140 {
2141     int32_t res = a + b;
2142     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2143         res = a > 0 ? INT32_MAX : INT32_MIN;
2144         env->vxsat = 0x1;
2145     }
2146     return res;
2147 }
2148 
2149 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2150 {
2151     int64_t res = a + b;
2152     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2153         res = a > 0 ? INT64_MAX : INT64_MIN;
2154         env->vxsat = 0x1;
2155     }
2156     return res;
2157 }
2158 
2159 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2160 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2161 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2162 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2163 GEN_VEXT_VV_RM(vsadd_vv_b)
2164 GEN_VEXT_VV_RM(vsadd_vv_h)
2165 GEN_VEXT_VV_RM(vsadd_vv_w)
2166 GEN_VEXT_VV_RM(vsadd_vv_d)
2167 
2168 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2169 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2170 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2171 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2172 GEN_VEXT_VX_RM(vsadd_vx_b)
2173 GEN_VEXT_VX_RM(vsadd_vx_h)
2174 GEN_VEXT_VX_RM(vsadd_vx_w)
2175 GEN_VEXT_VX_RM(vsadd_vx_d)
2176 
2177 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2178 {
2179     uint8_t res = a - b;
2180     if (res > a) {
2181         res = 0;
2182         env->vxsat = 0x1;
2183     }
2184     return res;
2185 }
2186 
2187 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2188                                uint16_t b)
2189 {
2190     uint16_t res = a - b;
2191     if (res > a) {
2192         res = 0;
2193         env->vxsat = 0x1;
2194     }
2195     return res;
2196 }
2197 
2198 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2199                                uint32_t b)
2200 {
2201     uint32_t res = a - b;
2202     if (res > a) {
2203         res = 0;
2204         env->vxsat = 0x1;
2205     }
2206     return res;
2207 }
2208 
2209 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2210                                uint64_t b)
2211 {
2212     uint64_t res = a - b;
2213     if (res > a) {
2214         res = 0;
2215         env->vxsat = 0x1;
2216     }
2217     return res;
2218 }
2219 
2220 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2221 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2222 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2223 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2224 GEN_VEXT_VV_RM(vssubu_vv_b)
2225 GEN_VEXT_VV_RM(vssubu_vv_h)
2226 GEN_VEXT_VV_RM(vssubu_vv_w)
2227 GEN_VEXT_VV_RM(vssubu_vv_d)
2228 
2229 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2230 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2231 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2232 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2233 GEN_VEXT_VX_RM(vssubu_vx_b)
2234 GEN_VEXT_VX_RM(vssubu_vx_h)
2235 GEN_VEXT_VX_RM(vssubu_vx_w)
2236 GEN_VEXT_VX_RM(vssubu_vx_d)
2237 
2238 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2239 {
2240     int8_t res = a - b;
2241     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2242         res = a >= 0 ? INT8_MAX : INT8_MIN;
2243         env->vxsat = 0x1;
2244     }
2245     return res;
2246 }
2247 
2248 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2249 {
2250     int16_t res = a - b;
2251     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2252         res = a >= 0 ? INT16_MAX : INT16_MIN;
2253         env->vxsat = 0x1;
2254     }
2255     return res;
2256 }
2257 
2258 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2259 {
2260     int32_t res = a - b;
2261     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2262         res = a >= 0 ? INT32_MAX : INT32_MIN;
2263         env->vxsat = 0x1;
2264     }
2265     return res;
2266 }
2267 
2268 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2269 {
2270     int64_t res = a - b;
2271     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2272         res = a >= 0 ? INT64_MAX : INT64_MIN;
2273         env->vxsat = 0x1;
2274     }
2275     return res;
2276 }
2277 
2278 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2279 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2280 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2281 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2282 GEN_VEXT_VV_RM(vssub_vv_b)
2283 GEN_VEXT_VV_RM(vssub_vv_h)
2284 GEN_VEXT_VV_RM(vssub_vv_w)
2285 GEN_VEXT_VV_RM(vssub_vv_d)
2286 
2287 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2288 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2289 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2290 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2291 GEN_VEXT_VX_RM(vssub_vx_b)
2292 GEN_VEXT_VX_RM(vssub_vx_h)
2293 GEN_VEXT_VX_RM(vssub_vx_w)
2294 GEN_VEXT_VX_RM(vssub_vx_d)
2295 
2296 /* Vector Single-Width Averaging Add and Subtract */
2297 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2298 {
2299     uint8_t d = extract64(v, shift, 1);
2300     uint8_t d1;
2301     uint64_t D1, D2;
2302 
2303     if (shift == 0 || shift > 64) {
2304         return 0;
2305     }
2306 
2307     d1 = extract64(v, shift - 1, 1);
2308     D1 = extract64(v, 0, shift);
2309     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2310         return d1;
2311     } else if (vxrm == 1) { /* round-to-nearest-even */
2312         if (shift > 1) {
2313             D2 = extract64(v, 0, shift - 1);
2314             return d1 & ((D2 != 0) | d);
2315         } else {
2316             return d1 & d;
2317         }
2318     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2319         return !d & (D1 != 0);
2320     }
2321     return 0; /* round-down (truncate) */
2322 }
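/*
 * get_round() returns the 0/1 increment to apply after shifting v right
 * by 'shift' bits.  For example, with shift = 2:
 *
 *     v = 6  (binary 110,  i.e. 1.5):  rnu 1, rne 1, rdn 0, rod 0
 *     v = 10 (binary 1010, i.e. 2.5):  rnu 1, rne 0, rdn 0, rod 1
 *
 * so 1.5 rounds to 2, 2, 1, 1 and 2.5 rounds to 3, 2, 2, 3 respectively.
 */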
2323 
2324 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2325 {
2326     int64_t res = (int64_t)a + b;
2327     uint8_t round = get_round(vxrm, res, 1);
2328 
2329     return (res >> 1) + round;
2330 }
2331 
2332 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2333 {
2334     int64_t res = a + b;
2335     uint8_t round = get_round(vxrm, res, 1);
2336     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2337 
2338     /* With signed overflow, bit 64 is inverse of bit 63. */
2339     return ((res >> 1) ^ over) + round;
2340 }
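/*
 * For example, aadd64(env, 0, INT64_MAX, INT64_MAX): res wraps to -2 and
 * over = INT64_MIN, so (res >> 1) ^ over = INT64_MAX, the true average,
 * with no rounding increment since bit 0 of res is clear.
 */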
2341 
2342 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2343 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2344 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2345 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2346 GEN_VEXT_VV_RM(vaadd_vv_b)
2347 GEN_VEXT_VV_RM(vaadd_vv_h)
2348 GEN_VEXT_VV_RM(vaadd_vv_w)
2349 GEN_VEXT_VV_RM(vaadd_vv_d)
2350 
2351 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2352 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2353 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2354 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2355 GEN_VEXT_VX_RM(vaadd_vx_b)
2356 GEN_VEXT_VX_RM(vaadd_vx_h)
2357 GEN_VEXT_VX_RM(vaadd_vx_w)
2358 GEN_VEXT_VX_RM(vaadd_vx_d)
2359 
2360 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2361                                uint32_t a, uint32_t b)
2362 {
2363     uint64_t res = (uint64_t)a + b;
2364     uint8_t round = get_round(vxrm, res, 1);
2365 
2366     return (res >> 1) + round;
2367 }
2368 
2369 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2370                                uint64_t a, uint64_t b)
2371 {
2372     uint64_t res = a + b;
2373     uint8_t round = get_round(vxrm, res, 1);
2374     uint64_t over = (uint64_t)(res < a) << 63;
2375 
2376     return ((res >> 1) | over) + round;
2377 }
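/*
 * Here 'over' is the carry out of the 64-bit addition, OR-ed back in as
 * bit 63 of res >> 1, so the result is the full 65-bit sum shifted right
 * by one, plus the rounding increment.
 */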
2378 
2379 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2380 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2381 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2382 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2383 GEN_VEXT_VV_RM(vaaddu_vv_b)
2384 GEN_VEXT_VV_RM(vaaddu_vv_h)
2385 GEN_VEXT_VV_RM(vaaddu_vv_w)
2386 GEN_VEXT_VV_RM(vaaddu_vv_d)
2387 
2388 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2389 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2390 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2391 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2392 GEN_VEXT_VX_RM(vaaddu_vx_b)
2393 GEN_VEXT_VX_RM(vaaddu_vx_h)
2394 GEN_VEXT_VX_RM(vaaddu_vx_w)
2395 GEN_VEXT_VX_RM(vaaddu_vx_d)
2396 
2397 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2398 {
2399     int64_t res = (int64_t)a - b;
2400     uint8_t round = get_round(vxrm, res, 1);
2401 
2402     return (res >> 1) + round;
2403 }
2404 
2405 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2406 {
2407     int64_t res = (int64_t)a - b;
2408     uint8_t round = get_round(vxrm, res, 1);
2409     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2410 
2411     /* With signed overflow, bit 64 is inverse of bit 63. */
2412     return ((res >> 1) ^ over) + round;
2413 }
2414 
2415 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2416 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2417 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2418 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2419 GEN_VEXT_VV_RM(vasub_vv_b)
2420 GEN_VEXT_VV_RM(vasub_vv_h)
2421 GEN_VEXT_VV_RM(vasub_vv_w)
2422 GEN_VEXT_VV_RM(vasub_vv_d)
2423 
2424 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2425 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2426 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2427 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2428 GEN_VEXT_VX_RM(vasub_vx_b)
2429 GEN_VEXT_VX_RM(vasub_vx_h)
2430 GEN_VEXT_VX_RM(vasub_vx_w)
2431 GEN_VEXT_VX_RM(vasub_vx_d)
2432 
2433 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2434                                uint32_t a, uint32_t b)
2435 {
2436     int64_t res = (int64_t)a - b;
2437     uint8_t round = get_round(vxrm, res, 1);
2438 
2439     return (res >> 1) + round;
2440 }
2441 
2442 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2443                                uint64_t a, uint64_t b)
2444 {
2445     uint64_t res = (uint64_t)a - b;
2446     uint8_t round = get_round(vxrm, res, 1);
2447     uint64_t over = (uint64_t)(res > a) << 63;
2448 
2449     return ((res >> 1) | over) + round;
2450 }
2451 
2452 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2453 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2454 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2455 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2456 GEN_VEXT_VV_RM(vasubu_vv_b)
2457 GEN_VEXT_VV_RM(vasubu_vv_h)
2458 GEN_VEXT_VV_RM(vasubu_vv_w)
2459 GEN_VEXT_VV_RM(vasubu_vv_d)
2460 
2461 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2462 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2463 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2464 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2465 GEN_VEXT_VX_RM(vasubu_vx_b)
2466 GEN_VEXT_VX_RM(vasubu_vx_h)
2467 GEN_VEXT_VX_RM(vasubu_vx_w)
2468 GEN_VEXT_VX_RM(vasubu_vx_d)
2469 
2470 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2471 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2472 {
2473     uint8_t round;
2474     int16_t res;
2475 
2476     res = (int16_t)a * (int16_t)b;
2477     round = get_round(vxrm, res, 7);
2478     res   = (res >> 7) + round;
2479 
2480     if (res > INT8_MAX) {
2481         env->vxsat = 0x1;
2482         return INT8_MAX;
2483     } else if (res < INT8_MIN) {
2484         env->vxsat = 0x1;
2485         return INT8_MIN;
2486     } else {
2487         return res;
2488     }
2489 }
2490 
2491 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2492 {
2493     uint8_t round;
2494     int32_t res;
2495 
2496     res = (int32_t)a * (int32_t)b;
2497     round = get_round(vxrm, res, 15);
2498     res   = (res >> 15) + round;
2499 
2500     if (res > INT16_MAX) {
2501         env->vxsat = 0x1;
2502         return INT16_MAX;
2503     } else if (res < INT16_MIN) {
2504         env->vxsat = 0x1;
2505         return INT16_MIN;
2506     } else {
2507         return res;
2508     }
2509 }
2510 
2511 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2512 {
2513     uint8_t round;
2514     int64_t res;
2515 
2516     res = (int64_t)a * (int64_t)b;
2517     round = get_round(vxrm, res, 31);
2518     res   = (res >> 31) + round;
2519 
2520     if (res > INT32_MAX) {
2521         env->vxsat = 0x1;
2522         return INT32_MAX;
2523     } else if (res < INT32_MIN) {
2524         env->vxsat = 0x1;
2525         return INT32_MIN;
2526     } else {
2527         return res;
2528     }
2529 }
2530 
2531 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2532 {
2533     uint8_t round;
2534     uint64_t hi_64, lo_64;
2535     int64_t res;
2536 
2537     if (a == INT64_MIN && b == INT64_MIN) {
2538         env->vxsat = 1;
2539         return INT64_MAX;
2540     }
2541 
2542     muls64(&lo_64, &hi_64, a, b);
2543     round = get_round(vxrm, lo_64, 63);
2544     /*
2545      * Cannot overflow, as there are always
2546      * 2 sign bits after multiply.
2547      */
2548     res = (hi_64 << 1) | (lo_64 >> 63);
2549     if (round) {
2550         if (res == INT64_MAX) {
2551             env->vxsat = 1;
2552         } else {
2553             res += 1;
2554         }
2555     }
2556     return res;
2557 }
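/*
 * vsmul treats the operands as signed fixed-point fractions and returns
 * (a * b) >> (SEW - 1) with rounding and saturation.  The only product
 * that cannot be represented is (-1.0) * (-1.0): e.g. in the 8-bit case,
 * a = b = INT8_MIN gives 16384 >> 7 = 128, which is clamped to INT8_MAX
 * with vxsat set.  vsmul64 has no wider intermediate type, so that one
 * case is filtered out explicitly before the muls64() above.
 */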
2558 
2559 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2560 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2561 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2562 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2563 GEN_VEXT_VV_RM(vsmul_vv_b)
2564 GEN_VEXT_VV_RM(vsmul_vv_h)
2565 GEN_VEXT_VV_RM(vsmul_vv_w)
2566 GEN_VEXT_VV_RM(vsmul_vv_d)
2567 
2568 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2569 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2570 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2571 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2572 GEN_VEXT_VX_RM(vsmul_vx_b)
2573 GEN_VEXT_VX_RM(vsmul_vx_h)
2574 GEN_VEXT_VX_RM(vsmul_vx_w)
2575 GEN_VEXT_VX_RM(vsmul_vx_d)
2576 
2577 /* Vector Single-Width Scaling Shift Instructions */
2578 static inline uint8_t
2579 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2580 {
2581     uint8_t round, shift = b & 0x7;
2582     uint8_t res;
2583 
2584     round = get_round(vxrm, a, shift);
2585     res   = (a >> shift)  + round;
2586     return res;
2587 }
2588 static inline uint16_t
2589 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2590 {
2591     uint8_t round, shift = b & 0xf;
2592     uint16_t res;
2593 
2594     round = get_round(vxrm, a, shift);
2595     res   = (a >> shift)  + round;
2596     return res;
2597 }
2598 static inline uint32_t
2599 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2600 {
2601     uint8_t round, shift = b & 0x1f;
2602     uint32_t res;
2603 
2604     round = get_round(vxrm, a, shift);
2605     res   = (a >> shift)  + round;
2606     return res;
2607 }
2608 static inline uint64_t
2609 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2610 {
2611     uint8_t round, shift = b & 0x3f;
2612     uint64_t res;
2613 
2614     round = get_round(vxrm, a, shift);
2615     res   = (a >> shift)  + round;
2616     return res;
2617 }
2618 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2619 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2620 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2621 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2622 GEN_VEXT_VV_RM(vssrl_vv_b)
2623 GEN_VEXT_VV_RM(vssrl_vv_h)
2624 GEN_VEXT_VV_RM(vssrl_vv_w)
2625 GEN_VEXT_VV_RM(vssrl_vv_d)
2626 
2627 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2628 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2629 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2630 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2631 GEN_VEXT_VX_RM(vssrl_vx_b)
2632 GEN_VEXT_VX_RM(vssrl_vx_h)
2633 GEN_VEXT_VX_RM(vssrl_vx_w)
2634 GEN_VEXT_VX_RM(vssrl_vx_d)
2635 
2636 static inline int8_t
2637 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2638 {
2639     uint8_t round, shift = b & 0x7;
2640     int8_t res;
2641 
2642     round = get_round(vxrm, a, shift);
2643     res   = (a >> shift)  + round;
2644     return res;
2645 }
2646 static inline int16_t
2647 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2648 {
2649     uint8_t round, shift = b & 0xf;
2650     int16_t res;
2651 
2652     round = get_round(vxrm, a, shift);
2653     res   = (a >> shift)  + round;
2654     return res;
2655 }
2656 static inline int32_t
2657 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2658 {
2659     uint8_t round, shift = b & 0x1f;
2660     int32_t res;
2661 
2662     round = get_round(vxrm, a, shift);
2663     res   = (a >> shift)  + round;
2664     return res;
2665 }
2666 static inline int64_t
2667 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2668 {
2669     uint8_t round, shift = b & 0x3f;
2670     int64_t res;
2671 
2672     round = get_round(vxrm, a, shift);
2673     res   = (a >> shift)  + round;
2674     return res;
2675 }
2676 
2677 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2678 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2679 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2680 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2681 GEN_VEXT_VV_RM(vssra_vv_b)
2682 GEN_VEXT_VV_RM(vssra_vv_h)
2683 GEN_VEXT_VV_RM(vssra_vv_w)
2684 GEN_VEXT_VV_RM(vssra_vv_d)
2685 
2686 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2687 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2688 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2689 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2690 GEN_VEXT_VX_RM(vssra_vx_b)
2691 GEN_VEXT_VX_RM(vssra_vx_h)
2692 GEN_VEXT_VX_RM(vssra_vx_w)
2693 GEN_VEXT_VX_RM(vssra_vx_d)
2694 
2695 /* Vector Narrowing Fixed-Point Clip Instructions */
2696 static inline int8_t
2697 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2698 {
2699     uint8_t round, shift = b & 0xf;
2700     int16_t res;
2701 
2702     round = get_round(vxrm, a, shift);
2703     res   = (a >> shift)  + round;
2704     if (res > INT8_MAX) {
2705         env->vxsat = 0x1;
2706         return INT8_MAX;
2707     } else if (res < INT8_MIN) {
2708         env->vxsat = 0x1;
2709         return INT8_MIN;
2710     } else {
2711         return res;
2712     }
2713 }
2714 
2715 static inline int16_t
2716 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2717 {
2718     uint8_t round, shift = b & 0x1f;
2719     int32_t res;
2720 
2721     round = get_round(vxrm, a, shift);
2722     res   = (a >> shift)  + round;
2723     if (res > INT16_MAX) {
2724         env->vxsat = 0x1;
2725         return INT16_MAX;
2726     } else if (res < INT16_MIN) {
2727         env->vxsat = 0x1;
2728         return INT16_MIN;
2729     } else {
2730         return res;
2731     }
2732 }
2733 
2734 static inline int32_t
2735 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2736 {
2737     uint8_t round, shift = b & 0x3f;
2738     int64_t res;
2739 
2740     round = get_round(vxrm, a, shift);
2741     res   = (a >> shift)  + round;
2742     if (res > INT32_MAX) {
2743         env->vxsat = 0x1;
2744         return INT32_MAX;
2745     } else if (res < INT32_MIN) {
2746         env->vxsat = 0x1;
2747         return INT32_MIN;
2748     } else {
2749         return res;
2750     }
2751 }
2752 
2753 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2754 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2755 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2756 GEN_VEXT_VV_RM(vnclip_wv_b)
2757 GEN_VEXT_VV_RM(vnclip_wv_h)
2758 GEN_VEXT_VV_RM(vnclip_wv_w)
2759 
2760 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2761 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2762 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2763 GEN_VEXT_VX_RM(vnclip_wx_b)
2764 GEN_VEXT_VX_RM(vnclip_wx_h)
2765 GEN_VEXT_VX_RM(vnclip_wx_w)
2766 
2767 static inline uint8_t
2768 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2769 {
2770     uint8_t round, shift = b & 0xf;
2771     uint16_t res;
2772 
2773     round = get_round(vxrm, a, shift);
2774     res   = (a >> shift)  + round;
2775     if (res > UINT8_MAX) {
2776         env->vxsat = 0x1;
2777         return UINT8_MAX;
2778     } else {
2779         return res;
2780     }
2781 }
2782 
2783 static inline uint16_t
2784 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2785 {
2786     uint8_t round, shift = b & 0x1f;
2787     uint32_t res;
2788 
2789     round = get_round(vxrm, a, shift);
2790     res   = (a >> shift)  + round;
2791     if (res > UINT16_MAX) {
2792         env->vxsat = 0x1;
2793         return UINT16_MAX;
2794     } else {
2795         return res;
2796     }
2797 }
2798 
2799 static inline uint32_t
2800 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2801 {
2802     uint8_t round, shift = b & 0x3f;
2803     uint64_t res;
2804 
2805     round = get_round(vxrm, a, shift);
2806     res   = (a >> shift)  + round;
2807     if (res > UINT32_MAX) {
2808         env->vxsat = 0x1;
2809         return UINT32_MAX;
2810     } else {
2811         return res;
2812     }
2813 }
2814 
2815 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2816 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2817 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2818 GEN_VEXT_VV_RM(vnclipu_wv_b)
2819 GEN_VEXT_VV_RM(vnclipu_wv_h)
2820 GEN_VEXT_VV_RM(vnclipu_wv_w)
2821 
2822 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2823 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2824 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2825 GEN_VEXT_VX_RM(vnclipu_wx_b)
2826 GEN_VEXT_VX_RM(vnclipu_wx_h)
2827 GEN_VEXT_VX_RM(vnclipu_wx_w)
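
/*
 * Illustration only: each narrowing clip above is a shift-with-rounding of
 * the double-width source followed by a saturating narrow.  A hypothetical,
 * standalone sketch of that final signed step (the unsigned variants only
 * need the upper clamp):
 */
static inline int8_t sat_narrow_int16_to_int8_sketch(int16_t res, bool *sat)
{
    /* E.g. res = 300 saturates to INT8_MAX and reports saturation (vxsat). */
    if (res > INT8_MAX) {
        *sat = true;
        return INT8_MAX;
    } else if (res < INT8_MIN) {
        *sat = true;
        return INT8_MIN;
    }
    return res;
}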
2828 
2829 /*
2830  *** Vector Floating-Point Arithmetic Instructions
2831  */
2832 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2833 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2834 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2835                       CPURISCVState *env)                      \
2836 {                                                              \
2837     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2838     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2839     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2840 }
2841 
2842 #define GEN_VEXT_VV_ENV(NAME)                             \
2843 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2844                   void *vs2, CPURISCVState *env,          \
2845                   uint32_t desc)                          \
2846 {                                                         \
2847     uint32_t vm = vext_vm(desc);                          \
2848     uint32_t vl = env->vl;                                \
2849     uint32_t i;                                           \
2850                                                           \
2851     for (i = env->vstart; i < vl; i++) {                  \
2852         if (!vm && !vext_elem_mask(v0, i)) {              \
2853             continue;                                     \
2854         }                                                 \
2855         do_##NAME(vd, vs1, vs2, i, env);                  \
2856     }                                                     \
2857     env->vstart = 0;                                      \
2858 }
2859 
2860 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2861 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2862 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2863 GEN_VEXT_VV_ENV(vfadd_vv_h)
2864 GEN_VEXT_VV_ENV(vfadd_vv_w)
2865 GEN_VEXT_VV_ENV(vfadd_vv_d)
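
/*
 * For reference, the RVVCALL(OPFVV2, vfadd_vv_h, ...) line above expands
 * roughly to:
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * and GEN_VEXT_VV_ENV(vfadd_vv_h) wraps it in the masked element loop of
 * HELPER(vfadd_vv_h).  The OPFVF2/GEN_VEXT_VF forms below are the same
 * except that the vs1 element is replaced by the broadcast scalar s1.
 */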
2866 
2867 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2868 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2869                       CPURISCVState *env)                      \
2870 {                                                              \
2871     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2872     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2873 }
2874 
2875 #define GEN_VEXT_VF(NAME)                                 \
2876 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2877                   void *vs2, CPURISCVState *env,          \
2878                   uint32_t desc)                          \
2879 {                                                         \
2880     uint32_t vm = vext_vm(desc);                          \
2881     uint32_t vl = env->vl;                                \
2882     uint32_t i;                                           \
2883                                                           \
2884     for (i = env->vstart; i < vl; i++) {                  \
2885         if (!vm && !vext_elem_mask(v0, i)) {              \
2886             continue;                                     \
2887         }                                                 \
2888         do_##NAME(vd, s1, vs2, i, env);                   \
2889     }                                                     \
2890     env->vstart = 0;                                      \
2891 }
2892 
2893 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
2894 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
2895 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
2896 GEN_VEXT_VF(vfadd_vf_h)
2897 GEN_VEXT_VF(vfadd_vf_w)
2898 GEN_VEXT_VF(vfadd_vf_d)
2899 
2900 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
2901 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
2902 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
2903 GEN_VEXT_VV_ENV(vfsub_vv_h)
2904 GEN_VEXT_VV_ENV(vfsub_vv_w)
2905 GEN_VEXT_VV_ENV(vfsub_vv_d)
2906 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
2907 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
2908 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
2909 GEN_VEXT_VF(vfsub_vf_h)
2910 GEN_VEXT_VF(vfsub_vf_w)
2911 GEN_VEXT_VF(vfsub_vf_d)
2912 
2913 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
2914 {
2915     return float16_sub(b, a, s);
2916 }
2917 
2918 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
2919 {
2920     return float32_sub(b, a, s);
2921 }
2922 
2923 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
2924 {
2925     return float64_sub(b, a, s);
2926 }
2927 
2928 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
2929 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
2930 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
2931 GEN_VEXT_VF(vfrsub_vf_h)
2932 GEN_VEXT_VF(vfrsub_vf_w)
2933 GEN_VEXT_VF(vfrsub_vf_d)
2934 
2935 /* Vector Widening Floating-Point Add/Subtract Instructions */
2936 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
2937 {
2938     return float32_add(float16_to_float32(a, true, s),
2939             float16_to_float32(b, true, s), s);
2940 }
2941 
2942 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
2943 {
2944     return float64_add(float32_to_float64(a, s),
2945             float32_to_float64(b, s), s);
2947 }
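
/*
 * The widening helpers promote both operands to the next wider format and
 * operate there; the 'true' passed to float16_to_float32() selects the IEEE
 * half-precision interpretation of the input (rather than the alternative
 * half-precision encoding).
 */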
2948 
2949 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
2950 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
2951 GEN_VEXT_VV_ENV(vfwadd_vv_h)
2952 GEN_VEXT_VV_ENV(vfwadd_vv_w)
2953 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
2954 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
2955 GEN_VEXT_VF(vfwadd_vf_h)
2956 GEN_VEXT_VF(vfwadd_vf_w)
2957 
2958 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
2959 {
2960     return float32_sub(float16_to_float32(a, true, s),
2961             float16_to_float32(b, true, s), s);
2962 }
2963 
2964 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
2965 {
2966     return float64_sub(float32_to_float64(a, s),
2967             float32_to_float64(b, s), s);
2969 }
2970 
2971 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
2972 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
2973 GEN_VEXT_VV_ENV(vfwsub_vv_h)
2974 GEN_VEXT_VV_ENV(vfwsub_vv_w)
2975 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
2976 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
2977 GEN_VEXT_VF(vfwsub_vf_h)
2978 GEN_VEXT_VF(vfwsub_vf_w)
2979 
2980 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
2981 {
2982     return float32_add(a, float16_to_float32(b, true, s), s);
2983 }
2984 
2985 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
2986 {
2987     return float64_add(a, float32_to_float64(b, s), s);
2988 }
2989 
2990 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
2991 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
2992 GEN_VEXT_VV_ENV(vfwadd_wv_h)
2993 GEN_VEXT_VV_ENV(vfwadd_wv_w)
2994 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
2995 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
2996 GEN_VEXT_VF(vfwadd_wf_h)
2997 GEN_VEXT_VF(vfwadd_wf_w)
2998 
2999 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3000 {
3001     return float32_sub(a, float16_to_float32(b, true, s), s);
3002 }
3003 
3004 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3005 {
3006     return float64_sub(a, float32_to_float64(b, s), s);
3007 }
3008 
3009 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3010 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3011 GEN_VEXT_VV_ENV(vfwsub_wv_h)
3012 GEN_VEXT_VV_ENV(vfwsub_wv_w)
3013 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3014 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3015 GEN_VEXT_VF(vfwsub_wf_h)
3016 GEN_VEXT_VF(vfwsub_wf_w)
3017 
3018 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3019 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3020 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3021 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3022 GEN_VEXT_VV_ENV(vfmul_vv_h)
3023 GEN_VEXT_VV_ENV(vfmul_vv_w)
3024 GEN_VEXT_VV_ENV(vfmul_vv_d)
3025 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3026 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3027 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3028 GEN_VEXT_VF(vfmul_vf_h)
3029 GEN_VEXT_VF(vfmul_vf_w)
3030 GEN_VEXT_VF(vfmul_vf_d)
3031 
3032 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3033 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3034 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3035 GEN_VEXT_VV_ENV(vfdiv_vv_h)
3036 GEN_VEXT_VV_ENV(vfdiv_vv_w)
3037 GEN_VEXT_VV_ENV(vfdiv_vv_d)
3038 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3039 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3040 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3041 GEN_VEXT_VF(vfdiv_vf_h)
3042 GEN_VEXT_VF(vfdiv_vf_w)
3043 GEN_VEXT_VF(vfdiv_vf_d)
3044 
3045 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3046 {
3047     return float16_div(b, a, s);
3048 }
3049 
3050 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3051 {
3052     return float32_div(b, a, s);
3053 }
3054 
3055 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3056 {
3057     return float64_div(b, a, s);
3058 }
3059 
3060 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3061 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3062 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3063 GEN_VEXT_VF(vfrdiv_vf_h)
3064 GEN_VEXT_VF(vfrdiv_vf_w)
3065 GEN_VEXT_VF(vfrdiv_vf_d)
3066 
3067 /* Vector Widening Floating-Point Multiply */
3068 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3069 {
3070     return float32_mul(float16_to_float32(a, true, s),
3071             float16_to_float32(b, true, s), s);
3072 }
3073 
3074 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3075 {
3076     return float64_mul(float32_to_float64(a, s),
3077             float32_to_float64(b, s), s);
3078 }
3079 
3080 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3081 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3082 GEN_VEXT_VV_ENV(vfwmul_vv_h)
3083 GEN_VEXT_VV_ENV(vfwmul_vv_w)
3084 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3085 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3086 GEN_VEXT_VF(vfwmul_vf_h)
3087 GEN_VEXT_VF(vfwmul_vf_w)
3088 
3089 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3090 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3091 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3092         CPURISCVState *env)                                        \
3093 {                                                                  \
3094     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3095     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3096     TD d = *((TD *)vd + HD(i));                                    \
3097     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3098 }
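
/*
 * The macro invokes OP(s2, s1, d, ...), i.e. (vs2, vs1, vd), so
 * fmacc16(a, b, d) = a * b + d implements vfmacc (vd = vs1 * vs2 + vd),
 * while fmadd16 below, float16_muladd(d, b, a, ...), implements vfmadd
 * (vd = vs1 * vd + vs2).  The negated variants only add the corresponding
 * float_muladd_negate_* flags.
 */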
3099 
3100 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3101 {
3102     return float16_muladd(a, b, d, 0, s);
3103 }
3104 
3105 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3106 {
3107     return float32_muladd(a, b, d, 0, s);
3108 }
3109 
3110 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3111 {
3112     return float64_muladd(a, b, d, 0, s);
3113 }
3114 
3115 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3116 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3117 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3118 GEN_VEXT_VV_ENV(vfmacc_vv_h)
3119 GEN_VEXT_VV_ENV(vfmacc_vv_w)
3120 GEN_VEXT_VV_ENV(vfmacc_vv_d)
3121 
3122 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3123 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3124         CPURISCVState *env)                                       \
3125 {                                                                 \
3126     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3127     TD d = *((TD *)vd + HD(i));                                   \
3128     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3129 }
3130 
3131 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3132 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3133 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3134 GEN_VEXT_VF(vfmacc_vf_h)
3135 GEN_VEXT_VF(vfmacc_vf_w)
3136 GEN_VEXT_VF(vfmacc_vf_d)
3137 
3138 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3139 {
3140     return float16_muladd(a, b, d,
3141             float_muladd_negate_c | float_muladd_negate_product, s);
3142 }
3143 
3144 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3145 {
3146     return float32_muladd(a, b, d,
3147             float_muladd_negate_c | float_muladd_negate_product, s);
3148 }
3149 
3150 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3151 {
3152     return float64_muladd(a, b, d,
3153             float_muladd_negate_c | float_muladd_negate_product, s);
3154 }
3155 
3156 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3157 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3158 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3159 GEN_VEXT_VV_ENV(vfnmacc_vv_h)
3160 GEN_VEXT_VV_ENV(vfnmacc_vv_w)
3161 GEN_VEXT_VV_ENV(vfnmacc_vv_d)
3162 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3163 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3164 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3165 GEN_VEXT_VF(vfnmacc_vf_h)
3166 GEN_VEXT_VF(vfnmacc_vf_w)
3167 GEN_VEXT_VF(vfnmacc_vf_d)
3168 
3169 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3170 {
3171     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3172 }
3173 
3174 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3175 {
3176     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3177 }
3178 
3179 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3180 {
3181     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3182 }
3183 
3184 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3185 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3186 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3187 GEN_VEXT_VV_ENV(vfmsac_vv_h)
3188 GEN_VEXT_VV_ENV(vfmsac_vv_w)
3189 GEN_VEXT_VV_ENV(vfmsac_vv_d)
3190 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3191 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3192 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3193 GEN_VEXT_VF(vfmsac_vf_h)
3194 GEN_VEXT_VF(vfmsac_vf_w)
3195 GEN_VEXT_VF(vfmsac_vf_d)
3196 
3197 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3198 {
3199     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3200 }
3201 
3202 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3203 {
3204     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3205 }
3206 
3207 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3208 {
3209     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3210 }
3211 
3212 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3213 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3214 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3215 GEN_VEXT_VV_ENV(vfnmsac_vv_h)
3216 GEN_VEXT_VV_ENV(vfnmsac_vv_w)
3217 GEN_VEXT_VV_ENV(vfnmsac_vv_d)
3218 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3219 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3220 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3221 GEN_VEXT_VF(vfnmsac_vf_h)
3222 GEN_VEXT_VF(vfnmsac_vf_w)
3223 GEN_VEXT_VF(vfnmsac_vf_d)
3224 
3225 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3226 {
3227     return float16_muladd(d, b, a, 0, s);
3228 }
3229 
3230 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3231 {
3232     return float32_muladd(d, b, a, 0, s);
3233 }
3234 
3235 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3236 {
3237     return float64_muladd(d, b, a, 0, s);
3238 }
3239 
3240 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3241 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3242 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3243 GEN_VEXT_VV_ENV(vfmadd_vv_h)
3244 GEN_VEXT_VV_ENV(vfmadd_vv_w)
3245 GEN_VEXT_VV_ENV(vfmadd_vv_d)
3246 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3247 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3248 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3249 GEN_VEXT_VF(vfmadd_vf_h)
3250 GEN_VEXT_VF(vfmadd_vf_w)
3251 GEN_VEXT_VF(vfmadd_vf_d)
3252 
3253 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3254 {
3255     return float16_muladd(d, b, a,
3256             float_muladd_negate_c | float_muladd_negate_product, s);
3257 }
3258 
3259 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3260 {
3261     return float32_muladd(d, b, a,
3262             float_muladd_negate_c | float_muladd_negate_product, s);
3263 }
3264 
3265 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3266 {
3267     return float64_muladd(d, b, a,
3268             float_muladd_negate_c | float_muladd_negate_product, s);
3269 }
3270 
3271 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3272 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3273 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3274 GEN_VEXT_VV_ENV(vfnmadd_vv_h)
3275 GEN_VEXT_VV_ENV(vfnmadd_vv_w)
3276 GEN_VEXT_VV_ENV(vfnmadd_vv_d)
3277 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3278 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3279 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3280 GEN_VEXT_VF(vfnmadd_vf_h)
3281 GEN_VEXT_VF(vfnmadd_vf_w)
3282 GEN_VEXT_VF(vfnmadd_vf_d)
3283 
3284 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3285 {
3286     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3287 }
3288 
3289 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3290 {
3291     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3292 }
3293 
3294 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3295 {
3296     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3297 }
3298 
3299 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3300 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3301 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3302 GEN_VEXT_VV_ENV(vfmsub_vv_h)
3303 GEN_VEXT_VV_ENV(vfmsub_vv_w)
3304 GEN_VEXT_VV_ENV(vfmsub_vv_d)
3305 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3306 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3307 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3308 GEN_VEXT_VF(vfmsub_vf_h)
3309 GEN_VEXT_VF(vfmsub_vf_w)
3310 GEN_VEXT_VF(vfmsub_vf_d)
3311 
3312 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3313 {
3314     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3315 }
3316 
3317 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3318 {
3319     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3320 }
3321 
3322 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3323 {
3324     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3325 }
3326 
3327 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3328 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3329 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3330 GEN_VEXT_VV_ENV(vfnmsub_vv_h)
3331 GEN_VEXT_VV_ENV(vfnmsub_vv_w)
3332 GEN_VEXT_VV_ENV(vfnmsub_vv_d)
3333 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3334 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3335 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3336 GEN_VEXT_VF(vfnmsub_vf_h)
3337 GEN_VEXT_VF(vfnmsub_vf_w)
3338 GEN_VEXT_VF(vfnmsub_vf_d)
3339 
3340 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3341 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3342 {
3343     return float32_muladd(float16_to_float32(a, true, s),
3344                         float16_to_float32(b, true, s), d, 0, s);
3345 }
3346 
3347 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3348 {
3349     return float64_muladd(float32_to_float64(a, s),
3350                         float32_to_float64(b, s), d, 0, s);
3351 }
3352 
3353 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3354 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3355 GEN_VEXT_VV_ENV(vfwmacc_vv_h)
3356 GEN_VEXT_VV_ENV(vfwmacc_vv_w)
3357 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3358 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3359 GEN_VEXT_VF(vfwmacc_vf_h)
3360 GEN_VEXT_VF(vfwmacc_vf_w)
3361 
3362 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3363 {
3364     return float32_muladd(float16_to_float32(a, true, s),
3365                         float16_to_float32(b, true, s), d,
3366                         float_muladd_negate_c | float_muladd_negate_product, s);
3367 }
3368 
3369 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3370 {
3371     return float64_muladd(float32_to_float64(a, s),
3372                         float32_to_float64(b, s), d,
3373                         float_muladd_negate_c | float_muladd_negate_product, s);
3374 }
3375 
3376 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3377 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3378 GEN_VEXT_VV_ENV(vfwnmacc_vv_h)
3379 GEN_VEXT_VV_ENV(vfwnmacc_vv_w)
3380 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3381 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3382 GEN_VEXT_VF(vfwnmacc_vf_h)
3383 GEN_VEXT_VF(vfwnmacc_vf_w)
3384 
3385 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3386 {
3387     return float32_muladd(float16_to_float32(a, true, s),
3388                         float16_to_float32(b, true, s), d,
3389                         float_muladd_negate_c, s);
3390 }
3391 
3392 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3393 {
3394     return float64_muladd(float32_to_float64(a, s),
3395                         float32_to_float64(b, s), d,
3396                         float_muladd_negate_c, s);
3397 }
3398 
3399 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3400 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3401 GEN_VEXT_VV_ENV(vfwmsac_vv_h)
3402 GEN_VEXT_VV_ENV(vfwmsac_vv_w)
3403 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3404 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3405 GEN_VEXT_VF(vfwmsac_vf_h)
3406 GEN_VEXT_VF(vfwmsac_vf_w)
3407 
3408 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3409 {
3410     return float32_muladd(float16_to_float32(a, true, s),
3411                         float16_to_float32(b, true, s), d,
3412                         float_muladd_negate_product, s);
3413 }
3414 
3415 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3416 {
3417     return float64_muladd(float32_to_float64(a, s),
3418                         float32_to_float64(b, s), d,
3419                         float_muladd_negate_product, s);
3420 }
3421 
3422 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3423 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3424 GEN_VEXT_VV_ENV(vfwnmsac_vv_h)
3425 GEN_VEXT_VV_ENV(vfwnmsac_vv_w)
3426 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3427 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3428 GEN_VEXT_VF(vfwnmsac_vf_h)
3429 GEN_VEXT_VF(vfwnmsac_vf_w)
3430 
3431 /* Vector Floating-Point Square-Root Instruction */
3432 /* (TD, T2, TX2) */
3433 #define OP_UU_H uint16_t, uint16_t, uint16_t
3434 #define OP_UU_W uint32_t, uint32_t, uint32_t
3435 #define OP_UU_D uint64_t, uint64_t, uint64_t
3436 
3437 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3438 static void do_##NAME(void *vd, void *vs2, int i,      \
3439         CPURISCVState *env)                            \
3440 {                                                      \
3441     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3442     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3443 }
3444 
3445 #define GEN_VEXT_V_ENV(NAME)                           \
3446 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3447         CPURISCVState *env, uint32_t desc)             \
3448 {                                                      \
3449     uint32_t vm = vext_vm(desc);                       \
3450     uint32_t vl = env->vl;                             \
3451     uint32_t i;                                        \
3452                                                        \
3453     if (vl == 0) {                                     \
3454         return;                                        \
3455     }                                                  \
3456     for (i = env->vstart; i < vl; i++) {               \
3457         if (!vm && !vext_elem_mask(v0, i)) {           \
3458             continue;                                  \
3459         }                                              \
3460         do_##NAME(vd, vs2, i, env);                    \
3461     }                                                  \
3462     env->vstart = 0;                                   \
3463 }
3464 
3465 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3466 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3467 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3468 GEN_VEXT_V_ENV(vfsqrt_v_h)
3469 GEN_VEXT_V_ENV(vfsqrt_v_w)
3470 GEN_VEXT_V_ENV(vfsqrt_v_d)
3471 
3472 /*
3473  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3474  *
3475  * Adapted from riscv-v-spec recip.c:
3476  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3477  */
3478 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3479 {
3480     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3481     uint64_t exp = extract64(f, frac_size, exp_size);
3482     uint64_t frac = extract64(f, 0, frac_size);
3483 
3484     const uint8_t lookup_table[] = {
3485         52, 51, 50, 48, 47, 46, 44, 43,
3486         42, 41, 40, 39, 38, 36, 35, 34,
3487         33, 32, 31, 30, 30, 29, 28, 27,
3488         26, 25, 24, 23, 23, 22, 21, 20,
3489         19, 19, 18, 17, 16, 16, 15, 14,
3490         14, 13, 12, 12, 11, 10, 10, 9,
3491         9, 8, 7, 7, 6, 6, 5, 4,
3492         4, 3, 3, 2, 2, 1, 1, 0,
3493         127, 125, 123, 121, 119, 118, 116, 114,
3494         113, 111, 109, 108, 106, 105, 103, 102,
3495         100, 99, 97, 96, 95, 93, 92, 91,
3496         90, 88, 87, 86, 85, 84, 83, 82,
3497         80, 79, 78, 77, 76, 75, 74, 73,
3498         72, 71, 70, 70, 69, 68, 67, 66,
3499         65, 64, 63, 63, 62, 61, 60, 59,
3500         59, 58, 57, 56, 56, 55, 54, 53
3501     };
3502     const int precision = 7;
3503 
3504     if (exp == 0 && frac != 0) { /* subnormal */
3505         /* Normalize the subnormal. */
3506         while (extract64(frac, frac_size - 1, 1) == 0) {
3507             exp--;
3508             frac <<= 1;
3509         }
3510 
3511         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3512     }
3513 
3514     int idx = ((exp & 1) << (precision - 1)) |
3515                 (frac >> (frac_size - precision + 1));
3516     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3517                             (frac_size - precision);
3518     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3519 
3520     uint64_t val = 0;
3521     val = deposit64(val, 0, frac_size, out_frac);
3522     val = deposit64(val, frac_size, exp_size, out_exp);
3523     val = deposit64(val, frac_size + exp_size, 1, sign);
3524     return val;
3525 }
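
/*
 * Worked example (float32): for f = 4.0, sign = 0, exp = 129 and frac = 0,
 * so idx = ((exp & 1) << 6) | 0 = 64 and lookup_table[64] = 127.  The output
 * exponent works out to (3 * 127 - 1 - 129) / 2 = 125 and the output
 * fraction to 127 << 16, i.e. 2^-2 * (1 + 127/128) = 0.498..., a 7-bit
 * estimate of 1 / sqrt(4) = 0.5.
 */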
3526 
3527 static float16 frsqrt7_h(float16 f, float_status *s)
3528 {
3529     int exp_size = 5, frac_size = 10;
3530     bool sign = float16_is_neg(f);
3531 
3532     /*
3533      * frsqrt7(sNaN) = canonical NaN
3534      * frsqrt7(-inf) = canonical NaN
3535      * frsqrt7(-normal) = canonical NaN
3536      * frsqrt7(-subnormal) = canonical NaN
3537      */
3538     if (float16_is_signaling_nan(f, s) ||
3539             (float16_is_infinity(f) && sign) ||
3540             (float16_is_normal(f) && sign) ||
3541             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3542         s->float_exception_flags |= float_flag_invalid;
3543         return float16_default_nan(s);
3544     }
3545 
3546     /* frsqrt7(qNaN) = canonical NaN */
3547     if (float16_is_quiet_nan(f, s)) {
3548         return float16_default_nan(s);
3549     }
3550 
3551     /* frsqrt7(+-0) = +-inf */
3552     if (float16_is_zero(f)) {
3553         s->float_exception_flags |= float_flag_divbyzero;
3554         return float16_set_sign(float16_infinity, sign);
3555     }
3556 
3557     /* frsqrt7(+inf) = +0 */
3558     if (float16_is_infinity(f) && !sign) {
3559         return float16_set_sign(float16_zero, sign);
3560     }
3561 
3562     /* +normal, +subnormal */
3563     uint64_t val = frsqrt7(f, exp_size, frac_size);
3564     return make_float16(val);
3565 }
3566 
3567 static float32 frsqrt7_s(float32 f, float_status *s)
3568 {
3569     int exp_size = 8, frac_size = 23;
3570     bool sign = float32_is_neg(f);
3571 
3572     /*
3573      * frsqrt7(sNaN) = canonical NaN
3574      * frsqrt7(-inf) = canonical NaN
3575      * frsqrt7(-normal) = canonical NaN
3576      * frsqrt7(-subnormal) = canonical NaN
3577      */
3578     if (float32_is_signaling_nan(f, s) ||
3579             (float32_is_infinity(f) && sign) ||
3580             (float32_is_normal(f) && sign) ||
3581             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3582         s->float_exception_flags |= float_flag_invalid;
3583         return float32_default_nan(s);
3584     }
3585 
3586     /* frsqrt7(qNaN) = canonical NaN */
3587     if (float32_is_quiet_nan(f, s)) {
3588         return float32_default_nan(s);
3589     }
3590 
3591     /* frsqrt7(+-0) = +-inf */
3592     if (float32_is_zero(f)) {
3593         s->float_exception_flags |= float_flag_divbyzero;
3594         return float32_set_sign(float32_infinity, sign);
3595     }
3596 
3597     /* frsqrt7(+inf) = +0 */
3598     if (float32_is_infinity(f) && !sign) {
3599         return float32_set_sign(float32_zero, sign);
3600     }
3601 
3602     /* +normal, +subnormal */
3603     uint64_t val = frsqrt7(f, exp_size, frac_size);
3604     return make_float32(val);
3605 }
3606 
3607 static float64 frsqrt7_d(float64 f, float_status *s)
3608 {
3609     int exp_size = 11, frac_size = 52;
3610     bool sign = float64_is_neg(f);
3611 
3612     /*
3613      * frsqrt7(sNaN) = canonical NaN
3614      * frsqrt7(-inf) = canonical NaN
3615      * frsqrt7(-normal) = canonical NaN
3616      * frsqrt7(-subnormal) = canonical NaN
3617      */
3618     if (float64_is_signaling_nan(f, s) ||
3619             (float64_is_infinity(f) && sign) ||
3620             (float64_is_normal(f) && sign) ||
3621             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3622         s->float_exception_flags |= float_flag_invalid;
3623         return float64_default_nan(s);
3624     }
3625 
3626     /* frsqrt7(qNaN) = canonical NaN */
3627     if (float64_is_quiet_nan(f, s)) {
3628         return float64_default_nan(s);
3629     }
3630 
3631     /* frsqrt7(+-0) = +-inf */
3632     if (float64_is_zero(f)) {
3633         s->float_exception_flags |= float_flag_divbyzero;
3634         return float64_set_sign(float64_infinity, sign);
3635     }
3636 
3637     /* frsqrt7(+inf) = +0 */
3638     if (float64_is_infinity(f) && !sign) {
3639         return float64_set_sign(float64_zero, sign);
3640     }
3641 
3642     /* +normal, +subnormal */
3643     uint64_t val = frsqrt7(f, exp_size, frac_size);
3644     return make_float64(val);
3645 }
3646 
3647 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3648 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3649 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3650 GEN_VEXT_V_ENV(vfrsqrt7_v_h)
3651 GEN_VEXT_V_ENV(vfrsqrt7_v_w)
3652 GEN_VEXT_V_ENV(vfrsqrt7_v_d)
3653 
3654 /*
3655  * Vector Floating-Point Reciprocal Estimate Instruction
3656  *
3657  * Adapted from riscv-v-spec recip.c:
3658  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3659  */
3660 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3661                       float_status *s)
3662 {
3663     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3664     uint64_t exp = extract64(f, frac_size, exp_size);
3665     uint64_t frac = extract64(f, 0, frac_size);
3666 
3667     const uint8_t lookup_table[] = {
3668         127, 125, 123, 121, 119, 117, 116, 114,
3669         112, 110, 109, 107, 105, 104, 102, 100,
3670         99, 97, 96, 94, 93, 91, 90, 88,
3671         87, 85, 84, 83, 81, 80, 79, 77,
3672         76, 75, 74, 72, 71, 70, 69, 68,
3673         66, 65, 64, 63, 62, 61, 60, 59,
3674         58, 57, 56, 55, 54, 53, 52, 51,
3675         50, 49, 48, 47, 46, 45, 44, 43,
3676         42, 41, 40, 40, 39, 38, 37, 36,
3677         35, 35, 34, 33, 32, 31, 31, 30,
3678         29, 28, 28, 27, 26, 25, 25, 24,
3679         23, 23, 22, 21, 21, 20, 19, 19,
3680         18, 17, 17, 16, 15, 15, 14, 14,
3681         13, 12, 12, 11, 11, 10, 9, 9,
3682         8, 8, 7, 7, 6, 5, 5, 4,
3683         4, 3, 3, 2, 2, 1, 1, 0
3684     };
3685     const int precision = 7;
3686 
3687     if (exp == 0 && frac != 0) { /* subnormal */
3688         /* Normalize the subnormal. */
3689         while (extract64(frac, frac_size - 1, 1) == 0) {
3690             exp--;
3691             frac <<= 1;
3692         }
3693 
3694         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3695 
3696         if (exp != 0 && exp != UINT64_MAX) {
3697             /*
3698              * Overflow to inf or max value of same sign,
3699              * depending on sign and rounding mode.
3700              */
3701             s->float_exception_flags |= (float_flag_inexact |
3702                                          float_flag_overflow);
3703 
3704             if ((s->float_rounding_mode == float_round_to_zero) ||
3705                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3706                 ((s->float_rounding_mode == float_round_up) && sign)) {
3707                 /* Return greatest/negative finite value. */
3708                 return (sign << (exp_size + frac_size)) |
3709                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3710             } else {
3711                 /* Return +-inf. */
3712                 return (sign << (exp_size + frac_size)) |
3713                     MAKE_64BIT_MASK(frac_size, exp_size);
3714             }
3715         }
3716     }
3717 
3718     int idx = frac >> (frac_size - precision);
3719     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3720                             (frac_size - precision);
3721     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3722 
3723     if (out_exp == 0 || out_exp == UINT64_MAX) {
3724         /*
3725          * The result is subnormal, but don't raise the underflow exception,
3726          * because there's no additional loss of precision.
3727          */
3728         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3729         if (out_exp == UINT64_MAX) {
3730             out_frac >>= 1;
3731             out_exp = 0;
3732         }
3733     }
3734 
3735     uint64_t val = 0;
3736     val = deposit64(val, 0, frac_size, out_frac);
3737     val = deposit64(val, frac_size, exp_size, out_exp);
3738     val = deposit64(val, frac_size + exp_size, 1, sign);
3739     return val;
3740 }
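
/*
 * Worked example (float32): for f = 4.0, exp = 129 and frac = 0, so idx = 0,
 * lookup_table[0] = 127, out_frac = 127 << 16 and out_exp works out to
 * 2 * 127 - 1 - 129 = 124, i.e. 2^-3 * (1 + 127/128) = 0.249..., a 7-bit
 * estimate of 1 / 4.
 */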
3741 
3742 static float16 frec7_h(float16 f, float_status *s)
3743 {
3744     int exp_size = 5, frac_size = 10;
3745     bool sign = float16_is_neg(f);
3746 
3747     /* frec7(+-inf) = +-0 */
3748     if (float16_is_infinity(f)) {
3749         return float16_set_sign(float16_zero, sign);
3750     }
3751 
3752     /* frec7(+-0) = +-inf */
3753     if (float16_is_zero(f)) {
3754         s->float_exception_flags |= float_flag_divbyzero;
3755         return float16_set_sign(float16_infinity, sign);
3756     }
3757 
3758     /* frec7(sNaN) = canonical NaN */
3759     if (float16_is_signaling_nan(f, s)) {
3760         s->float_exception_flags |= float_flag_invalid;
3761         return float16_default_nan(s);
3762     }
3763 
3764     /* frec7(qNaN) = canonical NaN */
3765     if (float16_is_quiet_nan(f, s)) {
3766         return float16_default_nan(s);
3767     }
3768 
3769     /* +-normal, +-subnormal */
3770     uint64_t val = frec7(f, exp_size, frac_size, s);
3771     return make_float16(val);
3772 }
3773 
3774 static float32 frec7_s(float32 f, float_status *s)
3775 {
3776     int exp_size = 8, frac_size = 23;
3777     bool sign = float32_is_neg(f);
3778 
3779     /* frec7(+-inf) = +-0 */
3780     if (float32_is_infinity(f)) {
3781         return float32_set_sign(float32_zero, sign);
3782     }
3783 
3784     /* frec7(+-0) = +-inf */
3785     if (float32_is_zero(f)) {
3786         s->float_exception_flags |= float_flag_divbyzero;
3787         return float32_set_sign(float32_infinity, sign);
3788     }
3789 
3790     /* frec7(sNaN) = canonical NaN */
3791     if (float32_is_signaling_nan(f, s)) {
3792         s->float_exception_flags |= float_flag_invalid;
3793         return float32_default_nan(s);
3794     }
3795 
3796     /* frec7(qNaN) = canonical NaN */
3797     if (float32_is_quiet_nan(f, s)) {
3798         return float32_default_nan(s);
3799     }
3800 
3801     /* +-normal, +-subnormal */
3802     uint64_t val = frec7(f, exp_size, frac_size, s);
3803     return make_float32(val);
3804 }
3805 
3806 static float64 frec7_d(float64 f, float_status *s)
3807 {
3808     int exp_size = 11, frac_size = 52;
3809     bool sign = float64_is_neg(f);
3810 
3811     /* frec7(+-inf) = +-0 */
3812     if (float64_is_infinity(f)) {
3813         return float64_set_sign(float64_zero, sign);
3814     }
3815 
3816     /* frec7(+-0) = +-inf */
3817     if (float64_is_zero(f)) {
3818         s->float_exception_flags |= float_flag_divbyzero;
3819         return float64_set_sign(float64_infinity, sign);
3820     }
3821 
3822     /* frec7(sNaN) = canonical NaN */
3823     if (float64_is_signaling_nan(f, s)) {
3824         s->float_exception_flags |= float_flag_invalid;
3825         return float64_default_nan(s);
3826     }
3827 
3828     /* frec7(qNaN) = canonical NaN */
3829     if (float64_is_quiet_nan(f, s)) {
3830         return float64_default_nan(s);
3831     }
3832 
3833     /* +-normal, +-subnormal */
3834     uint64_t val = frec7(f, exp_size, frac_size, s);
3835     return make_float64(val);
3836 }
3837 
3838 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3839 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3840 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3841 GEN_VEXT_V_ENV(vfrec7_v_h)
3842 GEN_VEXT_V_ENV(vfrec7_v_w)
3843 GEN_VEXT_V_ENV(vfrec7_v_d)
3844 
3845 /* Vector Floating-Point MIN/MAX Instructions */
3846 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3847 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3848 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3849 GEN_VEXT_VV_ENV(vfmin_vv_h)
3850 GEN_VEXT_VV_ENV(vfmin_vv_w)
3851 GEN_VEXT_VV_ENV(vfmin_vv_d)
3852 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3853 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3854 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3855 GEN_VEXT_VF(vfmin_vf_h)
3856 GEN_VEXT_VF(vfmin_vf_w)
3857 GEN_VEXT_VF(vfmin_vf_d)
3858 
3859 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3860 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3861 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3862 GEN_VEXT_VV_ENV(vfmax_vv_h)
3863 GEN_VEXT_VV_ENV(vfmax_vv_w)
3864 GEN_VEXT_VV_ENV(vfmax_vv_d)
3865 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3866 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3867 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3868 GEN_VEXT_VF(vfmax_vf_h)
3869 GEN_VEXT_VF(vfmax_vf_w)
3870 GEN_VEXT_VF(vfmax_vf_d)
3871 
3872 /* Vector Floating-Point Sign-Injection Instructions */
3873 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3874 {
3875     return deposit64(b, 0, 15, a);
3876 }
3877 
3878 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3879 {
3880     return deposit64(b, 0, 31, a);
3881 }
3882 
3883 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3884 {
3885     return deposit64(b, 0, 63, a);
3886 }
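
/*
 * deposit64(b, 0, width - 1, a) keeps only the sign bit of b and takes the
 * exponent and mantissa from a, so with OP(s2, s1) the result has the
 * magnitude of vs2 and the sign of vs1 (or of the scalar in the _vf forms).
 * fsgnjn and fsgnjx below do the same with the sign source inverted or
 * XORed with the sign of a.
 */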
3887 
3888 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3889 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3890 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3891 GEN_VEXT_VV_ENV(vfsgnj_vv_h)
3892 GEN_VEXT_VV_ENV(vfsgnj_vv_w)
3893 GEN_VEXT_VV_ENV(vfsgnj_vv_d)
3894 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3895 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3896 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3897 GEN_VEXT_VF(vfsgnj_vf_h)
3898 GEN_VEXT_VF(vfsgnj_vf_w)
3899 GEN_VEXT_VF(vfsgnj_vf_d)
3900 
3901 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3902 {
3903     return deposit64(~b, 0, 15, a);
3904 }
3905 
3906 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3907 {
3908     return deposit64(~b, 0, 31, a);
3909 }
3910 
3911 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3912 {
3913     return deposit64(~b, 0, 63, a);
3914 }
3915 
3916 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
3917 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
3918 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
3919 GEN_VEXT_VV_ENV(vfsgnjn_vv_h)
3920 GEN_VEXT_VV_ENV(vfsgnjn_vv_w)
3921 GEN_VEXT_VV_ENV(vfsgnjn_vv_d)
3922 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
3923 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
3924 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
3925 GEN_VEXT_VF(vfsgnjn_vf_h)
3926 GEN_VEXT_VF(vfsgnjn_vf_w)
3927 GEN_VEXT_VF(vfsgnjn_vf_d)
3928 
3929 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
3930 {
3931     return deposit64(b ^ a, 0, 15, a);
3932 }
3933 
3934 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
3935 {
3936     return deposit64(b ^ a, 0, 31, a);
3937 }
3938 
3939 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
3940 {
3941     return deposit64(b ^ a, 0, 63, a);
3942 }
3943 
3944 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
3945 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
3946 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
3947 GEN_VEXT_VV_ENV(vfsgnjx_vv_h)
3948 GEN_VEXT_VV_ENV(vfsgnjx_vv_w)
3949 GEN_VEXT_VV_ENV(vfsgnjx_vv_d)
3950 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
3951 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
3952 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
3953 GEN_VEXT_VF(vfsgnjx_vf_h)
3954 GEN_VEXT_VF(vfsgnjx_vf_w)
3955 GEN_VEXT_VF(vfsgnjx_vf_d)
3956 
3957 /* Vector Floating-Point Compare Instructions */
3958 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
3959 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
3960                   CPURISCVState *env, uint32_t desc)          \
3961 {                                                             \
3962     uint32_t vm = vext_vm(desc);                              \
3963     uint32_t vl = env->vl;                                    \
3964     uint32_t i;                                               \
3965                                                               \
3966     for (i = env->vstart; i < vl; i++) {                      \
3967         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
3968         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
3969         if (!vm && !vext_elem_mask(v0, i)) {                  \
3970             continue;                                         \
3971         }                                                     \
3972         vext_set_elem_mask(vd, i,                             \
3973                            DO_OP(s2, s1, &env->fp_status));   \
3974     }                                                         \
3975     env->vstart = 0;                                          \
3976 }
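
/*
 * Each compare writes one mask bit per active element; masked-off elements
 * keep their previous value.  vmfeq/vmfne use the quiet compares (only a
 * signaling NaN raises invalid), while vmflt/vmfle/vmfgt/vmfge below use
 * signaling compares, so any NaN operand raises invalid, matching the
 * scalar F/D compare behaviour.
 */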
3977 
3978 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
3979 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
3980 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
3981 
3982 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
3983 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
3984                   CPURISCVState *env, uint32_t desc)                \
3985 {                                                                   \
3986     uint32_t vm = vext_vm(desc);                                    \
3987     uint32_t vl = env->vl;                                          \
3988     uint32_t i;                                                     \
3989                                                                     \
3990     for (i = env->vstart; i < vl; i++) {                            \
3991         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
3992         if (!vm && !vext_elem_mask(v0, i)) {                        \
3993             continue;                                               \
3994         }                                                           \
3995         vext_set_elem_mask(vd, i,                                   \
3996                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
3997     }                                                               \
3998     env->vstart = 0;                                                \
3999 }
4000 
4001 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4002 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4003 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4004 
4005 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4006 {
4007     FloatRelation compare = float16_compare_quiet(a, b, s);
4008     return compare != float_relation_equal;
4009 }
4010 
4011 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4012 {
4013     FloatRelation compare = float32_compare_quiet(a, b, s);
4014     return compare != float_relation_equal;
4015 }
4016 
4017 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4018 {
4019     FloatRelation compare = float64_compare_quiet(a, b, s);
4020     return compare != float_relation_equal;
4021 }
4022 
4023 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4024 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4025 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4026 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4027 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4028 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4029 
4030 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4031 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4032 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4033 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4034 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4035 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4036 
4037 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4038 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4039 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4040 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4041 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4042 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4043 
4044 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4045 {
4046     FloatRelation compare = float16_compare(a, b, s);
4047     return compare == float_relation_greater;
4048 }
4049 
4050 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4051 {
4052     FloatRelation compare = float32_compare(a, b, s);
4053     return compare == float_relation_greater;
4054 }
4055 
4056 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4057 {
4058     FloatRelation compare = float64_compare(a, b, s);
4059     return compare == float_relation_greater;
4060 }
4061 
4062 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4063 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4064 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4065 
4066 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4067 {
4068     FloatRelation compare = float16_compare(a, b, s);
4069     return compare == float_relation_greater ||
4070            compare == float_relation_equal;
4071 }
4072 
4073 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4074 {
4075     FloatRelation compare = float32_compare(a, b, s);
4076     return compare == float_relation_greater ||
4077            compare == float_relation_equal;
4078 }
4079 
4080 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4081 {
4082     FloatRelation compare = float64_compare(a, b, s);
4083     return compare == float_relation_greater ||
4084            compare == float_relation_equal;
4085 }
4086 
4087 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4088 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4089 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4090 
4091 /* Vector Floating-Point Classify Instruction */
4092 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4093 static void do_##NAME(void *vd, void *vs2, int i)      \
4094 {                                                      \
4095     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4096     *((TD *)vd + HD(i)) = OP(s2);                      \
4097 }
4098 
4099 #define GEN_VEXT_V(NAME)                               \
4100 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4101                   CPURISCVState *env, uint32_t desc)   \
4102 {                                                      \
4103     uint32_t vm = vext_vm(desc);                       \
4104     uint32_t vl = env->vl;                             \
4105     uint32_t i;                                        \
4106                                                        \
4107     for (i = env->vstart; i < vl; i++) {               \
4108         if (!vm && !vext_elem_mask(v0, i)) {           \
4109             continue;                                  \
4110         }                                              \
4111         do_##NAME(vd, vs2, i);                         \
4112     }                                                  \
4113     env->vstart = 0;                                   \
4114 }
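
/*
 * fclass returns the standard RISC-V classification mask:
 *   bit 0: -inf        bit 5: +subnormal
 *   bit 1: -normal     bit 6: +normal
 *   bit 2: -subnormal  bit 7: +inf
 *   bit 3: -0          bit 8: signaling NaN
 *   bit 4: +0          bit 9: quiet NaN
 */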
4115 
4116 target_ulong fclass_h(uint64_t frs1)
4117 {
4118     float16 f = frs1;
4119     bool sign = float16_is_neg(f);
4120 
4121     if (float16_is_infinity(f)) {
4122         return sign ? 1 << 0 : 1 << 7;
4123     } else if (float16_is_zero(f)) {
4124         return sign ? 1 << 3 : 1 << 4;
4125     } else if (float16_is_zero_or_denormal(f)) {
4126         return sign ? 1 << 2 : 1 << 5;
4127     } else if (float16_is_any_nan(f)) {
4128         float_status s = { }; /* for snan_bit_is_one */
4129         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4130     } else {
4131         return sign ? 1 << 1 : 1 << 6;
4132     }
4133 }
4134 
4135 target_ulong fclass_s(uint64_t frs1)
4136 {
4137     float32 f = frs1;
4138     bool sign = float32_is_neg(f);
4139 
4140     if (float32_is_infinity(f)) {
4141         return sign ? 1 << 0 : 1 << 7;
4142     } else if (float32_is_zero(f)) {
4143         return sign ? 1 << 3 : 1 << 4;
4144     } else if (float32_is_zero_or_denormal(f)) {
4145         return sign ? 1 << 2 : 1 << 5;
4146     } else if (float32_is_any_nan(f)) {
4147         float_status s = { }; /* for snan_bit_is_one */
4148         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4149     } else {
4150         return sign ? 1 << 1 : 1 << 6;
4151     }
4152 }
4153 
4154 target_ulong fclass_d(uint64_t frs1)
4155 {
4156     float64 f = frs1;
4157     bool sign = float64_is_neg(f);
4158 
4159     if (float64_is_infinity(f)) {
4160         return sign ? 1 << 0 : 1 << 7;
4161     } else if (float64_is_zero(f)) {
4162         return sign ? 1 << 3 : 1 << 4;
4163     } else if (float64_is_zero_or_denormal(f)) {
4164         return sign ? 1 << 2 : 1 << 5;
4165     } else if (float64_is_any_nan(f)) {
4166         float_status s = { }; /* for snan_bit_is_one */
4167         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4168     } else {
4169         return sign ? 1 << 1 : 1 << 6;
4170     }
4171 }
4172 
4173 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4174 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4175 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4176 GEN_VEXT_V(vfclass_v_h)
4177 GEN_VEXT_V(vfclass_v_w)
4178 GEN_VEXT_V(vfclass_v_d)
4179 
4180 /* Vector Floating-Point Merge Instruction */
4181 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4182 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4183                   CPURISCVState *env, uint32_t desc)          \
4184 {                                                             \
4185     uint32_t vm = vext_vm(desc);                              \
4186     uint32_t vl = env->vl;                                    \
4187     uint32_t i;                                               \
4188                                                               \
4189     for (i = env->vstart; i < vl; i++) {                      \
4190         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4191         *((ETYPE *)vd + H(i))                                 \
4192           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4193     }                                                         \
4194     env->vstart = 0;                                          \
4195 }
4196 
4197 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4198 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4199 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
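
/*
 * Illustrative semantics: vfmerge.vfm computes
 *     vd[i] = v0.mask[i] ? f[rs1] : vs2[i]
 * When vm is set the mask test is skipped and every element receives the
 * scalar (the unmasked encoding corresponds to vfmv.v.f in the ISA).
 */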
4200 
4201 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4202 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4203 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4204 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4205 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4206 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h)
4207 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w)
4208 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d)
4209 
4210 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4211 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4212 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4213 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4214 GEN_VEXT_V_ENV(vfcvt_x_f_v_h)
4215 GEN_VEXT_V_ENV(vfcvt_x_f_v_w)
4216 GEN_VEXT_V_ENV(vfcvt_x_f_v_d)
4217 
4218 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4219 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4220 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4221 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4222 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h)
4223 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w)
4224 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d)
4225 
4226 /* vfcvt.f.x.v vd, vs2, vm # Convert signed integer to float. */
4227 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4228 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4229 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4230 GEN_VEXT_V_ENV(vfcvt_f_x_v_h)
4231 GEN_VEXT_V_ENV(vfcvt_f_x_v_w)
4232 GEN_VEXT_V_ENV(vfcvt_f_x_v_d)
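
/*
 * Note: the conversions above take their rounding mode from env->fp_status,
 * which the translator is expected to set up (e.g. from frm) before the
 * call, and they rely on the same softfloat routines as the scalar FCVT
 * helpers, so NaN and out-of-range inputs saturate and raise the invalid
 * flag.
 */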
4233 
4234 /* Widening Floating-Point/Integer Type-Convert Instructions */
4235 /* (TD, T2, TX2) */
4236 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4237 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4238 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4239 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer */
4240 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4241 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4242 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h)
4243 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w)
4244 
4245 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4246 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4247 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4248 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h)
4249 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w)
4250 
4251 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4252 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4253 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4254 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4255 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b)
4256 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h)
4257 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w)
4258 
4259 /* vfwcvt.f.x.v vd, vs2, vm # Convert signed integer to double-width float. */
4260 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4261 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4262 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4263 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b)
4264 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h)
4265 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w)
4266 
4267 /*
4268  * vfwcvt.f.f.v vd, vs2, vm
4269  * Convert single-width float to double-width float.
4270  */
4271 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4272 {
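    /* ieee = true: the input is IEEE half-precision, not the alternative format */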
4273     return float16_to_float32(a, true, s);
4274 }
4275 
4276 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4277 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4278 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h)
4279 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w)
4280 
4281 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4282 /* (TD, T2, TX2) */
4283 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4284 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4285 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4286 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4287 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4288 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4289 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4290 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b)
4291 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h)
4292 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w)
4293 
4294 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4295 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4296 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4297 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4298 GEN_VEXT_V_ENV(vfncvt_x_f_w_b)
4299 GEN_VEXT_V_ENV(vfncvt_x_f_w_h)
4300 GEN_VEXT_V_ENV(vfncvt_x_f_w_w)
4301 
4302 /* vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float. */
4303 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4304 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4305 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h)
4306 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w)
4307 
4308 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width signed integer to float. */
4309 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4310 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4311 GEN_VEXT_V_ENV(vfncvt_f_x_w_h)
4312 GEN_VEXT_V_ENV(vfncvt_f_x_w_w)
4313 
4314 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4315 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4316 {
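    /* ieee = true: produce IEEE half-precision, not the alternative format */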
4317     return float32_to_float16(a, true, s);
4318 }
4319 
4320 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4321 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4322 GEN_VEXT_V_ENV(vfncvt_f_f_w_h)
4323 GEN_VEXT_V_ENV(vfncvt_f_f_w_w)
4324 
4325 /*
4326  *** Vector Reduction Operations
4327  */
4328 /* Vector Single-Width Integer Reduction Instructions */
4329 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4330 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4331         void *vs2, CPURISCVState *env, uint32_t desc)     \
4332 {                                                         \
4333     uint32_t vm = vext_vm(desc);                          \
4334     uint32_t vl = env->vl;                                \
4335     uint32_t i;                                           \
4336     TD s1 = *((TD *)vs1 + HD(0));                         \
4337                                                           \
4338     for (i = env->vstart; i < vl; i++) {                  \
4339         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4340         if (!vm && !vext_elem_mask(v0, i)) {              \
4341             continue;                                     \
4342         }                                                 \
4343         s1 = OP(s1, (TD)s2);                              \
4344     }                                                     \
4345     *((TD *)vd + HD(0)) = s1;                             \
4346     env->vstart = 0;                                      \
4347 }
4348 
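/*
 * Each instantiation below emits one reduction helper.  As an illustrative
 * sketch, GEN_VEXT_RED(vredsum_vs_b, int8_t, int8_t, H1, H1, DO_ADD) yields
 * a helper computing
 *     vd[0] = vs1[0] + vs2[0] + vs2[1] + ... + vs2[vl - 1]
 * where masked-off vs2 elements are skipped and the remaining elements of
 * vd are left untouched.
 */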
4349 /* vd[0] = sum(vs1[0], vs2[*]) */
4350 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4351 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4352 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4353 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4354 
4355 /* vd[0] = maxu(vs1[0], vs2[*]) */
4356 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4357 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4358 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4359 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4360 
4361 /* vd[0] = max(vs1[0], vs2[*]) */
4362 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4363 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4364 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4365 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4366 
4367 /* vd[0] = minu(vs1[0], vs2[*]) */
4368 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4369 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4370 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4371 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4372 
4373 /* vd[0] = min(vs1[0], vs2[*]) */
4374 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4375 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4376 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4377 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4378 
4379 /* vd[0] = and(vs1[0], vs2[*]) */
4380 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4381 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4382 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4383 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4384 
4385 /* vd[0] = or(vs1[0], vs2[*]) */
4386 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4387 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4388 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4389 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4390 
4391 /* vd[0] = xor(vs1[0], vs2[*]) */
4392 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4393 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4394 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4395 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4396 
4397 /* Vector Widening Integer Reduction Instructions */
4398 /* Signed sum reduction into double-width accumulator */
4399 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4400 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4401 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4402 
4403 /* Unsigned sum reduction into double-width accumulator */
4404 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4405 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4406 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4407 
4408 /* Vector Single-Width Floating-Point Reduction Instructions */
4409 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4410 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4411                   void *vs2, CPURISCVState *env,           \
4412                   uint32_t desc)                           \
4413 {                                                          \
4414     uint32_t vm = vext_vm(desc);                           \
4415     uint32_t vl = env->vl;                                 \
4416     uint32_t i;                                            \
4417     TD s1 = *((TD *)vs1 + HD(0));                          \
4418                                                            \
4419     for (i = env->vstart; i < vl; i++) {                   \
4420         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4421         if (!vm && !vext_elem_mask(v0, i)) {               \
4422             continue;                                      \
4423         }                                                  \
4424         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4425     }                                                      \
4426     *((TD *)vd + HD(0)) = s1;                              \
4427     env->vstart = 0;                                       \
4428 }
4429 
4430 /* Unordered sum */
4431 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4432 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4433 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4434 
4435 /* Maximum value */
4436 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4437 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4438 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4439 
4440 /* Minimum value */
4441 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4442 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4443 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
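
/*
 * These reductions use softfloat's *_minimum_number/*_maximum_number, i.e.
 * the IEEE 754-2019 minimumNumber/maximumNumber operations: a NaN operand
 * is ignored when the other operand is a number, while a signaling NaN
 * still raises the invalid flag.
 */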
4444 
4445 /* Vector Widening Floating-Point Reduction Instructions */
4446 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4447 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4448                             void *vs2, CPURISCVState *env, uint32_t desc)
4449 {
4450     uint32_t vm = vext_vm(desc);
4451     uint32_t vl = env->vl;
4452     uint32_t i;
4453     uint32_t s1 = *((uint32_t *)vs1 + H4(0));
4454 
4455     for (i = env->vstart; i < vl; i++) {
4456         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4457         if (!vm && !vext_elem_mask(v0, i)) {
4458             continue;
4459         }
4460         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4461                          &env->fp_status);
4462     }
4463     *((uint32_t *)vd + H4(0)) = s1;
4464     env->vstart = 0;
4465 }
4466 
4467 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4468                             void *vs2, CPURISCVState *env, uint32_t desc)
4469 {
4470     uint32_t vm = vext_vm(desc);
4471     uint32_t vl = env->vl;
4472     uint32_t i;
4473     uint64_t s1 = *((uint64_t *)vs1);
4474 
4475     for (i = env->vstart; i < vl; i++) {
4476         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4477         if (!vm && !vext_elem_mask(v0, i)) {
4478             continue;
4479         }
4480         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4481                          &env->fp_status);
4482     }
4483     *((uint64_t *)vd) = s1;
4484     env->vstart = 0;
4485 }
4486 
4487 /*
4488  *** Vector Mask Operations
4489  */
4490 /* Vector Mask-Register Logical Instructions */
4491 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4492 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4493                   void *vs2, CPURISCVState *env,          \
4494                   uint32_t desc)                          \
4495 {                                                         \
4496     uint32_t vl = env->vl;                                \
4497     uint32_t i;                                           \
4498     int a, b;                                             \
4499                                                           \
4500     for (i = env->vstart; i < vl; i++) {                  \
4501         a = vext_elem_mask(vs1, i);                       \
4502         b = vext_elem_mask(vs2, i);                       \
4503         vext_set_elem_mask(vd, i, OP(b, a));              \
4504     }                                                     \
4505     env->vstart = 0;                                      \
4506 }
4507 
4508 #define DO_NAND(N, M)  (!(N & M))
4509 #define DO_ANDNOT(N, M)  (N & !M)
4510 #define DO_NOR(N, M)  (!(N | M))
4511 #define DO_ORNOT(N, M)  (N | !M)
4512 #define DO_XNOR(N, M)  (!(N ^ M))
4513 
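/*
 * The operands here are single mask bits (0 or 1) as returned by
 * vext_elem_mask(), so the logical operators above act as one-bit bitwise
 * operations.
 */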
4514 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4515 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4516 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4517 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4518 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4519 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4520 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4521 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4522 
4523 /* Vector count population in mask vcpop */
4524 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4525                              uint32_t desc)
4526 {
4527     target_ulong cnt = 0;
4528     uint32_t vm = vext_vm(desc);
4529     uint32_t vl = env->vl;
4530     int i;
4531 
4532     for (i = env->vstart; i < vl; i++) {
4533         if (vm || vext_elem_mask(v0, i)) {
4534             if (vext_elem_mask(vs2, i)) {
4535                 cnt++;
4536             }
4537         }
4538     }
4539     env->vstart = 0;
4540     return cnt;
4541 }
4542 
4543 /* vfirst find-first-set mask bit */
4544 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4545                               uint32_t desc)
4546 {
4547     uint32_t vm = vext_vm(desc);
4548     uint32_t vl = env->vl;
4549     int i;
4550 
4551     for (i = env->vstart; i < vl; i++) {
4552         if (vm || vext_elem_mask(v0, i)) {
4553             if (vext_elem_mask(vs2, i)) {
4554                 return i;
4555             }
4556         }
4557     }
4558     env->vstart = 0;
4559     return -1LL;
4560 }
4561 
4562 enum set_mask_type {
4563     ONLY_FIRST = 1,
4564     INCLUDE_FIRST,
4565     BEFORE_FIRST,
4566 };
4567 
4568 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4569                    uint32_t desc, enum set_mask_type type)
4570 {
4571     uint32_t vm = vext_vm(desc);
4572     uint32_t vl = env->vl;
4573     int i;
4574     bool first_mask_bit = false;
4575 
4576     for (i = env->vstart; i < vl; i++) {
4577         if (!vm && !vext_elem_mask(v0, i)) {
4578             continue;
4579         }
4580         /* write a zero to all following active elements */
4581         if (first_mask_bit) {
4582             vext_set_elem_mask(vd, i, 0);
4583             continue;
4584         }
4585         if (vext_elem_mask(vs2, i)) {
4586             first_mask_bit = true;
4587             if (type == BEFORE_FIRST) {
4588                 vext_set_elem_mask(vd, i, 0);
4589             } else {
4590                 vext_set_elem_mask(vd, i, 1);
4591             }
4592         } else {
4593             if (type == ONLY_FIRST) {
4594                 vext_set_elem_mask(vd, i, 0);
4595             } else {
4596                 vext_set_elem_mask(vd, i, 1);
4597             }
4598         }
4599     }
4600     env->vstart = 0;
4601 }
4602 
4603 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4604                      uint32_t desc)
4605 {
4606     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4607 }
4608 
4609 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4610                      uint32_t desc)
4611 {
4612     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4613 }
4614 
4615 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4616                      uint32_t desc)
4617 {
4618     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4619 }
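
/*
 * Illustrative example (all elements active): for a source mask
 *     element:  0 1 2 3 4 5
 *     vs2:      0 0 1 0 1 0
 * the first set bit is element 2, so the results are
 *     vmsbf.m:  1 1 0 0 0 0    (set before first)
 *     vmsif.m:  1 1 1 0 0 0    (set including first)
 *     vmsof.m:  0 0 1 0 0 0    (set only first)
 */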
4620 
4621 /* Vector Iota Instruction */
4622 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4623 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4624                   uint32_t desc)                                          \
4625 {                                                                         \
4626     uint32_t vm = vext_vm(desc);                                          \
4627     uint32_t vl = env->vl;                                                \
4628     uint32_t sum = 0;                                                     \
4629     int i;                                                                \
4630                                                                           \
4631     for (i = env->vstart; i < vl; i++) {                                  \
4632         if (!vm && !vext_elem_mask(v0, i)) {                              \
4633             continue;                                                     \
4634         }                                                                 \
4635         *((ETYPE *)vd + H(i)) = sum;                                      \
4636         if (vext_elem_mask(vs2, i)) {                                     \
4637             sum++;                                                        \
4638         }                                                                 \
4639     }                                                                     \
4640     env->vstart = 0;                                                      \
4641 }
4642 
4643 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4644 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4645 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4646 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
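
/*
 * Illustrative example (all elements active): viota writes the running
 * count of set vs2 mask bits strictly before each element, e.g.
 *     vs2 mask:  1 0 0 1 0
 *     vd:        0 1 1 1 2
 */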
4647 
4648 /* Vector Element Index Instruction */
4649 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4650 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4651 {                                                                         \
4652     uint32_t vm = vext_vm(desc);                                          \
4653     uint32_t vl = env->vl;                                                \
4654     int i;                                                                \
4655                                                                           \
4656     for (i = env->vstart; i < vl; i++) {                                  \
4657         if (!vm && !vext_elem_mask(v0, i)) {                              \
4658             continue;                                                     \
4659         }                                                                 \
4660         *((ETYPE *)vd + H(i)) = i;                                        \
4661     }                                                                     \
4662     env->vstart = 0;                                                      \
4663 }
4664 
4665 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4666 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4667 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4668 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4669 
4670 /*
4671  *** Vector Permutation Instructions
4672  */
4673 
4674 /* Vector Slide Instructions */
4675 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4676 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4677                   CPURISCVState *env, uint32_t desc)                      \
4678 {                                                                         \
4679     uint32_t vm = vext_vm(desc);                                          \
4680     uint32_t vl = env->vl;                                                \
4681     target_ulong offset = s1, i_min, i;                                   \
4682                                                                           \
4683     i_min = MAX(env->vstart, offset);                                     \
4684     for (i = i_min; i < vl; i++) {                                        \
4685         if (!vm && !vext_elem_mask(v0, i)) {                              \
4686             continue;                                                     \
4687         }                                                                 \
4688         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4689     }                                                                     \
4690 }
4691 
4692 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4693 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4694 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4695 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4696 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
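
/*
 * Illustrative example: with rs1 = 2 and vl = 8, active destination
 * elements 2..7 receive vs2[0..5]; vd[0] and vd[1] are never written, since
 * the helper's loop starts at max(vstart, offset).
 */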
4697 
4698 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4699 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4700                   CPURISCVState *env, uint32_t desc)                      \
4701 {                                                                         \
4702     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4703     uint32_t vm = vext_vm(desc);                                          \
4704     uint32_t vl = env->vl;                                                \
4705     target_ulong i_max, i;                                                \
4706                                                                           \
4707     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4708     for (i = env->vstart; i < i_max; ++i) {                               \
4709         if (vm || vext_elem_mask(v0, i)) {                                \
4710             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4711         }                                                                 \
4712     }                                                                     \
4713                                                                           \
4714     for (i = i_max; i < vl; ++i) {                                        \
4715         if (vm || vext_elem_mask(v0, i)) {                                \
4716             *((ETYPE *)vd + H(i)) = 0;                                    \
4717         }                                                                 \
4718     }                                                                     \
4719                                                                           \
4720     env->vstart = 0;                                                      \
4721 }
4722 
4723 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4724 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4725 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4726 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4727 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
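
/*
 * Illustrative example: with rs1 = 2, active element i receives vs2[i + 2]
 * while i + 2 < VLMAX, and 0 once the source index runs past the end of the
 * register group.
 */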
4728 
4729 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
4730 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4731                      void *vs2, CPURISCVState *env, uint32_t desc)          \
4732 {                                                                           \
4733     typedef uint##BITWIDTH##_t ETYPE;                                       \
4734     uint32_t vm = vext_vm(desc);                                            \
4735     uint32_t vl = env->vl;                                                  \
4736     uint32_t i;                                                             \
4737                                                                             \
4738     for (i = env->vstart; i < vl; i++) {                                    \
4739         if (!vm && !vext_elem_mask(v0, i)) {                                \
4740             continue;                                                       \
4741         }                                                                   \
4742         if (i == 0) {                                                       \
4743             *((ETYPE *)vd + H(i)) = s1;                                     \
4744         } else {                                                            \
4745             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4746         }                                                                   \
4747     }                                                                       \
4748     env->vstart = 0;                                                        \
4749 }
4750 
4751 GEN_VEXT_VSLIE1UP(8,  H1)
4752 GEN_VEXT_VSLIE1UP(16, H2)
4753 GEN_VEXT_VSLIE1UP(32, H4)
4754 GEN_VEXT_VSLIE1UP(64, H8)
4755 
4756 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4757 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4758                   CPURISCVState *env, uint32_t desc)              \
4759 {                                                                 \
4760     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4761 }
4762 
4763 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4764 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4765 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4766 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4767 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4768 
4769 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4770 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4771                        void *vs2, CPURISCVState *env, uint32_t desc)          \
4772 {                                                                             \
4773     typedef uint##BITWIDTH##_t ETYPE;                                         \
4774     uint32_t vm = vext_vm(desc);                                              \
4775     uint32_t vl = env->vl;                                                    \
4776     uint32_t i;                                                               \
4777                                                                               \
4778     for (i = env->vstart; i < vl; i++) {                                      \
4779         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4780             continue;                                                         \
4781         }                                                                     \
4782         if (i == vl - 1) {                                                    \
4783             *((ETYPE *)vd + H(i)) = s1;                                       \
4784         } else {                                                              \
4785             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4786         }                                                                     \
4787     }                                                                         \
4788     env->vstart = 0;                                                          \
4789 }
4790 
4791 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4792 GEN_VEXT_VSLIDE1DOWN(16, H2)
4793 GEN_VEXT_VSLIDE1DOWN(32, H4)
4794 GEN_VEXT_VSLIDE1DOWN(64, H8)
4795 
4796 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4797 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4798                   CPURISCVState *env, uint32_t desc)              \
4799 {                                                                 \
4800     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4801 }
4802 
4803 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4804 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4805 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4806 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4807 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4808 
4809 /* Vector Floating-Point Slide Instructions */
4810 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4811 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4812                   CPURISCVState *env, uint32_t desc)          \
4813 {                                                             \
4814     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4815 }
4816 
4817 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4818 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4819 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4820 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4821 
4822 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4823 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4824                   CPURISCVState *env, uint32_t desc)          \
4825 {                                                             \
4826     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4827 }
4828 
4829 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4830 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4831 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4832 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4833 
4834 /* Vector Register Gather Instruction */
4835 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4836 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4837                   CPURISCVState *env, uint32_t desc)                      \
4838 {                                                                         \
4839     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4840     uint32_t vm = vext_vm(desc);                                          \
4841     uint32_t vl = env->vl;                                                \
4842     uint64_t index;                                                       \
4843     uint32_t i;                                                           \
4844                                                                           \
4845     for (i = env->vstart; i < vl; i++) {                                  \
4846         if (!vm && !vext_elem_mask(v0, i)) {                              \
4847             continue;                                                     \
4848         }                                                                 \
4849         index = *((TS1 *)vs1 + HS1(i));                                   \
4850         if (index >= vlmax) {                                             \
4851             *((TS2 *)vd + HS2(i)) = 0;                                    \
4852         } else {                                                          \
4853             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
4854         }                                                                 \
4855     }                                                                     \
4856     env->vstart = 0;                                                      \
4857 }
4858 
4859 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
4860 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
4861 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
4862 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
4863 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
4864 
4865 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
4866 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
4867 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
4868 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
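
/*
 * vrgatherei16.vv always reads its indices as 16-bit elements (EEW=16)
 * regardless of the data SEW, hence the fixed uint16_t/H2 index arguments
 * above.
 */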
4869 
4870 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
4871 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4872                   CPURISCVState *env, uint32_t desc)                      \
4873 {                                                                         \
4874     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4875     uint32_t vm = vext_vm(desc);                                          \
4876     uint32_t vl = env->vl;                                                \
4877     uint64_t index = s1;                                                  \
4878     uint32_t i;                                                           \
4879                                                                           \
4880     for (i = env->vstart; i < vl; i++) {                                  \
4881         if (!vm && !vext_elem_mask(v0, i)) {                              \
4882             continue;                                                     \
4883         }                                                                 \
4884         if (index >= vlmax) {                                             \
4885             *((ETYPE *)vd + H(i)) = 0;                                    \
4886         } else {                                                          \
4887             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
4888         }                                                                 \
4889     }                                                                     \
4890     env->vstart = 0;                                                      \
4891 }
4892 
4893 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
4894 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
4895 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
4896 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
4897 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
4898 
4899 /* Vector Compress Instruction */
4900 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
4901 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4902                   CPURISCVState *env, uint32_t desc)                      \
4903 {                                                                         \
4904     uint32_t vl = env->vl;                                                \
4905     uint32_t num = 0, i;                                                  \
4906                                                                           \
4907     for (i = env->vstart; i < vl; i++) {                                  \
4908         if (!vext_elem_mask(vs1, i)) {                                    \
4909             continue;                                                     \
4910         }                                                                 \
4911         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
4912         num++;                                                            \
4913     }                                                                     \
4914     env->vstart = 0;                                                      \
4915 }
4916 
4917 /* Compress vs2 elements selected by the vs1 mask into the low elements of vd */
4918 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
4919 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
4920 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
4921 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
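
/*
 * Illustrative example: with a vs1 mask (element 0 first) of 0 1 0 0 1 0 1 1
 * and vs2 = {1, 2, 3, 4, 5, 6, 7, 8}, the helper packs {2, 5, 7, 8} into
 * vd[0..3]; elements of vd beyond the packed count are not written here.
 */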
4922 
4923 /* Vector Whole Register Move */
4924 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
4925 {
4926     /* EEW = SEW */
4927     uint32_t maxsz = simd_maxsz(desc);
4928     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
4929     uint32_t startb = env->vstart * sewb;
4930     uint32_t i = startb;
4931 
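    /*
     * Bulk-copy the remaining bytes of the register group in one go,
     * resuming at the byte offset that corresponds to vstart.
     */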
4932     memcpy((uint8_t *)vd + H1(i),
4933            (uint8_t *)vs2 + H1(i),
4934            maxsz - startb);
4935 
4936     env->vstart = 0;
4937 }
4938 
4939 /* Vector Integer Extension */
4940 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
4941 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
4942                   CPURISCVState *env, uint32_t desc)             \
4943 {                                                                \
4944     uint32_t vl = env->vl;                                       \
4945     uint32_t vm = vext_vm(desc);                                 \
4946     uint32_t i;                                                  \
4947                                                                  \
4948     for (i = env->vstart; i < vl; i++) {                         \
4949         if (!vm && !vext_elem_mask(v0, i)) {                     \
4950             continue;                                            \
4951         }                                                        \
4952         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
4953     }                                                            \
4954     env->vstart = 0;                                             \
4955 }
4956 
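/*
 * vzext.vf2/vf4/vf8 zero-extend a source element of width SEW/2, SEW/4 or
 * SEW/8 to a full SEW destination element; the vsext.* forms sign-extend.
 */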
4957 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
4958 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
4959 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
4960 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
4961 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
4962 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
4963 
4964 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
4965 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
4966 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
4967 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
4968 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
4969 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
4970