xref: /openbmc/qemu/target/riscv/vector_helper.c (revision b9c0a2e0)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg-gvec-desc.h"
30 #include "internals.h"
31 #include "vector_internals.h"
32 #include <math.h>
33 
34 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
35                             target_ulong s2)
36 {
37     int vlmax, vl;
38     RISCVCPU *cpu = env_archcpu(env);
39     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
40     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
41     uint16_t sew = 8 << vsew;
42     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
43     int xlen = riscv_cpu_xlen(env);
44     bool vill = (s2 >> (xlen - 1)) & 0x1;
45     target_ulong reserved = s2 &
46                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
47                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
48     uint16_t vlen = cpu->cfg.vlenb << 3;
49     int8_t lmul;
50 
51     if (vlmul & 4) {
52         /*
53          * Fractional LMUL, check:
54          *
55          * VLEN * LMUL >= SEW
56          * VLEN >> (8 - lmul) >= sew
57          * (vlenb << 3) >> (8 - lmul) >= sew
58          */
59         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
60             vill = true;
61         }
62     }
63 
64     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65         /* only set vill bit. */
66         env->vill = 1;
67         env->vtype = 0;
68         env->vl = 0;
69         env->vstart = 0;
70         return 0;
71     }
72 
73     /* lmul encoded as in DisasContext::lmul */
74     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
76     if (s1 <= vlmax) {
77         vl = s1;
78     } else {
79         vl = vlmax;
80     }
81     env->vl = vl;
82     env->vtype = s2;
83     env->vstart = 0;
84     env->vill = 0;
85     return vl;
86 }
87 
/*
 * Return VLMAX, the maximum number of elements that can be operated on.
 *
 * desc:     SIMD descriptor; its maxsz field carries vlenb (VLEN in bytes).
 * log2_esz: log2 of element size in bytes.
 */
static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
{
    /*
     * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
     * so vlen in bytes (vlenb) is encoded as maxsz.
     */
    const uint32_t vlenb = simd_maxsz(desc);
    /* VLMAX = vlenb * 2^(lmul - log2_esz); the exponent may be negative. */
    const int shift = vext_lmul(desc) - log2_esz;

    if (shift >= 0) {
        return vlenb << shift;
    }
    return vlenb >> -shift;
}
105 
106 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
107 {
108     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
109 }
110 
111 /*
112  * This function checks watchpoint before real load operation.
113  *
114  * In system mode, the TLB API probe_access is enough for watchpoint check.
115  * In user mode, there is no watchpoint support now.
116  *
117  * It will trigger an exception if there is no mapping in TLB
118  * and page table walk can't fill the TLB entry. Then the guest
119  * software can return here after process the exception or never return.
120  */
121 static void probe_pages(CPURISCVState *env, target_ulong addr,
122                         target_ulong len, uintptr_t ra,
123                         MMUAccessType access_type)
124 {
125     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
126     target_ulong curlen = MIN(pagelen, len);
127     int mmu_index = riscv_env_mmu_index(env, false);
128 
129     probe_access(env, adjust_addr(env, addr), curlen, access_type,
130                  mmu_index, ra);
131     if (len > curlen) {
132         addr += curlen;
133         curlen = len - curlen;
134         probe_access(env, adjust_addr(env, addr), curlen, access_type,
135                      mmu_index, ra);
136     }
137 }
138 
/*
 * Write mask bit number 'index' of the mask register at v0. The register
 * is treated as an array of 64-bit words; one bit of 'value' is deposited
 * at the selected position.
 */
static inline void vext_set_elem_mask(void *v0, int index,
                                      uint8_t value)
{
    uint64_t *word = (uint64_t *)v0 + index / 64;

    *word = deposit64(*word, index % 64, 1, value);
}
147 
/*
 * Element operations for load and store.
 *
 * A vext_ldst_elem_fn transfers one element between guest memory at addr
 * and element idx of the register data at vd. retaddr is forwarded to the
 * cpu_*_data_ra() accessor used by the generated implementations.
 */
typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
                               uint32_t idx, void *vd, uintptr_t retaddr);
151 
152 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
153 static void NAME(CPURISCVState *env, abi_ptr addr,         \
154                  uint32_t idx, void *vd, uintptr_t retaddr)\
155 {                                                          \
156     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
157     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
158 }                                                          \
159 
160 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
161 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
162 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
163 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
164 
/*
 * Generate a store-element callback NAME (signature of vext_ldst_elem_fn).
 *
 * The generated function reads element H(idx) of vd as ETYPE and writes it
 * to guest address addr with cpu_<STSUF>_data_ra().  H is the element-index
 * adjustment macro for the element width (H1/H2/H4/H8, defined elsewhere).
 */
#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE data = *((ETYPE *)vd + H(idx));                  \
    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
}

/* One store callback per element width: 8, 16, 32 and 64 bits. */
GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
177 
178 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
179                                    uint32_t desc, uint32_t nf,
180                                    uint32_t esz, uint32_t max_elems)
181 {
182     uint32_t vta = vext_vta(desc);
183     int k;
184 
185     if (vta == 0) {
186         return;
187     }
188 
189     for (k = 0; k < nf; ++k) {
190         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
191                           (k * max_elems + max_elems) * esz);
192     }
193 }
194 
195 /*
196  * stride: access vector element from strided memory
197  */
198 static void
199 vext_ldst_stride(void *vd, void *v0, target_ulong base,
200                  target_ulong stride, CPURISCVState *env,
201                  uint32_t desc, uint32_t vm,
202                  vext_ldst_elem_fn *ldst_elem,
203                  uint32_t log2_esz, uintptr_t ra)
204 {
205     uint32_t i, k;
206     uint32_t nf = vext_nf(desc);
207     uint32_t max_elems = vext_max_elems(desc, log2_esz);
208     uint32_t esz = 1 << log2_esz;
209     uint32_t vma = vext_vma(desc);
210 
211     VSTART_CHECK_EARLY_EXIT(env);
212 
213     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
214         k = 0;
215         while (k < nf) {
216             if (!vm && !vext_elem_mask(v0, i)) {
217                 /* set masked-off elements to 1s */
218                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
219                                   (i + k * max_elems + 1) * esz);
220                 k++;
221                 continue;
222             }
223             target_ulong addr = base + stride * i + (k << log2_esz);
224             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
225             k++;
226         }
227     }
228     env->vstart = 0;
229 
230     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
231 }
232 
233 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
234 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
235                   target_ulong stride, CPURISCVState *env,              \
236                   uint32_t desc)                                        \
237 {                                                                       \
238     uint32_t vm = vext_vm(desc);                                        \
239     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
240                      ctzl(sizeof(ETYPE)), GETPC());                     \
241 }
242 
243 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
244 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
245 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
246 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
247 
248 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
249 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
250                   target_ulong stride, CPURISCVState *env,              \
251                   uint32_t desc)                                        \
252 {                                                                       \
253     uint32_t vm = vext_vm(desc);                                        \
254     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
255                      ctzl(sizeof(ETYPE)), GETPC());                     \
256 }
257 
258 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
259 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
260 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
261 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
262 
263 /*
264  * unit-stride: access elements stored contiguously in memory
265  */
266 
267 /* unmasked unit-stride load and store operation */
268 static void
269 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
270              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
271              uintptr_t ra)
272 {
273     uint32_t i, k;
274     uint32_t nf = vext_nf(desc);
275     uint32_t max_elems = vext_max_elems(desc, log2_esz);
276     uint32_t esz = 1 << log2_esz;
277 
278     VSTART_CHECK_EARLY_EXIT(env);
279 
280     /* load bytes from guest memory */
281     for (i = env->vstart; i < evl; env->vstart = ++i) {
282         k = 0;
283         while (k < nf) {
284             target_ulong addr = base + ((i * nf + k) << log2_esz);
285             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
286             k++;
287         }
288     }
289     env->vstart = 0;
290 
291     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
292 }
293 
/*
 * masked unit-stride load and store operation will be a special case of
 * stride, stride = NF * sizeof (ETYPE)
 */

/*
 * Generate two unit-stride load helpers for element type ETYPE:
 *
 *  - NAME##_mask: the masked form, implemented through vext_ldst_stride
 *    with stride = nf * sizeof(ETYPE) and vm passed as false so that the
 *    mask in v0 is honoured.
 *  - NAME: the unmasked form, implemented through vext_ldst_us with the
 *    effective length env->vl (v0 is not used).
 */
#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
                         CPURISCVState *env, uint32_t desc)             \
{                                                                       \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}                                                                       \
                                                                        \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  CPURISCVState *env, uint32_t desc)                    \
{                                                                       \
    vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
}

GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
319 
/*
 * Generate two unit-stride store helpers for element type ETYPE:
 *
 *  - NAME##_mask: the masked form, implemented through vext_ldst_stride
 *    with stride = nf * sizeof(ETYPE) and vm passed as false so that the
 *    mask in v0 is honoured.
 *  - NAME: the unmasked form, implemented through vext_ldst_us with the
 *    effective length env->vl (v0 is not used).
 */
#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
                         CPURISCVState *env, uint32_t desc)              \
{                                                                        \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                      \
}                                                                        \
                                                                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
}

GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
340 
341 /*
342  * unit stride mask load and store, EEW = 1
343  */
344 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
345                     CPURISCVState *env, uint32_t desc)
346 {
347     /* evl = ceil(vl/8) */
348     uint8_t evl = (env->vl + 7) >> 3;
349     vext_ldst_us(vd, base, env, desc, lde_b,
350                  0, evl, GETPC());
351 }
352 
353 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
354                     CPURISCVState *env, uint32_t desc)
355 {
356     /* evl = ceil(vl/8) */
357     uint8_t evl = (env->vl + 7) >> 3;
358     vext_ldst_us(vd, base, env, desc, ste_b,
359                  0, evl, GETPC());
360 }
361 
/*
 * index: access vector element from indexed memory
 */

/*
 * A vext_get_index_addr computes the guest address of element idx for an
 * indexed access: base plus the idx-th offset held in the index vector vs2.
 */
typedef target_ulong vext_get_index_addr(target_ulong base,
        uint32_t idx, void *vs2);

/*
 * Generate an index-address function NAME that reads offset H(idx) of vs2
 * as ETYPE and adds it to base.  All instantiations below use unsigned
 * ETYPEs, so the offset is zero-extended by the addition.
 */
#define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
static target_ulong NAME(target_ulong base,            \
                         uint32_t idx, void *vs2)      \
{                                                      \
    return (base + *((ETYPE *)vs2 + H(idx)));          \
}

/* One address function per index width: 8, 16, 32 and 64 bits. */
GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
379 
380 static inline void
381 vext_ldst_index(void *vd, void *v0, target_ulong base,
382                 void *vs2, CPURISCVState *env, uint32_t desc,
383                 vext_get_index_addr get_index_addr,
384                 vext_ldst_elem_fn *ldst_elem,
385                 uint32_t log2_esz, uintptr_t ra)
386 {
387     uint32_t i, k;
388     uint32_t nf = vext_nf(desc);
389     uint32_t vm = vext_vm(desc);
390     uint32_t max_elems = vext_max_elems(desc, log2_esz);
391     uint32_t esz = 1 << log2_esz;
392     uint32_t vma = vext_vma(desc);
393 
394     VSTART_CHECK_EARLY_EXIT(env);
395 
396     /* load bytes from guest memory */
397     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
398         k = 0;
399         while (k < nf) {
400             if (!vm && !vext_elem_mask(v0, i)) {
401                 /* set masked-off elements to 1s */
402                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
403                                   (i + k * max_elems + 1) * esz);
404                 k++;
405                 continue;
406             }
407             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
408             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
409             k++;
410         }
411     }
412     env->vstart = 0;
413 
414     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
415 }
416 
/*
 * Generate the indexed-load helper NAME.  ETYPE is the data element type
 * (it determines log2_esz), INDEX_FN computes the element address from the
 * index vector vs2, and LOAD_FN loads one data element.
 */
#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
                  void *vs2, CPURISCVState *env, uint32_t desc)            \
{                                                                          \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
                    LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
}

/*
 * Helpers are named vlxei<index width>_<data width>_v: the first number
 * selects the index function, the second the data type and load callback.
 */
GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
441 
/*
 * Generate the indexed-store helper NAME.  ETYPE is the data element type
 * (it determines log2_esz), INDEX_FN computes the element address from the
 * index vector vs2, and STORE_FN stores one data element.
 */
#define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
                  void *vs2, CPURISCVState *env, uint32_t desc)  \
{                                                                \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
                    STORE_FN, ctzl(sizeof(ETYPE)),               \
                    GETPC());                                    \
}

/*
 * Helpers are named vsxei<index width>_<data width>_v: the first number
 * selects the index function, the second the data type and store callback.
 */
GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
467 
/*
 * unit-stride fault-only-first load instructions
 *
 * The load runs in two phases:
 *
 *  1. Probe: every active element in [vstart, vl) is probed.  Element 0 is
 *     probed with probe_pages, which may raise an exception.  Any later
 *     element is probed page by page with probe_access_flags in non-fault
 *     mode; the first one that reports a flag other than TLB_WATCHPOINT
 *     stops the probing and records its index as the new vl.
 *  2. Load: env->vl is trimmed if the probe recorded a new length, then all
 *     elements in [vstart, vl) are loaded (or, if masked off, handed to
 *     vext_set_elems_1s with the mask-agnostic setting).
 *
 * Finally vstart is reset and the tail is processed according to vta.
 */
static inline void
vext_ldff(void *vd, void *v0, target_ulong base,
          CPURISCVState *env, uint32_t desc,
          vext_ldst_elem_fn *ldst_elem,
          uint32_t log2_esz, uintptr_t ra)
{
    /* vl stays 0 unless the probe phase finds a reason to shorten env->vl. */
    uint32_t i, k, vl = 0;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);
    target_ulong addr, offset, remain;
    int mmu_index = riscv_env_mmu_index(env, false);

    VSTART_CHECK_EARLY_EXIT(env);

    /* probe every access */
    for (i = env->vstart; i < env->vl; i++) {
        /* Masked-off elements are never accessed, so they are not probed. */
        if (!vm && !vext_elem_mask(v0, i)) {
            continue;
        }
        /* Address of the first field of element i (nf fields per element). */
        addr = adjust_addr(env, base + i * (nf << log2_esz));
        if (i == 0) {
            /* Allow fault on first element. */
            probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
        } else {
            /* Bytes of this element still to be probed. */
            remain = nf << log2_esz;
            while (remain > 0) {
                void *host;
                int flags;

                /* Bytes from addr up to the end of its page. */
                offset = -(addr | TARGET_PAGE_MASK);

                /* Probe nonfault on subsequent elements. */
                flags = probe_access_flags(env, addr, offset, MMU_DATA_LOAD,
                                           mmu_index, true, &host, 0);

                /*
                 * Stop if invalid (unmapped) or mmio (transaction may fail).
                 * Do not stop if watchpoint, as the spec says that
                 * first-fault should continue to access the same
                 * elements regardless of any watchpoint.
                 */
                if (flags & ~TLB_WATCHPOINT) {
                    vl = i;
                    goto ProbeSuccess;
                }
                /* The rest of the element lies within the probed page. */
                if (remain <= offset) {
                    break;
                }
                /* The element crosses into the next page: probe that too. */
                remain -= offset;
                addr = adjust_addr(env, addr + offset);
            }
        }
    }
ProbeSuccess:
    /* load bytes from guest memory */
    if (vl != 0) {
        /* Trim vl to the first element that could not be accessed. */
        env->vl = vl;
    }
    /* Note: unlike the probe-free loaders, vstart is not advanced here. */
    for (i = env->vstart; i < env->vl; i++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
551 
/*
 * Generate the fault-only-first load helper NAME for element type ETYPE,
 * delegating to vext_ldff with LOAD_FN as the per-element callback.
 */
#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
                  CPURISCVState *env, uint32_t desc)      \
{                                                         \
    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
              ctzl(sizeof(ETYPE)), GETPC());              \
}

GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
564 
/*
 * Elementary binary operations used as the OP argument of the per-element
 * generator macros.  Every argument is parenthesized so the macros expand
 * correctly even when passed compound expressions.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
574 
575 /*
576  * load and store whole register instructions
577  */
578 static void
579 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
580                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
581 {
582     uint32_t i, k, off, pos;
583     uint32_t nf = vext_nf(desc);
584     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
585     uint32_t max_elems = vlenb >> log2_esz;
586 
587     if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
588         env->vstart = 0;
589         return;
590     }
591 
592     k = env->vstart / max_elems;
593     off = env->vstart % max_elems;
594 
595     if (off) {
596         /* load/store rest of elements of current segment pointed by vstart */
597         for (pos = off; pos < max_elems; pos++, env->vstart++) {
598             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
599             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
600                       ra);
601         }
602         k++;
603     }
604 
605     /* load/store elements for rest of segments */
606     for (; k < nf; k++) {
607         for (i = 0; i < max_elems; i++, env->vstart++) {
608             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
609             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
610         }
611     }
612 
613     env->vstart = 0;
614 }
615 
/*
 * Generate the whole-register load helper NAME for element type ETYPE,
 * delegating to vext_ldst_whole with LOAD_FN as the per-element callback.
 * The number of registers is not a macro argument: vext_ldst_whole reads
 * it from desc.
 */
#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

/* vl<N>re<EEW>_v for each register count and each element width. */
GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
640 
/*
 * Generate the whole-register store helper NAME for element type ETYPE,
 * delegating to vext_ldst_whole with STORE_FN as the per-element callback.
 * The number of registers is not a macro argument: vext_ldst_whole reads
 * it from desc.
 */
#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

/* Whole-register stores are always performed byte by byte. */
GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
653 
/*
 * Vector Integer Arithmetic Instructions
 */

/*
 * Operand type tuples passed to the per-element generator macros, in the
 * order (TD, T1, T2, TX1, TX2).
 *
 * NOTE(review): presumably TD is the destination element type, T1/T2 the
 * two source element types and TX1/TX2 the types the sources are converted
 * to before the operation -- confirm against OPIVV2/OPIVX2, which are
 * defined elsewhere.
 *
 * Naming: OP_* tuples use one width throughout, WOP_* have a destination
 * twice as wide as the first source, NOP_* have a second source twice as
 * wide as the destination.  The S/U letters give the signedness of the
 * types; the _B/_H/_W/_D suffix gives the narrowest element width.
 */
/* (TD, T1, T2, TX1, TX2) */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
682 
/*
 * Subtraction (N - M) and reverse subtraction (M - N).  Arguments are
 * parenthesized so compound expressions expand with the intended
 * precedence.
 */
#define DO_SUB(N, M) ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
685 
/*
 * vadd.vv / vsub.vv: per-element operations for each element width built
 * from DO_ADD / DO_SUB with same-width signed operands.
 */
RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)

/*
 * Helper entry points for the operations above.  The second argument
 * matches the element size in bytes of each variant (1, 2, 4, 8).
 */
GEN_VEXT_VV(vadd_vv_b, 1)
GEN_VEXT_VV(vadd_vv_h, 2)
GEN_VEXT_VV(vadd_vv_w, 4)
GEN_VEXT_VV(vadd_vv_d, 8)
GEN_VEXT_VV(vsub_vv_b, 1)
GEN_VEXT_VV(vsub_vv_h, 2)
GEN_VEXT_VV(vsub_vv_w, 4)
GEN_VEXT_VV(vsub_vv_d, 8)
703 
704 
/*
 * vadd.vx / vsub.vx / vrsub.vx: per-element operations for each element
 * width built from DO_ADD / DO_SUB / DO_RSUB with same-width signed
 * operands.
 */
RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)

/*
 * Helper entry points for the operations above.  The second argument
 * matches the element size in bytes of each variant (1, 2, 4, 8).
 */
GEN_VEXT_VX(vadd_vx_b, 1)
GEN_VEXT_VX(vadd_vx_h, 2)
GEN_VEXT_VX(vadd_vx_w, 4)
GEN_VEXT_VX(vadd_vx_d, 8)
GEN_VEXT_VX(vsub_vx_b, 1)
GEN_VEXT_VX(vsub_vx_h, 2)
GEN_VEXT_VX(vsub_vx_w, 4)
GEN_VEXT_VX(vsub_vx_d, 8)
GEN_VEXT_VX(vrsub_vx_b, 1)
GEN_VEXT_VX(vrsub_vx_h, 2)
GEN_VEXT_VX(vrsub_vx_w, 4)
GEN_VEXT_VX(vrsub_vx_d, 8)
730 
731 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
732 {
733     intptr_t oprsz = simd_oprsz(desc);
734     intptr_t i;
735 
736     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
737         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
738     }
739 }
740 
741 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
742 {
743     intptr_t oprsz = simd_oprsz(desc);
744     intptr_t i;
745 
746     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
747         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
748     }
749 }
750 
751 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
752 {
753     intptr_t oprsz = simd_oprsz(desc);
754     intptr_t i;
755 
756     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
757         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
758     }
759 }
760 
761 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
762 {
763     intptr_t oprsz = simd_oprsz(desc);
764     intptr_t i;
765 
766     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
767         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
768     }
769 }
770 
/* Vector Widening Integer Add/Subtract */
/*
 * Operand type tuples (TD, T1, T2, TX1, TX2) for the widening operations.
 *
 *  - WOP_UUU_*:  unsigned, both sources narrow, destination twice as wide.
 *  - WOP_WUUU_*: unsigned, second source already at the destination width.
 *  - WOP_WSSS_*: signed, second source already at the destination width.
 *
 * The signed narrow/narrow tuples WOP_SSS_B/_H/_W are not repeated here:
 * they are already defined earlier in this file together with the other
 * signed operand tuples.
 */
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
784 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
785 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
786 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
787 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
788 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
789 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
790 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
791 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
792 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
793 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
794 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
795 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
796 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
797 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
798 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
799 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
800 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
801 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
802 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
803 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
804 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
805 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
806 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
807 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
808 GEN_VEXT_VV(vwaddu_vv_b, 2)
809 GEN_VEXT_VV(vwaddu_vv_h, 4)
810 GEN_VEXT_VV(vwaddu_vv_w, 8)
811 GEN_VEXT_VV(vwsubu_vv_b, 2)
812 GEN_VEXT_VV(vwsubu_vv_h, 4)
813 GEN_VEXT_VV(vwsubu_vv_w, 8)
814 GEN_VEXT_VV(vwadd_vv_b, 2)
815 GEN_VEXT_VV(vwadd_vv_h, 4)
816 GEN_VEXT_VV(vwadd_vv_w, 8)
817 GEN_VEXT_VV(vwsub_vv_b, 2)
818 GEN_VEXT_VV(vwsub_vv_h, 4)
819 GEN_VEXT_VV(vwsub_vv_w, 8)
820 GEN_VEXT_VV(vwaddu_wv_b, 2)
821 GEN_VEXT_VV(vwaddu_wv_h, 4)
822 GEN_VEXT_VV(vwaddu_wv_w, 8)
823 GEN_VEXT_VV(vwsubu_wv_b, 2)
824 GEN_VEXT_VV(vwsubu_wv_h, 4)
825 GEN_VEXT_VV(vwsubu_wv_w, 8)
826 GEN_VEXT_VV(vwadd_wv_b, 2)
827 GEN_VEXT_VV(vwadd_wv_h, 4)
828 GEN_VEXT_VV(vwadd_wv_w, 8)
829 GEN_VEXT_VV(vwsub_wv_b, 2)
830 GEN_VEXT_VV(vwsub_wv_h, 4)
831 GEN_VEXT_VV(vwsub_wv_w, 8)
832 
833 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
834 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
835 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
836 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
837 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
838 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
839 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
840 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
841 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
842 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
843 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
844 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
845 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
846 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
847 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
848 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
849 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
850 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
851 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
852 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
853 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
854 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
855 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
856 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
857 GEN_VEXT_VX(vwaddu_vx_b, 2)
858 GEN_VEXT_VX(vwaddu_vx_h, 4)
859 GEN_VEXT_VX(vwaddu_vx_w, 8)
860 GEN_VEXT_VX(vwsubu_vx_b, 2)
861 GEN_VEXT_VX(vwsubu_vx_h, 4)
862 GEN_VEXT_VX(vwsubu_vx_w, 8)
863 GEN_VEXT_VX(vwadd_vx_b, 2)
864 GEN_VEXT_VX(vwadd_vx_h, 4)
865 GEN_VEXT_VX(vwadd_vx_w, 8)
866 GEN_VEXT_VX(vwsub_vx_b, 2)
867 GEN_VEXT_VX(vwsub_vx_h, 4)
868 GEN_VEXT_VX(vwsub_vx_w, 8)
869 GEN_VEXT_VX(vwaddu_wx_b, 2)
870 GEN_VEXT_VX(vwaddu_wx_h, 4)
871 GEN_VEXT_VX(vwaddu_wx_w, 8)
872 GEN_VEXT_VX(vwsubu_wx_b, 2)
873 GEN_VEXT_VX(vwsubu_wx_h, 4)
874 GEN_VEXT_VX(vwsubu_wx_w, 8)
875 GEN_VEXT_VX(vwadd_wx_b, 2)
876 GEN_VEXT_VX(vwadd_wx_h, 4)
877 GEN_VEXT_VX(vwadd_wx_w, 8)
878 GEN_VEXT_VX(vwsub_wx_b, 2)
879 GEN_VEXT_VX(vwsub_wx_h, 4)
880 GEN_VEXT_VX(vwsub_wx_w, 8)
881 
/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */

/*
 * Element operations for add-with-carry and subtract-with-borrow:
 *   N, M  the two operands
 *   C     carry-in (DO_VADC) or borrow-in (DO_VSBC)
 * Every argument is parenthesized so that a compound expression passed
 * as an operand is evaluated as a unit.
 */
#define DO_VADC(N, M, C) ((N) + (M) + (C))
#define DO_VSBC(N, M, C) ((N) - (M) - (C))
885 
886 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
887 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
888                   CPURISCVState *env, uint32_t desc)          \
889 {                                                             \
890     uint32_t vl = env->vl;                                    \
891     uint32_t esz = sizeof(ETYPE);                             \
892     uint32_t total_elems =                                    \
893         vext_get_total_elems(env, desc, esz);                 \
894     uint32_t vta = vext_vta(desc);                            \
895     uint32_t i;                                               \
896                                                               \
897     VSTART_CHECK_EARLY_EXIT(env);                             \
898                                                               \
899     for (i = env->vstart; i < vl; i++) {                      \
900         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
901         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
902         ETYPE carry = vext_elem_mask(v0, i);                  \
903                                                               \
904         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
905     }                                                         \
906     env->vstart = 0;                                          \
907     /* set tail elements to 1s */                             \
908     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
909 }
910 
911 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
912 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
913 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
914 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
915 
916 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
917 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
918 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
919 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
920 
921 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
922 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
923                   CPURISCVState *env, uint32_t desc)                     \
924 {                                                                        \
925     uint32_t vl = env->vl;                                               \
926     uint32_t esz = sizeof(ETYPE);                                        \
927     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
928     uint32_t vta = vext_vta(desc);                                       \
929     uint32_t i;                                                          \
930                                                                          \
931     VSTART_CHECK_EARLY_EXIT(env);                                        \
932                                                                          \
933     for (i = env->vstart; i < vl; i++) {                                 \
934         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
935         ETYPE carry = vext_elem_mask(v0, i);                             \
936                                                                          \
937         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
938     }                                                                    \
939     env->vstart = 0;                                                     \
940     /* set tail elements to 1s */                                        \
941     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
942 }
943 
944 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
945 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
946 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
947 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
948 
949 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
950 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
951 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
952 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
953 
/*
 * Carry-out / borrow-out predicates.
 *
 * DO_MADC: non-zero when N + M (+ 1 if C) wraps in the type of N.  The
 *          sum is truncated back to typeof(N) and compared with N.
 * DO_MSBC: non-zero when N - M (- 1 if C) borrows, i.e. N < M, or
 *          N <= M with a borrow-in.
 *
 * Every argument is parenthesized so that a compound expression passed
 * as an operand is evaluated as a unit.
 */
#define DO_MADC(N, M, C) ((C) ? (__typeof(N))((N) + (M) + 1) <= (N) :   \
                                (__typeof(N))((N) + (M)) < (N))
#define DO_MSBC(N, M, C) ((C) ? (N) <= (M) : (N) < (M))
957 
958 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
959 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
960                   CPURISCVState *env, uint32_t desc)          \
961 {                                                             \
962     uint32_t vl = env->vl;                                    \
963     uint32_t vm = vext_vm(desc);                              \
964     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
965     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
966     uint32_t i;                                               \
967                                                               \
968     VSTART_CHECK_EARLY_EXIT(env);                             \
969                                                               \
970     for (i = env->vstart; i < vl; i++) {                      \
971         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
972         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
973         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
974         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
975     }                                                         \
976     env->vstart = 0;                                          \
977     /*
978      * mask destination register are always tail-agnostic
979      * set tail elements to 1s
980      */                                                       \
981     if (vta_all_1s) {                                         \
982         for (; i < total_elems; i++) {                        \
983             vext_set_elem_mask(vd, i, 1);                     \
984         }                                                     \
985     }                                                         \
986 }
987 
988 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
989 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
990 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
991 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
992 
993 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
994 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
995 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
996 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
997 
998 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
999 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1000                   void *vs2, CPURISCVState *env, uint32_t desc) \
1001 {                                                               \
1002     uint32_t vl = env->vl;                                      \
1003     uint32_t vm = vext_vm(desc);                                \
1004     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1005     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1006     uint32_t i;                                                 \
1007                                                                 \
1008     VSTART_CHECK_EARLY_EXIT(env);                               \
1009                                                                 \
1010     for (i = env->vstart; i < vl; i++) {                        \
1011         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1012         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1013         vext_set_elem_mask(vd, i,                               \
1014                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1015     }                                                           \
1016     env->vstart = 0;                                            \
1017     /*
1018      * mask destination register are always tail-agnostic
1019      * set tail elements to 1s
1020      */                                                         \
1021     if (vta_all_1s) {                                           \
1022         for (; i < total_elems; i++) {                          \
1023             vext_set_elem_mask(vd, i, 1);                       \
1024         }                                                       \
1025     }                                                           \
1026 }
1027 
1028 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1029 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1030 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1031 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1032 
1033 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1034 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1035 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1036 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1037 
1038 /* Vector Bitwise Logical Instructions */
1039 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1040 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1041 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1042 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1043 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1044 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1045 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1046 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1047 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1048 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1049 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1050 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1051 GEN_VEXT_VV(vand_vv_b, 1)
1052 GEN_VEXT_VV(vand_vv_h, 2)
1053 GEN_VEXT_VV(vand_vv_w, 4)
1054 GEN_VEXT_VV(vand_vv_d, 8)
1055 GEN_VEXT_VV(vor_vv_b, 1)
1056 GEN_VEXT_VV(vor_vv_h, 2)
1057 GEN_VEXT_VV(vor_vv_w, 4)
1058 GEN_VEXT_VV(vor_vv_d, 8)
1059 GEN_VEXT_VV(vxor_vv_b, 1)
1060 GEN_VEXT_VV(vxor_vv_h, 2)
1061 GEN_VEXT_VV(vxor_vv_w, 4)
1062 GEN_VEXT_VV(vxor_vv_d, 8)
1063 
1064 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1065 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1066 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1067 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1068 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1069 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1070 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1071 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1072 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1073 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1074 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1075 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1076 GEN_VEXT_VX(vand_vx_b, 1)
1077 GEN_VEXT_VX(vand_vx_h, 2)
1078 GEN_VEXT_VX(vand_vx_w, 4)
1079 GEN_VEXT_VX(vand_vx_d, 8)
1080 GEN_VEXT_VX(vor_vx_b, 1)
1081 GEN_VEXT_VX(vor_vx_h, 2)
1082 GEN_VEXT_VX(vor_vx_w, 4)
1083 GEN_VEXT_VX(vor_vx_d, 8)
1084 GEN_VEXT_VX(vxor_vx_b, 1)
1085 GEN_VEXT_VX(vxor_vx_h, 2)
1086 GEN_VEXT_VX(vxor_vx_w, 4)
1087 GEN_VEXT_VX(vxor_vx_d, 8)
1088 
/* Vector Single-Width Bit Shift Instructions */

/*
 * Shift N left / right by M bits.  DO_SRL follows the signedness of N:
 * the shift helpers instantiate it with an unsigned source type for the
 * logical shifts and a signed one for vsra/vnsra.
 * Both arguments are parenthesized so that a compound expression passed
 * as the shifted value is evaluated as a unit.
 */
#define DO_SLL(N, M)  ((N) << (M))
#define DO_SRL(N, M)  ((N) >> (M))
1092 
1093 /* generate the helpers for shift instructions with two vector operators */
1094 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1095 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1096                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1097 {                                                                         \
1098     uint32_t vm = vext_vm(desc);                                          \
1099     uint32_t vl = env->vl;                                                \
1100     uint32_t esz = sizeof(TS1);                                           \
1101     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1102     uint32_t vta = vext_vta(desc);                                        \
1103     uint32_t vma = vext_vma(desc);                                        \
1104     uint32_t i;                                                           \
1105                                                                           \
1106     VSTART_CHECK_EARLY_EXIT(env);                                         \
1107                                                                           \
1108     for (i = env->vstart; i < vl; i++) {                                  \
1109         if (!vm && !vext_elem_mask(v0, i)) {                              \
1110             /* set masked-off elements to 1s */                           \
1111             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1112             continue;                                                     \
1113         }                                                                 \
1114         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1115         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1116         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1117     }                                                                     \
1118     env->vstart = 0;                                                      \
1119     /* set tail elements to 1s */                                         \
1120     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1121 }
1122 
1123 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1124 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1125 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1126 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1127 
1128 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1129 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1130 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1131 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1132 
1133 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1134 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1135 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1136 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1137 
1138 /*
1139  * generate the helpers for shift instructions with one vector and one scalar
1140  */
1141 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1142 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1143                   void *vs2, CPURISCVState *env,            \
1144                   uint32_t desc)                            \
1145 {                                                           \
1146     uint32_t vm = vext_vm(desc);                            \
1147     uint32_t vl = env->vl;                                  \
1148     uint32_t esz = sizeof(TD);                              \
1149     uint32_t total_elems =                                  \
1150         vext_get_total_elems(env, desc, esz);               \
1151     uint32_t vta = vext_vta(desc);                          \
1152     uint32_t vma = vext_vma(desc);                          \
1153     uint32_t i;                                             \
1154                                                             \
1155     VSTART_CHECK_EARLY_EXIT(env);                           \
1156                                                             \
1157     for (i = env->vstart; i < vl; i++) {                    \
1158         if (!vm && !vext_elem_mask(v0, i)) {                \
1159             /* set masked-off elements to 1s */             \
1160             vext_set_elems_1s(vd, vma, i * esz,             \
1161                               (i + 1) * esz);               \
1162             continue;                                       \
1163         }                                                   \
1164         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1165         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1166     }                                                       \
1167     env->vstart = 0;                                        \
1168     /* set tail elements to 1s */                           \
1169     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1170 }
1171 
1172 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1173 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1174 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1175 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1176 
1177 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1178 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1179 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1180 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1181 
1182 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1183 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1184 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1185 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1186 
1187 /* Vector Narrowing Integer Right Shift Instructions */
1188 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1189 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1190 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1191 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1192 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1193 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1194 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1195 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1196 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1197 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1198 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1199 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1200 
/* Vector Integer Comparison Instructions */

/*
 * Element comparison predicates; each yields 1 when the relation holds
 * between N and M and 0 otherwise.  Signedness comes from the operand
 * types chosen by the instantiating macro.
 * Both arguments are parenthesized so that a compound expression passed
 * as an operand is evaluated as a unit.
 */
#define DO_MSEQ(N, M) ((N) == (M))
#define DO_MSNE(N, M) ((N) != (M))
#define DO_MSLT(N, M) ((N) < (M))
#define DO_MSLE(N, M) ((N) <= (M))
#define DO_MSGT(N, M) ((N) > (M))
1207 
1208 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1209 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1210                   CPURISCVState *env, uint32_t desc)          \
1211 {                                                             \
1212     uint32_t vm = vext_vm(desc);                              \
1213     uint32_t vl = env->vl;                                    \
1214     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1215     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1216     uint32_t vma = vext_vma(desc);                            \
1217     uint32_t i;                                               \
1218                                                               \
1219     VSTART_CHECK_EARLY_EXIT(env);                             \
1220                                                               \
1221     for (i = env->vstart; i < vl; i++) {                      \
1222         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1223         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1224         if (!vm && !vext_elem_mask(v0, i)) {                  \
1225             /* set masked-off elements to 1s */               \
1226             if (vma) {                                        \
1227                 vext_set_elem_mask(vd, i, 1);                 \
1228             }                                                 \
1229             continue;                                         \
1230         }                                                     \
1231         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1232     }                                                         \
1233     env->vstart = 0;                                          \
1234     /*
1235      * mask destination register are always tail-agnostic
1236      * set tail elements to 1s
1237      */                                                       \
1238     if (vta_all_1s) {                                         \
1239         for (; i < total_elems; i++) {                        \
1240             vext_set_elem_mask(vd, i, 1);                     \
1241         }                                                     \
1242     }                                                         \
1243 }
1244 
1245 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1246 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1247 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1248 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1249 
1250 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1251 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1252 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1253 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1254 
1255 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1256 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1257 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1258 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1259 
1260 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1261 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1262 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1263 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1264 
1265 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1266 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1267 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1268 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1269 
1270 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1271 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1272 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1273 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1274 
1275 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1276 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1277                   CPURISCVState *env, uint32_t desc)                \
1278 {                                                                   \
1279     uint32_t vm = vext_vm(desc);                                    \
1280     uint32_t vl = env->vl;                                          \
1281     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1282     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1283     uint32_t vma = vext_vma(desc);                                  \
1284     uint32_t i;                                                     \
1285                                                                     \
1286     VSTART_CHECK_EARLY_EXIT(env);                                   \
1287                                                                     \
1288     for (i = env->vstart; i < vl; i++) {                            \
1289         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1290         if (!vm && !vext_elem_mask(v0, i)) {                        \
1291             /* set masked-off elements to 1s */                     \
1292             if (vma) {                                              \
1293                 vext_set_elem_mask(vd, i, 1);                       \
1294             }                                                       \
1295             continue;                                               \
1296         }                                                           \
1297         vext_set_elem_mask(vd, i,                                   \
1298                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1299     }                                                               \
1300     env->vstart = 0;                                                \
1301     /*
1302      * mask destination register are always tail-agnostic
1303      * set tail elements to 1s
1304      */                                                             \
1305     if (vta_all_1s) {                                               \
1306         for (; i < total_elems; i++) {                              \
1307             vext_set_elem_mask(vd, i, 1);                           \
1308         }                                                           \
1309     }                                                               \
1310 }
1311 
1312 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1313 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1314 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1315 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1316 
1317 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1318 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1319 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1320 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1321 
1322 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1323 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1324 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1325 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1326 
1327 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1328 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1329 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1330 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1331 
1332 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1333 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1334 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1335 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1336 
1337 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1338 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1339 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1340 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1341 
1342 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1343 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1344 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1345 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1346 
1347 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1348 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1349 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1350 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1351 
/*
 * Vector Integer Min/Max Instructions
 *
 * Vector-vector (.vv) forms.  RVVCALL(OPIVV2, ...) emits the per-element
 * worker do_<name> and GEN_VEXT_VV(<name>, n) emits the helper entry point;
 * both generator macros are defined outside this excerpt.  The OP_UUU_*
 * and OP_SSS_* type lists select unsigned or signed element types, so the
 * same DO_MIN/DO_MAX macro serves both the unsigned (vminu/vmaxu) and the
 * signed (vmin/vmax) instructions.
 */
RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
/* second argument: presumably the element size in bytes (1/2/4/8) */
GEN_VEXT_VV(vminu_vv_b, 1)
GEN_VEXT_VV(vminu_vv_h, 2)
GEN_VEXT_VV(vminu_vv_w, 4)
GEN_VEXT_VV(vminu_vv_d, 8)
GEN_VEXT_VV(vmin_vv_b, 1)
GEN_VEXT_VV(vmin_vv_h, 2)
GEN_VEXT_VV(vmin_vv_w, 4)
GEN_VEXT_VV(vmin_vv_d, 8)
GEN_VEXT_VV(vmaxu_vv_b, 1)
GEN_VEXT_VV(vmaxu_vv_h, 2)
GEN_VEXT_VV(vmaxu_vv_w, 4)
GEN_VEXT_VV(vmaxu_vv_d, 8)
GEN_VEXT_VV(vmax_vv_b, 1)
GEN_VEXT_VV(vmax_vv_h, 2)
GEN_VEXT_VV(vmax_vv_w, 4)
GEN_VEXT_VV(vmax_vv_d, 8)
1385 
/*
 * Vector-scalar (.vx) min/max forms.  Same pairing as the .vv forms, but
 * generated through OPIVX2 / GEN_VEXT_VX (both defined outside this
 * excerpt); only two index macros are passed because the scalar operand
 * needs no element indexing.
 */
RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
/* second argument: presumably the element size in bytes (1/2/4/8) */
GEN_VEXT_VX(vminu_vx_b, 1)
GEN_VEXT_VX(vminu_vx_h, 2)
GEN_VEXT_VX(vminu_vx_w, 4)
GEN_VEXT_VX(vminu_vx_d, 8)
GEN_VEXT_VX(vmin_vx_b, 1)
GEN_VEXT_VX(vmin_vx_h, 2)
GEN_VEXT_VX(vmin_vx_w, 4)
GEN_VEXT_VX(vmin_vx_d, 8)
GEN_VEXT_VX(vmaxu_vx_b, 1)
GEN_VEXT_VX(vmaxu_vx_h, 2)
GEN_VEXT_VX(vmaxu_vx_w, 4)
GEN_VEXT_VX(vmaxu_vx_d, 8)
GEN_VEXT_VX(vmax_vx_b, 1)
GEN_VEXT_VX(vmax_vx_h, 2)
GEN_VEXT_VX(vmax_vx_w, 4)
GEN_VEXT_VX(vmax_vx_d, 8)
1418 
1419 /* Vector Single-Width Integer Multiply Instructions */
/*
 * Element-wise product N * M.  Arguments are parenthesized so that an
 * expression argument (e.g. "a + b") is multiplied as a whole instead of
 * being split by operator precedence.
 */
#define DO_MUL(N, M) ((N) * (M))
/*
 * vmul.vv: low half of the product, one instantiation per element width.
 * Only signed type lists are used here; no unsigned vmul variant is
 * instantiated in this block.
 */
RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
GEN_VEXT_VV(vmul_vv_b, 1)
GEN_VEXT_VV(vmul_vv_h, 2)
GEN_VEXT_VV(vmul_vv_w, 4)
GEN_VEXT_VV(vmul_vv_d, 8)
1429 
/*
 * Signed x signed 8-bit multiply, returning the high 8 bits of the
 * 16-bit product.
 */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    /* |product| <= 128 * 128, so it always fits in 16 bits */
    int16_t product = (int16_t)s2 * (int16_t)s1;

    return product >> 8;
}
1434 
/*
 * Signed x signed 16-bit multiply, returning the high 16 bits of the
 * 32-bit product.
 */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    /* |product| <= 2^30, so it always fits in 32 bits */
    int32_t product = (int32_t)s2 * (int32_t)s1;

    return product >> 16;
}
1439 
/*
 * Signed x signed 32-bit multiply, returning the high 32 bits of the
 * 64-bit product.
 */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    /* |product| <= 2^62, so it always fits in 64 bits */
    int64_t product = (int64_t)s2 * (int64_t)s1;

    return product >> 32;
}
1444 
/*
 * Signed x signed 64-bit multiply, returning the high 64 bits.
 * The product is computed by muls64() (declared outside this excerpt),
 * which fills in a low and a high 64-bit half; only the high half is used.
 */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t low, high;

    muls64(&low, &high, s1, s2);
    return (int64_t)high;
}
1452 
/*
 * Unsigned x unsigned 8-bit multiply, returning the high 8 bits of the
 * 16-bit product.
 */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    /* product <= 255 * 255, far below the range of unsigned int */
    unsigned int product = (unsigned int)s2 * (unsigned int)s1;

    return (uint8_t)(product >> 8);
}
1457 
/*
 * Unsigned x unsigned 16-bit multiply, returning the high 16 bits of the
 * 32-bit product.
 */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)s2 * (uint32_t)s1;

    return (uint16_t)(product >> 16);
}
1462 
/*
 * Unsigned x unsigned 32-bit multiply, returning the high 32 bits of the
 * 64-bit product.
 */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)s2 * (uint64_t)s1;

    return (uint32_t)(product >> 32);
}
1467 
/*
 * Unsigned x unsigned 64-bit multiply, returning the high 64 bits.
 * The product is computed by mulu64() (declared outside this excerpt),
 * which fills in a low and a high 64-bit half; only the high half is used.
 */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);
    return high;
}
1475 
/*
 * Signed (s2) x unsigned (s1) 8-bit multiply, returning the high 8 bits
 * of the 16-bit product.
 */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* both operands fit in int, so the signed product is exact */
    int product = (int)s2 * (int)s1;

    return (int8_t)(product >> 8);
}
1480 
/*
 * Signed (s2) x unsigned (s1) 16-bit multiply, returning the high 16 bits
 * of the 32-bit product.
 *
 * Mixing int32_t and uint32_t operands makes C evaluate the product in
 * unsigned 32-bit arithmetic.  The true product fits in 32 bits, so its
 * two's-complement bit pattern is preserved and bits [31:16] are the
 * wanted result.
 */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)(int32_t)s2 * (uint32_t)s1;

    return (int16_t)(product >> 16);
}
1485 
/*
 * Signed (s2) x unsigned (s1) 32-bit multiply, returning the high 32 bits
 * of the 64-bit product.
 *
 * Mixing int64_t and uint64_t operands makes C evaluate the product in
 * unsigned 64-bit arithmetic.  The true product fits in 64 bits, so its
 * two's-complement bit pattern is preserved and bits [63:32] are the
 * wanted result.
 */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)(int64_t)s2 * (uint64_t)s1;

    return (int32_t)(product >> 32);
}
1490 
/*
 * Let  A  = signed operand,
 *      B  = unsigned operand,
 *      U  = the 64 bits of A reinterpreted as an unsigned value,
 *      P  = mulu64(U, B), the unsigned 128-bit product,
 *      SP = A * B, the signed 128-bit product.
 *
 * THEN
 *      IF A < 0
 *          U  = 2 ** 64 + A  (2's complement representation of A)
 *          P  = (2 ** 64 + A) * B
 *             = A * B + 2 ** 64 * B
 *          SP = P - 2 ** 64 * B
 *      ELSE
 *          SP = P
 *
 * Subtracting 2 ** 64 * B only changes the high 64 bits of P, THEN
 *      HI_P -= (A < 0 ? B : 0)
 */
1509 
/*
 * Signed (s2) x unsigned (s1) 64-bit multiply, returning the high 64 bits.
 *
 * mulu64() (declared outside this excerpt) is given s2's bit pattern as an
 * unsigned value; when s2 is negative the high half is then too large by
 * exactly s1, which is subtracted back out.
 */
static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);

    if (s2 < 0) {
        high -= s1;
    }
    return (int64_t)high;
}
1519 
/*
 * High-half multiply, vector-vector forms:
 *   vmulh   - signed x signed      (OP_SSS_*, do_mulh_*)
 *   vmulhu  - unsigned x unsigned  (OP_UUU_*, do_mulhu_*)
 *   vmulhsu - signed x unsigned    (OP_SUS_*, do_mulhsu_*)
 * The do_mulh* workers take (s2, s1) and return the upper half of the
 * double-width product.
 */
RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
GEN_VEXT_VV(vmulh_vv_b, 1)
GEN_VEXT_VV(vmulh_vv_h, 2)
GEN_VEXT_VV(vmulh_vv_w, 4)
GEN_VEXT_VV(vmulh_vv_d, 8)
GEN_VEXT_VV(vmulhu_vv_b, 1)
GEN_VEXT_VV(vmulhu_vv_h, 2)
GEN_VEXT_VV(vmulhu_vv_w, 4)
GEN_VEXT_VV(vmulhu_vv_d, 8)
GEN_VEXT_VV(vmulhsu_vv_b, 1)
GEN_VEXT_VV(vmulhsu_vv_h, 2)
GEN_VEXT_VV(vmulhsu_vv_w, 4)
GEN_VEXT_VV(vmulhsu_vv_d, 8)
1544 
/*
 * Vector-scalar (.vx) forms of vmul / vmulh / vmulhu / vmulhsu.  They
 * reuse DO_MUL and the do_mulh* workers of the .vv forms; only the
 * generator macro (OPIVX2 / GEN_VEXT_VX) differs.
 */
RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
GEN_VEXT_VX(vmul_vx_b, 1)
GEN_VEXT_VX(vmul_vx_h, 2)
GEN_VEXT_VX(vmul_vx_w, 4)
GEN_VEXT_VX(vmul_vx_d, 8)
GEN_VEXT_VX(vmulh_vx_b, 1)
GEN_VEXT_VX(vmulh_vx_h, 2)
GEN_VEXT_VX(vmulh_vx_w, 4)
GEN_VEXT_VX(vmulh_vx_d, 8)
GEN_VEXT_VX(vmulhu_vx_b, 1)
GEN_VEXT_VX(vmulhu_vx_h, 2)
GEN_VEXT_VX(vmulhu_vx_w, 4)
GEN_VEXT_VX(vmulhu_vx_d, 8)
GEN_VEXT_VX(vmulhsu_vx_b, 1)
GEN_VEXT_VX(vmulhsu_vx_h, 2)
GEN_VEXT_VX(vmulhsu_vx_w, 4)
GEN_VEXT_VX(vmulhsu_vx_d, 8)
1577 
1578 /* Vector Integer Divide Instructions */
/*
 * Element-wise divide/remainder primitives; N is the dividend, M the
 * divisor.  None of them ever executes a C division by zero:
 *   - divide by zero yields all ones, i.e. (typeof(N))-1;
 *   - remainder by zero yields the dividend N;
 *   - the signed overflow case (N == -N with M == -1) yields N for the
 *     quotient and 0 for the remainder.
 *
 * Every argument is parenthesized so that expression arguments are not
 * split by operator precedence.  Arguments are still evaluated more than
 * once, so they must be free of side effects.
 *
 * NOTE(review): for int32_t/int64_t the most negative value makes -(N)
 * overflow; presumably the build uses wrapping signed arithmetic
 * (e.g. -fwrapv) -- confirm.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? (__typeof(N))(-1) : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? (N) : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? (__typeof(N))(-1) :          \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? (N) :    \
        (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? (N) :                        \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? 0 :      \
        (N) % (M))
1585 
/*
 * Divide/remainder, vector-vector forms.  Unsigned instructions
 * (vdivu/vremu) pair OP_UUU_* type lists with DO_DIVU/DO_REMU; signed
 * instructions (vdiv/vrem) pair OP_SSS_* type lists with DO_DIV/DO_REM.
 */
RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
GEN_VEXT_VV(vdivu_vv_b, 1)
GEN_VEXT_VV(vdivu_vv_h, 2)
GEN_VEXT_VV(vdivu_vv_w, 4)
GEN_VEXT_VV(vdivu_vv_d, 8)
GEN_VEXT_VV(vdiv_vv_b, 1)
GEN_VEXT_VV(vdiv_vv_h, 2)
GEN_VEXT_VV(vdiv_vv_w, 4)
GEN_VEXT_VV(vdiv_vv_d, 8)
GEN_VEXT_VV(vremu_vv_b, 1)
GEN_VEXT_VV(vremu_vv_h, 2)
GEN_VEXT_VV(vremu_vv_w, 4)
GEN_VEXT_VV(vremu_vv_d, 8)
GEN_VEXT_VV(vrem_vv_b, 1)
GEN_VEXT_VV(vrem_vv_h, 2)
GEN_VEXT_VV(vrem_vv_w, 4)
GEN_VEXT_VV(vrem_vv_d, 8)
1618 
/*
 * Divide/remainder, vector-scalar forms.  Same type-list / macro pairing
 * as the .vv forms, generated through OPIVX2 / GEN_VEXT_VX.
 */
RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
GEN_VEXT_VX(vdivu_vx_b, 1)
GEN_VEXT_VX(vdivu_vx_h, 2)
GEN_VEXT_VX(vdivu_vx_w, 4)
GEN_VEXT_VX(vdivu_vx_d, 8)
GEN_VEXT_VX(vdiv_vx_b, 1)
GEN_VEXT_VX(vdiv_vx_h, 2)
GEN_VEXT_VX(vdiv_vx_w, 4)
GEN_VEXT_VX(vdiv_vx_d, 8)
GEN_VEXT_VX(vremu_vx_b, 1)
GEN_VEXT_VX(vremu_vx_h, 2)
GEN_VEXT_VX(vremu_vx_w, 4)
GEN_VEXT_VX(vremu_vx_d, 8)
GEN_VEXT_VX(vrem_vx_b, 1)
GEN_VEXT_VX(vrem_vx_h, 2)
GEN_VEXT_VX(vrem_vx_w, 4)
GEN_VEXT_VX(vrem_vx_d, 8)
1651 
/*
 * Vector Widening Integer Multiply Instructions
 *
 * Vector-vector forms.  The destination index macro (H2/H4/H8) is one
 * step wider than the source index macros (H1/H2/H4), and GEN_VEXT_VV is
 * given 2/4/8, i.e. the destination is presumably twice the source
 * element width.  The WOP_* type lists are defined outside this excerpt;
 * SSS/UUU/SUS select signed, unsigned and mixed operand signedness.
 * There is no _d variant.
 */
RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
GEN_VEXT_VV(vwmul_vv_b, 2)
GEN_VEXT_VV(vwmul_vv_h, 4)
GEN_VEXT_VV(vwmul_vv_w, 8)
GEN_VEXT_VV(vwmulu_vv_b, 2)
GEN_VEXT_VV(vwmulu_vv_h, 4)
GEN_VEXT_VV(vwmulu_vv_w, 8)
GEN_VEXT_VV(vwmulsu_vv_b, 2)
GEN_VEXT_VV(vwmulsu_vv_h, 4)
GEN_VEXT_VV(vwmulsu_vv_w, 8)
1671 
/*
 * Widening multiply, vector-scalar forms.  Same WOP_* type lists and
 * destination sizes (2/4/8) as the .vv forms, generated through
 * OPIVX2 / GEN_VEXT_VX.
 */
RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
GEN_VEXT_VX(vwmul_vx_b, 2)
GEN_VEXT_VX(vwmul_vx_h, 4)
GEN_VEXT_VX(vwmul_vx_w, 8)
GEN_VEXT_VX(vwmulu_vx_b, 2)
GEN_VEXT_VX(vwmulu_vx_h, 4)
GEN_VEXT_VX(vwmulu_vx_w, 8)
GEN_VEXT_VX(vwmulsu_vx_b, 2)
GEN_VEXT_VX(vwmulsu_vx_h, 4)
GEN_VEXT_VX(vwmulsu_vx_w, 8)
1690 
1691 /* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * OPIVV3: generate the per-element worker do_<NAME> for a three-operand
 * vector-vector operation whose destination is also a source.
 *
 *   TD        destination element type
 *   T1, T2    element types used to load from vs1 and vs2
 *   TX1, TX2  types the loaded s1 / s2 values are held in
 *   HD, HS1, HS2  index macros applied to i for vd, vs1 and vs2
 *   OP        called as OP(s2, s1, d), d being the current vd[i] value
 *
 * The worker reads element i of vs1, vs2 and vd, then overwrites vd[i]
 * with the result of OP.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
}
1700 
/*
 * Multiply-add primitives, invoked by OPIVV3/OPIVX3 as OP(s2, s1, d),
 * i.e. N = vs2 element, M = vs1 element or scalar, D = current vd element:
 *   DO_MACC   ->  (M * N) + D   product accumulated onto the destination
 *   DO_NMSAC  -> -(M * N) + D   product subtracted from the destination
 *   DO_MADD   ->  (M * D) + N   destination is a multiplicand, N is added
 *   DO_NMSUB  -> -(M * D) + N   as DO_MADD with the product negated
 *
 * Every argument is parenthesized so that expression arguments are not
 * split by operator precedence.
 */
#define DO_MACC(N, M, D) ((M) * (N) + (D))
#define DO_NMSAC(N, M, D) (-((M) * (N)) + (D))
#define DO_MADD(N, M, D) ((M) * (D) + (N))
#define DO_NMSUB(N, M, D) (-((M) * (D)) + (N))
/*
 * Single-width multiply-add, vector-vector forms (vmacc, vnmsac, vmadd,
 * vnmsub).  All use the signed OP_SSS_* type lists; the instruction is
 * selected purely by the DO_* macro passed as the operation.
 */
RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
GEN_VEXT_VV(vmacc_vv_b, 1)
GEN_VEXT_VV(vmacc_vv_h, 2)
GEN_VEXT_VV(vmacc_vv_w, 4)
GEN_VEXT_VV(vmacc_vv_d, 8)
GEN_VEXT_VV(vnmsac_vv_b, 1)
GEN_VEXT_VV(vnmsac_vv_h, 2)
GEN_VEXT_VV(vnmsac_vv_w, 4)
GEN_VEXT_VV(vnmsac_vv_d, 8)
GEN_VEXT_VV(vmadd_vv_b, 1)
GEN_VEXT_VV(vmadd_vv_h, 2)
GEN_VEXT_VV(vmadd_vv_w, 4)
GEN_VEXT_VV(vmadd_vv_d, 8)
GEN_VEXT_VV(vnmsub_vv_b, 1)
GEN_VEXT_VV(vnmsub_vv_h, 2)
GEN_VEXT_VV(vnmsub_vv_w, 4)
GEN_VEXT_VV(vnmsub_vv_d, 8)
1737 
/*
 * OPIVX3: vector-scalar counterpart of OPIVV3.  Generates do_<NAME>,
 * which reads element i of vs2 and vd and overwrites vd[i] with
 * OP(s2, scalar, d).
 *
 * The scalar arrives as target_long and is narrowed to T1, then converted
 * to TX1, before being handed to OP.  There is no HS1 parameter because
 * the scalar operand is not indexed.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    TD d = *((TD *)vd + HD(i));                                     \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
}
1745 
/*
 * Single-width multiply-add, vector-scalar forms.  Same DO_* operations
 * as the .vv forms, generated through OPIVX3 / GEN_VEXT_VX.
 */
RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
GEN_VEXT_VX(vmacc_vx_b, 1)
GEN_VEXT_VX(vmacc_vx_h, 2)
GEN_VEXT_VX(vmacc_vx_w, 4)
GEN_VEXT_VX(vmacc_vx_d, 8)
GEN_VEXT_VX(vnmsac_vx_b, 1)
GEN_VEXT_VX(vnmsac_vx_h, 2)
GEN_VEXT_VX(vnmsac_vx_w, 4)
GEN_VEXT_VX(vnmsac_vx_d, 8)
GEN_VEXT_VX(vmadd_vx_b, 1)
GEN_VEXT_VX(vmadd_vx_h, 2)
GEN_VEXT_VX(vmadd_vx_w, 4)
GEN_VEXT_VX(vmadd_vx_d, 8)
GEN_VEXT_VX(vnmsub_vx_b, 1)
GEN_VEXT_VX(vnmsub_vx_h, 2)
GEN_VEXT_VX(vnmsub_vx_w, 4)
GEN_VEXT_VX(vnmsub_vx_d, 8)
1778 
/*
 * Vector Widening Integer Multiply-Add Instructions
 *
 * Vector-vector forms.  Every variant uses DO_MACC; the operand
 * signedness comes from the WOP_* type list (UUU, SSS, SSU), all defined
 * outside this excerpt.  Destination index macros and GEN_VEXT_VV sizes
 * are one step wider than the sources, as for the widening multiplies.
 */
RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
GEN_VEXT_VV(vwmaccu_vv_b, 2)
GEN_VEXT_VV(vwmaccu_vv_h, 4)
GEN_VEXT_VV(vwmaccu_vv_w, 8)
GEN_VEXT_VV(vwmacc_vv_b, 2)
GEN_VEXT_VV(vwmacc_vv_h, 4)
GEN_VEXT_VV(vwmacc_vv_w, 8)
GEN_VEXT_VV(vwmaccsu_vv_b, 2)
GEN_VEXT_VV(vwmaccsu_vv_h, 4)
GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1798 
/*
 * Widening multiply-add, vector-scalar forms.  In addition to the three
 * variants that also exist as .vv, vwmaccus (WOP_SUS_* type list) is
 * instantiated here; this excerpt contains no .vv counterpart for it.
 */
RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
GEN_VEXT_VX(vwmaccu_vx_b, 2)
GEN_VEXT_VX(vwmaccu_vx_h, 4)
GEN_VEXT_VX(vwmaccu_vx_w, 8)
GEN_VEXT_VX(vwmacc_vx_b, 2)
GEN_VEXT_VX(vwmacc_vx_h, 4)
GEN_VEXT_VX(vwmacc_vx_w, 8)
GEN_VEXT_VX(vwmaccsu_vx_b, 2)
GEN_VEXT_VX(vwmaccsu_vx_h, 4)
GEN_VEXT_VX(vwmaccsu_vx_w, 8)
GEN_VEXT_VX(vwmaccus_vx_b, 2)
GEN_VEXT_VX(vwmaccus_vx_h, 4)
GEN_VEXT_VX(vwmaccus_vx_w, 8)
1823 
1824 /* Vector Integer Merge and Move Instructions */
1825 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1826 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1827                   uint32_t desc)                                     \
1828 {                                                                    \
1829     uint32_t vl = env->vl;                                           \
1830     uint32_t esz = sizeof(ETYPE);                                    \
1831     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1832     uint32_t vta = vext_vta(desc);                                   \
1833     uint32_t i;                                                      \
1834                                                                      \
1835     VSTART_CHECK_EARLY_EXIT(env);                                    \
1836                                                                      \
1837     for (i = env->vstart; i < vl; i++) {                             \
1838         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1839         *((ETYPE *)vd + H(i)) = s1;                                  \
1840     }                                                                \
1841     env->vstart = 0;                                                 \
1842     /* set tail elements to 1s */                                    \
1843     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1844 }
1845 
1846 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1847 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1848 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1849 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1850 
1851 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1852 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1853                   uint32_t desc)                                     \
1854 {                                                                    \
1855     uint32_t vl = env->vl;                                           \
1856     uint32_t esz = sizeof(ETYPE);                                    \
1857     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1858     uint32_t vta = vext_vta(desc);                                   \
1859     uint32_t i;                                                      \
1860                                                                      \
1861     VSTART_CHECK_EARLY_EXIT(env);                                    \
1862                                                                      \
1863     for (i = env->vstart; i < vl; i++) {                             \
1864         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1865     }                                                                \
1866     env->vstart = 0;                                                 \
1867     /* set tail elements to 1s */                                    \
1868     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1869 }
1870 
1871 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1872 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1873 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1874 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1875 
1876 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1877 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1878                   CPURISCVState *env, uint32_t desc)                 \
1879 {                                                                    \
1880     uint32_t vl = env->vl;                                           \
1881     uint32_t esz = sizeof(ETYPE);                                    \
1882     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1883     uint32_t vta = vext_vta(desc);                                   \
1884     uint32_t i;                                                      \
1885                                                                      \
1886     VSTART_CHECK_EARLY_EXIT(env);                                    \
1887                                                                      \
1888     for (i = env->vstart; i < vl; i++) {                             \
1889         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1890         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1891     }                                                                \
1892     env->vstart = 0;                                                 \
1893     /* set tail elements to 1s */                                    \
1894     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1895 }
1896 
1897 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1898 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1899 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1900 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1901 
1902 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1903 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1904                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1905 {                                                                    \
1906     uint32_t vl = env->vl;                                           \
1907     uint32_t esz = sizeof(ETYPE);                                    \
1908     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1909     uint32_t vta = vext_vta(desc);                                   \
1910     uint32_t i;                                                      \
1911                                                                      \
1912     VSTART_CHECK_EARLY_EXIT(env);                                    \
1913                                                                      \
1914     for (i = env->vstart; i < vl; i++) {                             \
1915         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
1916         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
1917                    (ETYPE)(target_long)s1);                          \
1918         *((ETYPE *)vd + H(i)) = d;                                   \
1919     }                                                                \
1920     env->vstart = 0;                                                 \
1921     /* set tail elements to 1s */                                    \
1922     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1923 }
1924 
1925 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
1926 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
1927 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
1928 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1929 
1930 /*
1931  * Vector Fixed-Point Arithmetic Instructions
1932  */
1933 
1934 /* Vector Single-Width Saturating Add and Subtract */
1935 
1936 /*
1937  * As fixed point instructions probably have round mode and saturation,
1938  * define common macros for fixed point here.
1939  */
/*
 * Signature of a per-element fixed-point worker for vector-vector
 * operations: it processes element i of vs1/vs2 into vd and additionally
 * receives the CPU state and the rounding mode vxrm.
 */
typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);

/*
 * OPIVV2_RM: generate the opivv2_rm_fn worker do_<NAME>.
 *
 *   TD, T1, T2    destination / vs1 / vs2 element types
 *   TX1, TX2      types the loaded s1 / s2 values are held in
 *   HD, HS1, HS2  index macros applied to i for vd, vs1 and vs2
 *   OP            called as OP(env, vxrm, s2, s1); its result is stored
 *                 into vd[i]
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
}
1952 
1953 static inline void
1954 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1955              CPURISCVState *env,
1956              uint32_t vl, uint32_t vm, int vxrm,
1957              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
1958 {
1959     VSTART_CHECK_EARLY_EXIT(env);
1960 
1961     for (uint32_t i = env->vstart; i < vl; i++) {
1962         if (!vm && !vext_elem_mask(v0, i)) {
1963             /* set masked-off elements to 1s */
1964             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
1965             continue;
1966         }
1967         fn(vd, vs1, vs2, i, env, vxrm);
1968     }
1969     env->vstart = 0;
1970 }
1971 
1972 static inline void
1973 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
1974              CPURISCVState *env,
1975              uint32_t desc,
1976              opivv2_rm_fn *fn, uint32_t esz)
1977 {
1978     uint32_t vm = vext_vm(desc);
1979     uint32_t vl = env->vl;
1980     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
1981     uint32_t vta = vext_vta(desc);
1982     uint32_t vma = vext_vma(desc);
1983 
1984     switch (env->vxrm) {
1985     case 0: /* rnu */
1986         vext_vv_rm_1(vd, v0, vs1, vs2,
1987                      env, vl, vm, 0, fn, vma, esz);
1988         break;
1989     case 1: /* rne */
1990         vext_vv_rm_1(vd, v0, vs1, vs2,
1991                      env, vl, vm, 1, fn, vma, esz);
1992         break;
1993     case 2: /* rdn */
1994         vext_vv_rm_1(vd, v0, vs1, vs2,
1995                      env, vl, vm, 2, fn, vma, esz);
1996         break;
1997     default: /* rod */
1998         vext_vv_rm_1(vd, v0, vs1, vs2,
1999                      env, vl, vm, 3, fn, vma, esz);
2000         break;
2001     }
2002     /* set tail elements to 1s */
2003     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2004 }
2005 
/*
 * generate helpers for fixed point instructions with OPIVV format
 *
 * NAME is the helper name; the per-element worker do_<NAME> must already
 * exist (see OPIVV2_RM).  ESZ is forwarded to vext_vv_rm_2() as the
 * element size used for its byte-offset computations.
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
                 do_##NAME, ESZ);                               \
}
2014 
2015 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2016                              uint8_t b)
2017 {
2018     uint8_t res = a + b;
2019     if (res < a) {
2020         res = UINT8_MAX;
2021         env->vxsat = 0x1;
2022     }
2023     return res;
2024 }
2025 
2026 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2027                                uint16_t b)
2028 {
2029     uint16_t res = a + b;
2030     if (res < a) {
2031         res = UINT16_MAX;
2032         env->vxsat = 0x1;
2033     }
2034     return res;
2035 }
2036 
2037 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2038                                uint32_t b)
2039 {
2040     uint32_t res = a + b;
2041     if (res < a) {
2042         res = UINT32_MAX;
2043         env->vxsat = 0x1;
2044     }
2045     return res;
2046 }
2047 
2048 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2049                                uint64_t b)
2050 {
2051     uint64_t res = a + b;
2052     if (res < a) {
2053         res = UINT64_MAX;
2054         env->vxsat = 0x1;
2055     }
2056     return res;
2057 }
2058 
/*
 * vsaddu.vv: unsigned saturating add, vector-vector form.  One worker per
 * element width (saddu8/16/32/64) wrapped by OPIVV2_RM, plus the helper
 * entry point from GEN_VEXT_VV_RM with the matching element size.
 */
RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2067 
/*
 * Signature of a per-element fixed-point worker for vector-scalar
 * operations: like opivv2_rm_fn, but the first source operand is the
 * scalar s1 instead of a vector register pointer.
 */
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);

/*
 * OPIVX2_RM: generate the opivx2_rm_fn worker do_<NAME>.
 *
 * Element i of vs2 is loaded as T2 and held as TX2; the scalar is
 * narrowed to T1 and then converted to TX1.  OP is called as
 * OP(env, vxrm, s2, scalar) and its result is stored into vd[i].
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
}
2079 
2080 static inline void
2081 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2082              CPURISCVState *env,
2083              uint32_t vl, uint32_t vm, int vxrm,
2084              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2085 {
2086     VSTART_CHECK_EARLY_EXIT(env);
2087 
2088     for (uint32_t i = env->vstart; i < vl; i++) {
2089         if (!vm && !vext_elem_mask(v0, i)) {
2090             /* set masked-off elements to 1s */
2091             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2092             continue;
2093         }
2094         fn(vd, s1, vs2, i, env, vxrm);
2095     }
2096     env->vstart = 0;
2097 }
2098 
2099 static inline void
2100 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2101              CPURISCVState *env,
2102              uint32_t desc,
2103              opivx2_rm_fn *fn, uint32_t esz)
2104 {
2105     uint32_t vm = vext_vm(desc);
2106     uint32_t vl = env->vl;
2107     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2108     uint32_t vta = vext_vta(desc);
2109     uint32_t vma = vext_vma(desc);
2110 
2111     switch (env->vxrm) {
2112     case 0: /* rnu */
2113         vext_vx_rm_1(vd, v0, s1, vs2,
2114                      env, vl, vm, 0, fn, vma, esz);
2115         break;
2116     case 1: /* rne */
2117         vext_vx_rm_1(vd, v0, s1, vs2,
2118                      env, vl, vm, 1, fn, vma, esz);
2119         break;
2120     case 2: /* rdn */
2121         vext_vx_rm_1(vd, v0, s1, vs2,
2122                      env, vl, vm, 2, fn, vma, esz);
2123         break;
2124     default: /* rod */
2125         vext_vx_rm_1(vd, v0, s1, vs2,
2126                      env, vl, vm, 3, fn, vma, esz);
2127         break;
2128     }
2129     /* set tail elements to 1s */
2130     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2131 }
2132 
/*
 * Generate helpers for fixed point instructions with OPIVX format.
 *
 * HELPER(NAME) simply forwards to vext_vx_rm_2() with the per-element
 * worker do_NAME (produced by OPIVX2_RM) and the element size ESZ.  Note
 * that s1 arrives as target_ulong and is converted to the target_long
 * parameter of vext_vx_rm_2().
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
                 do_##NAME, ESZ);                         \
}
2142 
/*
 * vsaddu_vx_{b,h,w,d}: vector-scalar counterparts of the vsaddu_vv
 * helpers, again built on saddu8/16/32/64 with element sizes 1, 2, 4, 8.
 */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2151 
2152 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2153 {
2154     int8_t res = a + b;
2155     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2156         res = a > 0 ? INT8_MAX : INT8_MIN;
2157         env->vxsat = 0x1;
2158     }
2159     return res;
2160 }
2161 
2162 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2163                              int16_t b)
2164 {
2165     int16_t res = a + b;
2166     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2167         res = a > 0 ? INT16_MAX : INT16_MIN;
2168         env->vxsat = 0x1;
2169     }
2170     return res;
2171 }
2172 
2173 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2174                              int32_t b)
2175 {
2176     int32_t res = a + b;
2177     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2178         res = a > 0 ? INT32_MAX : INT32_MIN;
2179         env->vxsat = 0x1;
2180     }
2181     return res;
2182 }
2183 
2184 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2185                              int64_t b)
2186 {
2187     int64_t res = a + b;
2188     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2189         res = a > 0 ? INT64_MAX : INT64_MIN;
2190         env->vxsat = 0x1;
2191     }
2192     return res;
2193 }
2194 
/*
 * vsadd_vv_* and vsadd_vx_*: signed saturating add helpers for the four
 * element widths, built on sadd8/16/32/64.
 */
RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
GEN_VEXT_VV_RM(vsadd_vv_b, 1)
GEN_VEXT_VV_RM(vsadd_vv_h, 2)
GEN_VEXT_VV_RM(vsadd_vv_w, 4)
GEN_VEXT_VV_RM(vsadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
GEN_VEXT_VX_RM(vsadd_vx_b, 1)
GEN_VEXT_VX_RM(vsadd_vx_h, 2)
GEN_VEXT_VX_RM(vsadd_vx_w, 4)
GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2212 
2213 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2214                              uint8_t b)
2215 {
2216     uint8_t res = a - b;
2217     if (res > a) {
2218         res = 0;
2219         env->vxsat = 0x1;
2220     }
2221     return res;
2222 }
2223 
2224 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2225                                uint16_t b)
2226 {
2227     uint16_t res = a - b;
2228     if (res > a) {
2229         res = 0;
2230         env->vxsat = 0x1;
2231     }
2232     return res;
2233 }
2234 
2235 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2236                                uint32_t b)
2237 {
2238     uint32_t res = a - b;
2239     if (res > a) {
2240         res = 0;
2241         env->vxsat = 0x1;
2242     }
2243     return res;
2244 }
2245 
2246 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2247                                uint64_t b)
2248 {
2249     uint64_t res = a - b;
2250     if (res > a) {
2251         res = 0;
2252         env->vxsat = 0x1;
2253     }
2254     return res;
2255 }
2256 
/*
 * vssubu_vv_* and vssubu_vx_*: unsigned saturating subtract helpers for
 * the four element widths, built on ssubu8/16/32/64.
 */
RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
GEN_VEXT_VV_RM(vssubu_vv_b, 1)
GEN_VEXT_VV_RM(vssubu_vv_h, 2)
GEN_VEXT_VV_RM(vssubu_vv_w, 4)
GEN_VEXT_VV_RM(vssubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
GEN_VEXT_VX_RM(vssubu_vx_b, 1)
GEN_VEXT_VX_RM(vssubu_vx_h, 2)
GEN_VEXT_VX_RM(vssubu_vx_w, 4)
GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2274 
2275 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2276 {
2277     int8_t res = a - b;
2278     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2279         res = a >= 0 ? INT8_MAX : INT8_MIN;
2280         env->vxsat = 0x1;
2281     }
2282     return res;
2283 }
2284 
2285 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2286                              int16_t b)
2287 {
2288     int16_t res = a - b;
2289     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2290         res = a >= 0 ? INT16_MAX : INT16_MIN;
2291         env->vxsat = 0x1;
2292     }
2293     return res;
2294 }
2295 
2296 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2297                              int32_t b)
2298 {
2299     int32_t res = a - b;
2300     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2301         res = a >= 0 ? INT32_MAX : INT32_MIN;
2302         env->vxsat = 0x1;
2303     }
2304     return res;
2305 }
2306 
2307 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2308                              int64_t b)
2309 {
2310     int64_t res = a - b;
2311     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2312         res = a >= 0 ? INT64_MAX : INT64_MIN;
2313         env->vxsat = 0x1;
2314     }
2315     return res;
2316 }
2317 
/*
 * vssub_vv_* and vssub_vx_*: signed saturating subtract helpers for the
 * four element widths, built on ssub8/16/32/64.
 */
RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
GEN_VEXT_VV_RM(vssub_vv_b, 1)
GEN_VEXT_VV_RM(vssub_vv_h, 2)
GEN_VEXT_VV_RM(vssub_vv_w, 4)
GEN_VEXT_VV_RM(vssub_vv_d, 8)

RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
GEN_VEXT_VX_RM(vssub_vx_b, 1)
GEN_VEXT_VX_RM(vssub_vx_h, 2)
GEN_VEXT_VX_RM(vssub_vx_w, 4)
GEN_VEXT_VX_RM(vssub_vx_d, 8)
2335 
2336 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Compute the rounding increment (0 or 1) to add to (v >> shift).
 *
 * @vxrm:  rounding mode: 0 = round-to-nearest-up, 1 = round-to-nearest-even,
 *         3 = round-to-odd; any other value truncates (returns 0)
 * @v:     value that is about to be shifted right
 * @shift: shift amount; 0 or anything above 64 yields no increment
 *
 * Bit names used below: d is bit 'shift' of @v (the LSB of the shifted
 * result), d1 is bit 'shift - 1' (the most significant discarded bit),
 * D1 is all discarded bits and D2 is the discarded bits below d1.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d, d1;
    uint64_t D1, D2;

    /*
     * Validate the shift before touching any bit: extract64() requires
     * start + length <= 64, so it must not be reached with an
     * out-of-range start.
     */
    if (shift == 0 || shift > 64) {
        return 0;
    }

    /* With shift == 64 there is no bit 'shift'; the shifted LSB is 0. */
    d = shift < 64 ? extract64(v, shift, 1) : 0;
    d1 = extract64(v, shift - 1, 1);
    D1 = extract64(v, 0, shift);
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2363 
2364 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2365                              int32_t b)
2366 {
2367     int64_t res = (int64_t)a + b;
2368     uint8_t round = get_round(vxrm, res, 1);
2369 
2370     return (res >> 1) + round;
2371 }
2372 
2373 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2374                              int64_t b)
2375 {
2376     int64_t res = a + b;
2377     uint8_t round = get_round(vxrm, res, 1);
2378     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2379 
2380     /* With signed overflow, bit 64 is inverse of bit 63. */
2381     return ((res >> 1) ^ over) + round;
2382 }
2383 
/*
 * vaadd_vv_* and vaadd_vx_*: signed averaging add helpers.  The b, h and w
 * variants all use aadd32; only the d variant needs aadd64.
 */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
GEN_VEXT_VV_RM(vaadd_vv_b, 1)
GEN_VEXT_VV_RM(vaadd_vv_h, 2)
GEN_VEXT_VV_RM(vaadd_vv_w, 4)
GEN_VEXT_VV_RM(vaadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
GEN_VEXT_VX_RM(vaadd_vx_b, 1)
GEN_VEXT_VX_RM(vaadd_vx_h, 2)
GEN_VEXT_VX_RM(vaadd_vx_w, 4)
GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2401 
2402 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2403                                uint32_t a, uint32_t b)
2404 {
2405     uint64_t res = (uint64_t)a + b;
2406     uint8_t round = get_round(vxrm, res, 1);
2407 
2408     return (res >> 1) + round;
2409 }
2410 
2411 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2412                                uint64_t a, uint64_t b)
2413 {
2414     uint64_t res = a + b;
2415     uint8_t round = get_round(vxrm, res, 1);
2416     uint64_t over = (uint64_t)(res < a) << 63;
2417 
2418     return ((res >> 1) | over) + round;
2419 }
2420 
/*
 * vaaddu_vv_* and vaaddu_vx_*: unsigned averaging add helpers.  The b, h
 * and w variants all use aaddu32; only the d variant needs aaddu64.
 */
RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
GEN_VEXT_VV_RM(vaaddu_vv_d, 8)

RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2438 
2439 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2440                              int32_t b)
2441 {
2442     int64_t res = (int64_t)a - b;
2443     uint8_t round = get_round(vxrm, res, 1);
2444 
2445     return (res >> 1) + round;
2446 }
2447 
2448 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2449                              int64_t b)
2450 {
2451     int64_t res = (int64_t)a - b;
2452     uint8_t round = get_round(vxrm, res, 1);
2453     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2454 
2455     /* With signed overflow, bit 64 is inverse of bit 63. */
2456     return ((res >> 1) ^ over) + round;
2457 }
2458 
/*
 * vasub_vv_* and vasub_vx_*: signed averaging subtract helpers.  The b, h
 * and w variants all use asub32; only the d variant needs asub64.
 */
RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
GEN_VEXT_VV_RM(vasub_vv_b, 1)
GEN_VEXT_VV_RM(vasub_vv_h, 2)
GEN_VEXT_VV_RM(vasub_vv_w, 4)
GEN_VEXT_VV_RM(vasub_vv_d, 8)

RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
GEN_VEXT_VX_RM(vasub_vx_b, 1)
GEN_VEXT_VX_RM(vasub_vx_h, 2)
GEN_VEXT_VX_RM(vasub_vx_w, 4)
GEN_VEXT_VX_RM(vasub_vx_d, 8)
2476 
2477 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2478                                uint32_t a, uint32_t b)
2479 {
2480     int64_t res = (int64_t)a - b;
2481     uint8_t round = get_round(vxrm, res, 1);
2482 
2483     return (res >> 1) + round;
2484 }
2485 
2486 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2487                                uint64_t a, uint64_t b)
2488 {
2489     uint64_t res = (uint64_t)a - b;
2490     uint8_t round = get_round(vxrm, res, 1);
2491     uint64_t over = (uint64_t)(res > a) << 63;
2492 
2493     return ((res >> 1) | over) + round;
2494 }
2495 
/*
 * vasubu_vv_* and vasubu_vx_*: unsigned averaging subtract helpers.  The
 * b, h and w variants all use asubu32; only the d variant needs asubu64.
 */
RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
GEN_VEXT_VV_RM(vasubu_vv_b, 1)
GEN_VEXT_VV_RM(vasubu_vv_h, 2)
GEN_VEXT_VV_RM(vasubu_vv_w, 4)
GEN_VEXT_VV_RM(vasubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
GEN_VEXT_VX_RM(vasubu_vx_b, 1)
GEN_VEXT_VX_RM(vasubu_vx_h, 2)
GEN_VEXT_VX_RM(vasubu_vx_w, 4)
GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2513 
2514 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2515 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2516 {
2517     uint8_t round;
2518     int16_t res;
2519 
2520     res = (int16_t)a * (int16_t)b;
2521     round = get_round(vxrm, res, 7);
2522     res = (res >> 7) + round;
2523 
2524     if (res > INT8_MAX) {
2525         env->vxsat = 0x1;
2526         return INT8_MAX;
2527     } else if (res < INT8_MIN) {
2528         env->vxsat = 0x1;
2529         return INT8_MIN;
2530     } else {
2531         return res;
2532     }
2533 }
2534 
2535 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2536 {
2537     uint8_t round;
2538     int32_t res;
2539 
2540     res = (int32_t)a * (int32_t)b;
2541     round = get_round(vxrm, res, 15);
2542     res = (res >> 15) + round;
2543 
2544     if (res > INT16_MAX) {
2545         env->vxsat = 0x1;
2546         return INT16_MAX;
2547     } else if (res < INT16_MIN) {
2548         env->vxsat = 0x1;
2549         return INT16_MIN;
2550     } else {
2551         return res;
2552     }
2553 }
2554 
2555 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2556 {
2557     uint8_t round;
2558     int64_t res;
2559 
2560     res = (int64_t)a * (int64_t)b;
2561     round = get_round(vxrm, res, 31);
2562     res = (res >> 31) + round;
2563 
2564     if (res > INT32_MAX) {
2565         env->vxsat = 0x1;
2566         return INT32_MAX;
2567     } else if (res < INT32_MIN) {
2568         env->vxsat = 0x1;
2569         return INT32_MIN;
2570     } else {
2571         return res;
2572     }
2573 }
2574 
2575 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2576 {
2577     uint8_t round;
2578     uint64_t hi_64, lo_64;
2579     int64_t res;
2580 
2581     if (a == INT64_MIN && b == INT64_MIN) {
2582         env->vxsat = 1;
2583         return INT64_MAX;
2584     }
2585 
2586     muls64(&lo_64, &hi_64, a, b);
2587     round = get_round(vxrm, lo_64, 63);
2588     /*
2589      * Cannot overflow, as there are always
2590      * 2 sign bits after multiply.
2591      */
2592     res = (hi_64 << 1) | (lo_64 >> 63);
2593     if (round) {
2594         if (res == INT64_MAX) {
2595             env->vxsat = 1;
2596         } else {
2597             res += 1;
2598         }
2599     }
2600     return res;
2601 }
2602 
/*
 * vsmul_vv_* and vsmul_vx_*: fractional multiply helpers for the four
 * element widths, built on vsmul8/16/32/64.
 */
RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
GEN_VEXT_VV_RM(vsmul_vv_b, 1)
GEN_VEXT_VV_RM(vsmul_vv_h, 2)
GEN_VEXT_VV_RM(vsmul_vv_w, 4)
GEN_VEXT_VV_RM(vsmul_vv_d, 8)

RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
GEN_VEXT_VX_RM(vsmul_vx_b, 1)
GEN_VEXT_VX_RM(vsmul_vx_h, 2)
GEN_VEXT_VX_RM(vsmul_vx_w, 4)
GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2620 
2621 /* Vector Single-Width Scaling Shift Instructions */
2622 static inline uint8_t
2623 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2624 {
2625     uint8_t round, shift = b & 0x7;
2626     uint8_t res;
2627 
2628     round = get_round(vxrm, a, shift);
2629     res = (a >> shift) + round;
2630     return res;
2631 }
2632 static inline uint16_t
2633 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2634 {
2635     uint8_t round, shift = b & 0xf;
2636 
2637     round = get_round(vxrm, a, shift);
2638     return (a >> shift) + round;
2639 }
2640 static inline uint32_t
2641 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2642 {
2643     uint8_t round, shift = b & 0x1f;
2644 
2645     round = get_round(vxrm, a, shift);
2646     return (a >> shift) + round;
2647 }
2648 static inline uint64_t
2649 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2650 {
2651     uint8_t round, shift = b & 0x3f;
2652 
2653     round = get_round(vxrm, a, shift);
2654     return (a >> shift) + round;
2655 }
/*
 * vssrl_vv_* and vssrl_vx_*: scaling logical shift right helpers for the
 * four element widths, built on vssrl8/16/32/64.
 */
RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
GEN_VEXT_VV_RM(vssrl_vv_b, 1)
GEN_VEXT_VV_RM(vssrl_vv_h, 2)
GEN_VEXT_VV_RM(vssrl_vv_w, 4)
GEN_VEXT_VV_RM(vssrl_vv_d, 8)

RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
GEN_VEXT_VX_RM(vssrl_vx_b, 1)
GEN_VEXT_VX_RM(vssrl_vx_h, 2)
GEN_VEXT_VX_RM(vssrl_vx_w, 4)
GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2673 
2674 static inline int8_t
2675 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2676 {
2677     uint8_t round, shift = b & 0x7;
2678 
2679     round = get_round(vxrm, a, shift);
2680     return (a >> shift) + round;
2681 }
2682 static inline int16_t
2683 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2684 {
2685     uint8_t round, shift = b & 0xf;
2686 
2687     round = get_round(vxrm, a, shift);
2688     return (a >> shift) + round;
2689 }
2690 static inline int32_t
2691 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2692 {
2693     uint8_t round, shift = b & 0x1f;
2694 
2695     round = get_round(vxrm, a, shift);
2696     return (a >> shift) + round;
2697 }
2698 static inline int64_t
2699 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2700 {
2701     uint8_t round, shift = b & 0x3f;
2702 
2703     round = get_round(vxrm, a, shift);
2704     return (a >> shift) + round;
2705 }
2706 
/*
 * vssra_vv_* and vssra_vx_*: scaling arithmetic shift right helpers for
 * the four element widths, built on vssra8/16/32/64.
 */
RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
GEN_VEXT_VV_RM(vssra_vv_b, 1)
GEN_VEXT_VV_RM(vssra_vv_h, 2)
GEN_VEXT_VV_RM(vssra_vv_w, 4)
GEN_VEXT_VV_RM(vssra_vv_d, 8)

RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
GEN_VEXT_VX_RM(vssra_vx_b, 1)
GEN_VEXT_VX_RM(vssra_vx_h, 2)
GEN_VEXT_VX_RM(vssra_vx_w, 4)
GEN_VEXT_VX_RM(vssra_vx_d, 8)
2724 
2725 /* Vector Narrowing Fixed-Point Clip Instructions */
2726 static inline int8_t
2727 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2728 {
2729     uint8_t round, shift = b & 0xf;
2730     int16_t res;
2731 
2732     round = get_round(vxrm, a, shift);
2733     res = (a >> shift) + round;
2734     if (res > INT8_MAX) {
2735         env->vxsat = 0x1;
2736         return INT8_MAX;
2737     } else if (res < INT8_MIN) {
2738         env->vxsat = 0x1;
2739         return INT8_MIN;
2740     } else {
2741         return res;
2742     }
2743 }
2744 
2745 static inline int16_t
2746 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2747 {
2748     uint8_t round, shift = b & 0x1f;
2749     int32_t res;
2750 
2751     round = get_round(vxrm, a, shift);
2752     res = (a >> shift) + round;
2753     if (res > INT16_MAX) {
2754         env->vxsat = 0x1;
2755         return INT16_MAX;
2756     } else if (res < INT16_MIN) {
2757         env->vxsat = 0x1;
2758         return INT16_MIN;
2759     } else {
2760         return res;
2761     }
2762 }
2763 
2764 static inline int32_t
2765 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2766 {
2767     uint8_t round, shift = b & 0x3f;
2768     int64_t res;
2769 
2770     round = get_round(vxrm, a, shift);
2771     res = (a >> shift) + round;
2772     if (res > INT32_MAX) {
2773         env->vxsat = 0x1;
2774         return INT32_MAX;
2775     } else if (res < INT32_MIN) {
2776         env->vxsat = 0x1;
2777         return INT32_MIN;
2778     } else {
2779         return res;
2780     }
2781 }
2782 
/*
 * vnclip_wv_* and vnclip_wx_*: signed narrowing clip helpers.  The vs2
 * operand is indexed with the next wider H macro (H2/H4/H8) than the
 * destination, matching the double-width first argument of vnclip8/16/32.
 */
RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
GEN_VEXT_VV_RM(vnclip_wv_b, 1)
GEN_VEXT_VV_RM(vnclip_wv_h, 2)
GEN_VEXT_VV_RM(vnclip_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
GEN_VEXT_VX_RM(vnclip_wx_b, 1)
GEN_VEXT_VX_RM(vnclip_wx_h, 2)
GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2796 
2797 static inline uint8_t
2798 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2799 {
2800     uint8_t round, shift = b & 0xf;
2801     uint16_t res;
2802 
2803     round = get_round(vxrm, a, shift);
2804     res = (a >> shift) + round;
2805     if (res > UINT8_MAX) {
2806         env->vxsat = 0x1;
2807         return UINT8_MAX;
2808     } else {
2809         return res;
2810     }
2811 }
2812 
2813 static inline uint16_t
2814 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2815 {
2816     uint8_t round, shift = b & 0x1f;
2817     uint32_t res;
2818 
2819     round = get_round(vxrm, a, shift);
2820     res = (a >> shift) + round;
2821     if (res > UINT16_MAX) {
2822         env->vxsat = 0x1;
2823         return UINT16_MAX;
2824     } else {
2825         return res;
2826     }
2827 }
2828 
2829 static inline uint32_t
2830 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2831 {
2832     uint8_t round, shift = b & 0x3f;
2833     uint64_t res;
2834 
2835     round = get_round(vxrm, a, shift);
2836     res = (a >> shift) + round;
2837     if (res > UINT32_MAX) {
2838         env->vxsat = 0x1;
2839         return UINT32_MAX;
2840     } else {
2841         return res;
2842     }
2843 }
2844 
/*
 * vnclipu_wv_* and vnclipu_wx_*: unsigned narrowing clip helpers.  The
 * vs2 operand is indexed with the next wider H macro (H2/H4/H8) than the
 * destination, matching the double-width first argument of vnclipu8/16/32.
 */
RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
GEN_VEXT_VV_RM(vnclipu_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2858 
/*
 * Vector Floating-Point Arithmetic Instructions
 */
2862 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * Define do_NAME(), the per-element worker for a vector-vector
 * floating-point operation.  Element i of vs1 and vs2 is loaded as T1/T2
 * (converted to TX1/TX2) and OP(s2, s1, &env->fp_status) is stored as TD
 * into element i of vd.  Note the operand order: vs2 is the first operand.
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
}
2871 
/*
 * Generate HELPER(NAME) for a vector-vector operation whose per-element
 * worker do_NAME needs env (e.g. for fp_status).
 *
 * The helper walks elements [env->vstart, env->vl).  With masking enabled
 * (vm == 0) and the mask bit in v0 clear, the element is passed to
 * vext_set_elems_1s() with the mask-agnostic flag instead of being
 * computed.  Afterwards vstart is reset to 0 and the tail elements
 * [vl, total_elems) are passed to vext_set_elems_1s() with the
 * tail-agnostic flag.  ESZ is the element size used for offsets.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                         \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, vs1, vs2, i, env);                  \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
2901 
/*
 * vfadd_vv_{h,w,d}: vector-vector floating-point add helpers built on
 * float16_add/float32_add/float64_add (element sizes 2, 4 and 8).
 */
RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
2908 
/*
 * Define do_NAME(), the per-element worker for a vector-scalar
 * floating-point operation.  Element i of vs2 is loaded as T2 (converted
 * to TX2), the 64-bit scalar s1 is cast through T1 to TX1, and
 * OP(s2, s1, &env->fp_status) is stored as TD into element i of vd.
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
}
2916 
/*
 * Generate HELPER(NAME) for a vector-scalar floating-point operation
 * whose per-element worker is do_NAME (see OPFVF2).
 *
 * The helper walks elements [env->vstart, env->vl).  With masking enabled
 * (vm == 0) and the mask bit in v0 clear, the element is passed to
 * vext_set_elems_1s() with the mask-agnostic flag instead of being
 * computed.  Afterwards vstart is reset to 0 and the tail elements
 * [vl, total_elems) are passed to vext_set_elems_1s() with the
 * tail-agnostic flag.  ESZ is the element size used for offsets.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                         \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, s1, vs2, i, env);                   \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
2946 
/*
 * vfadd_vf_{h,w,d}: vector-scalar floating-point add helpers built on
 * float16_add/float32_add/float64_add (element sizes 2, 4 and 8).
 */
RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
GEN_VEXT_VF(vfadd_vf_h, 2)
GEN_VEXT_VF(vfadd_vf_w, 4)
GEN_VEXT_VF(vfadd_vf_d, 8)
2953 
2954 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
2955 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
2956 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
2957 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
2958 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
2959 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
2960 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
2961 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
2962 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
2963 GEN_VEXT_VF(vfsub_vf_h, 2)
2964 GEN_VEXT_VF(vfsub_vf_w, 4)
2965 GEN_VEXT_VF(vfsub_vf_d, 8)
2966 
2967 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
2968 {
2969     return float16_sub(b, a, s);
2970 }
2971 
2972 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
2973 {
2974     return float32_sub(b, a, s);
2975 }
2976 
2977 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
2978 {
2979     return float64_sub(b, a, s);
2980 }
2981 
2982 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
2983 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
2984 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
2985 GEN_VEXT_VF(vfrsub_vf_h, 2)
2986 GEN_VEXT_VF(vfrsub_vf_w, 4)
2987 GEN_VEXT_VF(vfrsub_vf_d, 8)
2988 
2989 /* Vector Widening Floating-Point Add/Subtract Instructions */
2990 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
2991 {
2992     return float32_add(float16_to_float32(a, true, s),
2993                        float16_to_float32(b, true, s), s);
2994 }
2995 
2996 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
2997 {
2998     return float64_add(float32_to_float64(a, s),
2999                        float32_to_float64(b, s), s);
3000 
3001 }
3002 
3003 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3004 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3005 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3006 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3007 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3008 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3009 GEN_VEXT_VF(vfwadd_vf_h, 4)
3010 GEN_VEXT_VF(vfwadd_vf_w, 8)
3011 
3012 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3013 {
3014     return float32_sub(float16_to_float32(a, true, s),
3015                        float16_to_float32(b, true, s), s);
3016 }
3017 
3018 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3019 {
3020     return float64_sub(float32_to_float64(a, s),
3021                        float32_to_float64(b, s), s);
3022 
3023 }
3024 
3025 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3026 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3027 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3028 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3029 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3030 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3031 GEN_VEXT_VF(vfwsub_vf_h, 4)
3032 GEN_VEXT_VF(vfwsub_vf_w, 8)
3033 
3034 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3035 {
3036     return float32_add(a, float16_to_float32(b, true, s), s);
3037 }
3038 
3039 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3040 {
3041     return float64_add(a, float32_to_float64(b, s), s);
3042 }
3043 
3044 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3045 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3046 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3047 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3048 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3049 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3050 GEN_VEXT_VF(vfwadd_wf_h, 4)
3051 GEN_VEXT_VF(vfwadd_wf_w, 8)
3052 
3053 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3054 {
3055     return float32_sub(a, float16_to_float32(b, true, s), s);
3056 }
3057 
3058 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3059 {
3060     return float64_sub(a, float32_to_float64(b, s), s);
3061 }
3062 
3063 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3064 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3065 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3066 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3067 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3068 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3069 GEN_VEXT_VF(vfwsub_wf_h, 4)
3070 GEN_VEXT_VF(vfwsub_wf_w, 8)
3071 
3072 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3073 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3074 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3075 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3076 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3077 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3078 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3079 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3080 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3081 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3082 GEN_VEXT_VF(vfmul_vf_h, 2)
3083 GEN_VEXT_VF(vfmul_vf_w, 4)
3084 GEN_VEXT_VF(vfmul_vf_d, 8)
3085 
3086 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3087 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3088 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3089 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3090 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3091 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3092 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3093 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3094 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3095 GEN_VEXT_VF(vfdiv_vf_h, 2)
3096 GEN_VEXT_VF(vfdiv_vf_w, 4)
3097 GEN_VEXT_VF(vfdiv_vf_d, 8)
3098 
3099 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3100 {
3101     return float16_div(b, a, s);
3102 }
3103 
3104 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3105 {
3106     return float32_div(b, a, s);
3107 }
3108 
3109 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3110 {
3111     return float64_div(b, a, s);
3112 }
3113 
3114 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3115 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3116 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3117 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3118 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3119 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3120 
3121 /* Vector Widening Floating-Point Multiply */
3122 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3123 {
3124     return float32_mul(float16_to_float32(a, true, s),
3125                        float16_to_float32(b, true, s), s);
3126 }
3127 
3128 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3129 {
3130     return float64_mul(float32_to_float64(a, s),
3131                        float32_to_float64(b, s), s);
3132 
3133 }
3134 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3135 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3136 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3137 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3138 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3139 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3140 GEN_VEXT_VF(vfwmul_vf_h, 4)
3141 GEN_VEXT_VF(vfwmul_vf_w, 8)
3142 
3143 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3: define do_NAME(), the per-element body of a three-operand
 * vector-vector operation.  Element i of vs1, vs2 and the current
 * destination are read, and OP(vs2[i], vs1[i], vd[i], fp_status) is
 * stored back into vd[i].
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
                      CPURISCVState *env)                          \
{                                                                  \
    TD *dest = (TD *)vd + HD(i);                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *dest;                                                  \
                                                                   \
    *dest = OP(s2, s1, d, &env->fp_status);                        \
}
3153 
3154 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3155 {
3156     return float16_muladd(a, b, d, 0, s);
3157 }
3158 
3159 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3160 {
3161     return float32_muladd(a, b, d, 0, s);
3162 }
3163 
3164 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3165 {
3166     return float64_muladd(a, b, d, 0, s);
3167 }
3168 
3169 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3170 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3171 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3172 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3173 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3174 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3175 
/*
 * OPFVF3: define do_NAME(), the per-element body of a three-operand
 * vector-scalar operation.  The scalar s1 is narrowed to T1 and then
 * converted to TX1; OP(vs2[i], scalar, vd[i], fp_status) is stored
 * back into vd[i].
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
                      CPURISCVState *env)                         \
{                                                                 \
    TD *dest = (TD *)vd + HD(i);                                  \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
    TD d = *dest;                                                 \
                                                                  \
    *dest = OP(s2, (TX1)(T1)s1, d, &env->fp_status);              \
}
3184 
3185 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3186 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3187 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3188 GEN_VEXT_VF(vfmacc_vf_h, 2)
3189 GEN_VEXT_VF(vfmacc_vf_w, 4)
3190 GEN_VEXT_VF(vfmacc_vf_d, 8)
3191 
3192 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3193 {
3194     return float16_muladd(a, b, d, float_muladd_negate_c |
3195                                    float_muladd_negate_product, s);
3196 }
3197 
3198 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3199 {
3200     return float32_muladd(a, b, d, float_muladd_negate_c |
3201                                    float_muladd_negate_product, s);
3202 }
3203 
3204 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3205 {
3206     return float64_muladd(a, b, d, float_muladd_negate_c |
3207                                    float_muladd_negate_product, s);
3208 }
3209 
3210 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3211 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3212 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3213 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3214 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3215 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3216 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3217 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3218 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3219 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3220 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3221 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3222 
3223 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3224 {
3225     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3226 }
3227 
3228 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3229 {
3230     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3231 }
3232 
3233 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3234 {
3235     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3236 }
3237 
3238 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3239 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3240 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3241 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3242 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3243 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3244 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3245 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3246 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3247 GEN_VEXT_VF(vfmsac_vf_h, 2)
3248 GEN_VEXT_VF(vfmsac_vf_w, 4)
3249 GEN_VEXT_VF(vfmsac_vf_d, 8)
3250 
3251 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3252 {
3253     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3254 }
3255 
3256 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3257 {
3258     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3259 }
3260 
3261 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3262 {
3263     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3264 }
3265 
3266 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3267 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3268 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3269 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3270 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3271 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3272 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3273 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3274 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3275 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3276 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3277 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3278 
3279 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3280 {
3281     return float16_muladd(d, b, a, 0, s);
3282 }
3283 
3284 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3285 {
3286     return float32_muladd(d, b, a, 0, s);
3287 }
3288 
3289 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3290 {
3291     return float64_muladd(d, b, a, 0, s);
3292 }
3293 
3294 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3295 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3296 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3297 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3298 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3299 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3300 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3301 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3302 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3303 GEN_VEXT_VF(vfmadd_vf_h, 2)
3304 GEN_VEXT_VF(vfmadd_vf_w, 4)
3305 GEN_VEXT_VF(vfmadd_vf_d, 8)
3306 
3307 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3308 {
3309     return float16_muladd(d, b, a, float_muladd_negate_c |
3310                                    float_muladd_negate_product, s);
3311 }
3312 
3313 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3314 {
3315     return float32_muladd(d, b, a, float_muladd_negate_c |
3316                                    float_muladd_negate_product, s);
3317 }
3318 
3319 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3320 {
3321     return float64_muladd(d, b, a, float_muladd_negate_c |
3322                                    float_muladd_negate_product, s);
3323 }
3324 
3325 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3326 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3327 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3328 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3329 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3330 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3331 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3332 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3333 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3334 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3335 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3336 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3337 
3338 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3339 {
3340     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3341 }
3342 
3343 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3344 {
3345     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3346 }
3347 
3348 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3349 {
3350     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3351 }
3352 
3353 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3354 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3355 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3356 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3357 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3358 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3359 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3360 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3361 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3362 GEN_VEXT_VF(vfmsub_vf_h, 2)
3363 GEN_VEXT_VF(vfmsub_vf_w, 4)
3364 GEN_VEXT_VF(vfmsub_vf_d, 8)
3365 
3366 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3367 {
3368     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3369 }
3370 
3371 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3372 {
3373     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3374 }
3375 
3376 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3377 {
3378     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3379 }
3380 
3381 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3382 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3383 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3384 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3385 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3386 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3387 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3388 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3389 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3390 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3391 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3392 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3393 
3394 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3395 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3396 {
3397     return float32_muladd(float16_to_float32(a, true, s),
3398                           float16_to_float32(b, true, s), d, 0, s);
3399 }
3400 
3401 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3402 {
3403     return float64_muladd(float32_to_float64(a, s),
3404                           float32_to_float64(b, s), d, 0, s);
3405 }
3406 
3407 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3408 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3409 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3410 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3411 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3412 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3413 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3414 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3415 
3416 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3417 {
3418     return float32_muladd(bfloat16_to_float32(a, s),
3419                           bfloat16_to_float32(b, s), d, 0, s);
3420 }
3421 
3422 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3423 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3424 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3425 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3426 
3427 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3428 {
3429     return float32_muladd(float16_to_float32(a, true, s),
3430                           float16_to_float32(b, true, s), d,
3431                           float_muladd_negate_c | float_muladd_negate_product,
3432                           s);
3433 }
3434 
3435 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3436 {
3437     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3438                           d, float_muladd_negate_c |
3439                              float_muladd_negate_product, s);
3440 }
3441 
3442 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3443 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3444 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3445 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3446 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3447 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3448 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3449 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3450 
3451 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3452 {
3453     return float32_muladd(float16_to_float32(a, true, s),
3454                           float16_to_float32(b, true, s), d,
3455                           float_muladd_negate_c, s);
3456 }
3457 
3458 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3459 {
3460     return float64_muladd(float32_to_float64(a, s),
3461                           float32_to_float64(b, s), d,
3462                           float_muladd_negate_c, s);
3463 }
3464 
3465 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3466 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3467 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3468 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3469 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3470 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3471 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3472 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3473 
3474 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3475 {
3476     return float32_muladd(float16_to_float32(a, true, s),
3477                           float16_to_float32(b, true, s), d,
3478                           float_muladd_negate_product, s);
3479 }
3480 
3481 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3482 {
3483     return float64_muladd(float32_to_float64(a, s),
3484                           float32_to_float64(b, s), d,
3485                           float_muladd_negate_product, s);
3486 }
3487 
3488 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3489 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3490 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3491 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3492 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3493 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3494 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3495 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3496 
3497 /* Vector Floating-Point Square-Root Instruction */
/*
 * OPFVV1: define do_NAME(), the per-element body of a unary operation:
 * vd[i] = OP(vs2[i], fp_status).
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i,      \
                      CPURISCVState *env)              \
{                                                      \
    TD *dest = (TD *)vd + HD(i);                       \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
                                                       \
    *dest = OP(s2, &env->fp_status);                   \
}
3505 
/*
 * GEN_VEXT_V_ENV(NAME, ESZ): emit HELPER(NAME) for a unary operation.
 * ESZ is the destination element size in bytes.  Nothing is done when
 * vl is zero.  Otherwise elements [vstart, vl) are walked: active ones
 * are handed to do_NAME(), masked-off ones go through
 * vext_set_elems_1s() with vma, and the tail [vl, total_elems) goes
 * through it with vta.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)   \
{                                                      \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t elems_total =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t vma = vext_vma(desc);                     \
    uint32_t idx;                                      \
                                                       \
    VSTART_CHECK_EARLY_EXIT(env);                      \
                                                       \
    if (vl == 0) {                                     \
        return;                                        \
    }                                                  \
    for (idx = env->vstart; idx < vl; idx++) {         \
        if (vm || vext_elem_mask(v0, idx)) {           \
            /* active element: apply the operation */  \
            do_##NAME(vd, vs2, idx, env);              \
        } else {                                       \
            /* set masked-off elements to 1s */        \
            vext_set_elems_1s(vd, vma, idx * ESZ,      \
                              (idx + 1) * ESZ);        \
        }                                              \
    }                                                  \
    env->vstart = 0;                                   \
    /* set tail elements to 1s */                      \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      elems_total * ESZ);              \
}
3536 
3537 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3538 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3539 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3540 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3541 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3542 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3543 
3544 /*
3545  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3546  *
3547  * Adapted from riscv-v-spec recip.c:
3548  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3549  */
3550 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3551 {
3552     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3553     uint64_t exp = extract64(f, frac_size, exp_size);
3554     uint64_t frac = extract64(f, 0, frac_size);
3555 
3556     const uint8_t lookup_table[] = {
3557         52, 51, 50, 48, 47, 46, 44, 43,
3558         42, 41, 40, 39, 38, 36, 35, 34,
3559         33, 32, 31, 30, 30, 29, 28, 27,
3560         26, 25, 24, 23, 23, 22, 21, 20,
3561         19, 19, 18, 17, 16, 16, 15, 14,
3562         14, 13, 12, 12, 11, 10, 10, 9,
3563         9, 8, 7, 7, 6, 6, 5, 4,
3564         4, 3, 3, 2, 2, 1, 1, 0,
3565         127, 125, 123, 121, 119, 118, 116, 114,
3566         113, 111, 109, 108, 106, 105, 103, 102,
3567         100, 99, 97, 96, 95, 93, 92, 91,
3568         90, 88, 87, 86, 85, 84, 83, 82,
3569         80, 79, 78, 77, 76, 75, 74, 73,
3570         72, 71, 70, 70, 69, 68, 67, 66,
3571         65, 64, 63, 63, 62, 61, 60, 59,
3572         59, 58, 57, 56, 56, 55, 54, 53
3573     };
3574     const int precision = 7;
3575 
3576     if (exp == 0 && frac != 0) { /* subnormal */
3577         /* Normalize the subnormal. */
3578         while (extract64(frac, frac_size - 1, 1) == 0) {
3579             exp--;
3580             frac <<= 1;
3581         }
3582 
3583         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3584     }
3585 
3586     int idx = ((exp & 1) << (precision - 1)) |
3587               (frac >> (frac_size - precision + 1));
3588     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3589                         (frac_size - precision);
3590     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3591 
3592     uint64_t val = 0;
3593     val = deposit64(val, 0, frac_size, out_frac);
3594     val = deposit64(val, frac_size, exp_size, out_exp);
3595     val = deposit64(val, frac_size + exp_size, 1, sign);
3596     return val;
3597 }
3598 
3599 static float16 frsqrt7_h(float16 f, float_status *s)
3600 {
3601     int exp_size = 5, frac_size = 10;
3602     bool sign = float16_is_neg(f);
3603 
3604     /*
3605      * frsqrt7(sNaN) = canonical NaN
3606      * frsqrt7(-inf) = canonical NaN
3607      * frsqrt7(-normal) = canonical NaN
3608      * frsqrt7(-subnormal) = canonical NaN
3609      */
3610     if (float16_is_signaling_nan(f, s) ||
3611         (float16_is_infinity(f) && sign) ||
3612         (float16_is_normal(f) && sign) ||
3613         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3614         s->float_exception_flags |= float_flag_invalid;
3615         return float16_default_nan(s);
3616     }
3617 
3618     /* frsqrt7(qNaN) = canonical NaN */
3619     if (float16_is_quiet_nan(f, s)) {
3620         return float16_default_nan(s);
3621     }
3622 
3623     /* frsqrt7(+-0) = +-inf */
3624     if (float16_is_zero(f)) {
3625         s->float_exception_flags |= float_flag_divbyzero;
3626         return float16_set_sign(float16_infinity, sign);
3627     }
3628 
3629     /* frsqrt7(+inf) = +0 */
3630     if (float16_is_infinity(f) && !sign) {
3631         return float16_set_sign(float16_zero, sign);
3632     }
3633 
3634     /* +normal, +subnormal */
3635     uint64_t val = frsqrt7(f, exp_size, frac_size);
3636     return make_float16(val);
3637 }
3638 
3639 static float32 frsqrt7_s(float32 f, float_status *s)
3640 {
3641     int exp_size = 8, frac_size = 23;
3642     bool sign = float32_is_neg(f);
3643 
3644     /*
3645      * frsqrt7(sNaN) = canonical NaN
3646      * frsqrt7(-inf) = canonical NaN
3647      * frsqrt7(-normal) = canonical NaN
3648      * frsqrt7(-subnormal) = canonical NaN
3649      */
3650     if (float32_is_signaling_nan(f, s) ||
3651         (float32_is_infinity(f) && sign) ||
3652         (float32_is_normal(f) && sign) ||
3653         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3654         s->float_exception_flags |= float_flag_invalid;
3655         return float32_default_nan(s);
3656     }
3657 
3658     /* frsqrt7(qNaN) = canonical NaN */
3659     if (float32_is_quiet_nan(f, s)) {
3660         return float32_default_nan(s);
3661     }
3662 
3663     /* frsqrt7(+-0) = +-inf */
3664     if (float32_is_zero(f)) {
3665         s->float_exception_flags |= float_flag_divbyzero;
3666         return float32_set_sign(float32_infinity, sign);
3667     }
3668 
3669     /* frsqrt7(+inf) = +0 */
3670     if (float32_is_infinity(f) && !sign) {
3671         return float32_set_sign(float32_zero, sign);
3672     }
3673 
3674     /* +normal, +subnormal */
3675     uint64_t val = frsqrt7(f, exp_size, frac_size);
3676     return make_float32(val);
3677 }
3678 
3679 static float64 frsqrt7_d(float64 f, float_status *s)
3680 {
3681     int exp_size = 11, frac_size = 52;
3682     bool sign = float64_is_neg(f);
3683 
3684     /*
3685      * frsqrt7(sNaN) = canonical NaN
3686      * frsqrt7(-inf) = canonical NaN
3687      * frsqrt7(-normal) = canonical NaN
3688      * frsqrt7(-subnormal) = canonical NaN
3689      */
3690     if (float64_is_signaling_nan(f, s) ||
3691         (float64_is_infinity(f) && sign) ||
3692         (float64_is_normal(f) && sign) ||
3693         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3694         s->float_exception_flags |= float_flag_invalid;
3695         return float64_default_nan(s);
3696     }
3697 
3698     /* frsqrt7(qNaN) = canonical NaN */
3699     if (float64_is_quiet_nan(f, s)) {
3700         return float64_default_nan(s);
3701     }
3702 
3703     /* frsqrt7(+-0) = +-inf */
3704     if (float64_is_zero(f)) {
3705         s->float_exception_flags |= float_flag_divbyzero;
3706         return float64_set_sign(float64_infinity, sign);
3707     }
3708 
3709     /* frsqrt7(+inf) = +0 */
3710     if (float64_is_infinity(f) && !sign) {
3711         return float64_set_sign(float64_zero, sign);
3712     }
3713 
3714     /* +normal, +subnormal */
3715     uint64_t val = frsqrt7(f, exp_size, frac_size);
3716     return make_float64(val);
3717 }
3718 
3719 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3720 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3721 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3722 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3723 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3724 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3725 
3726 /*
3727  * Vector Floating-Point Reciprocal Estimate Instruction
3728  *
3729  * Adapted from riscv-v-spec recip.c:
3730  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3731  */
3732 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3733                       float_status *s)
3734 {
3735     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3736     uint64_t exp = extract64(f, frac_size, exp_size);
3737     uint64_t frac = extract64(f, 0, frac_size);
3738 
3739     const uint8_t lookup_table[] = {
3740         127, 125, 123, 121, 119, 117, 116, 114,
3741         112, 110, 109, 107, 105, 104, 102, 100,
3742         99, 97, 96, 94, 93, 91, 90, 88,
3743         87, 85, 84, 83, 81, 80, 79, 77,
3744         76, 75, 74, 72, 71, 70, 69, 68,
3745         66, 65, 64, 63, 62, 61, 60, 59,
3746         58, 57, 56, 55, 54, 53, 52, 51,
3747         50, 49, 48, 47, 46, 45, 44, 43,
3748         42, 41, 40, 40, 39, 38, 37, 36,
3749         35, 35, 34, 33, 32, 31, 31, 30,
3750         29, 28, 28, 27, 26, 25, 25, 24,
3751         23, 23, 22, 21, 21, 20, 19, 19,
3752         18, 17, 17, 16, 15, 15, 14, 14,
3753         13, 12, 12, 11, 11, 10, 9, 9,
3754         8, 8, 7, 7, 6, 5, 5, 4,
3755         4, 3, 3, 2, 2, 1, 1, 0
3756     };
3757     const int precision = 7;
3758 
3759     if (exp == 0 && frac != 0) { /* subnormal */
3760         /* Normalize the subnormal. */
3761         while (extract64(frac, frac_size - 1, 1) == 0) {
3762             exp--;
3763             frac <<= 1;
3764         }
3765 
3766         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3767 
3768         if (exp != 0 && exp != UINT64_MAX) {
3769             /*
3770              * Overflow to inf or max value of same sign,
3771              * depending on sign and rounding mode.
3772              */
3773             s->float_exception_flags |= (float_flag_inexact |
3774                                          float_flag_overflow);
3775 
3776             if ((s->float_rounding_mode == float_round_to_zero) ||
3777                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3778                 ((s->float_rounding_mode == float_round_up) && sign)) {
3779                 /* Return greatest/negative finite value. */
3780                 return (sign << (exp_size + frac_size)) |
3781                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3782             } else {
3783                 /* Return +-inf. */
3784                 return (sign << (exp_size + frac_size)) |
3785                        MAKE_64BIT_MASK(frac_size, exp_size);
3786             }
3787         }
3788     }
3789 
3790     int idx = frac >> (frac_size - precision);
3791     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3792                         (frac_size - precision);
3793     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3794 
3795     if (out_exp == 0 || out_exp == UINT64_MAX) {
3796         /*
3797          * The result is subnormal, but don't raise the underflow exception,
3798          * because there's no additional loss of precision.
3799          */
3800         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3801         if (out_exp == UINT64_MAX) {
3802             out_frac >>= 1;
3803             out_exp = 0;
3804         }
3805     }
3806 
3807     uint64_t val = 0;
3808     val = deposit64(val, 0, frac_size, out_frac);
3809     val = deposit64(val, frac_size, exp_size, out_exp);
3810     val = deposit64(val, frac_size + exp_size, 1, sign);
3811     return val;
3812 }
3813 
3814 static float16 frec7_h(float16 f, float_status *s)
3815 {
3816     int exp_size = 5, frac_size = 10;
3817     bool sign = float16_is_neg(f);
3818 
3819     /* frec7(+-inf) = +-0 */
3820     if (float16_is_infinity(f)) {
3821         return float16_set_sign(float16_zero, sign);
3822     }
3823 
3824     /* frec7(+-0) = +-inf */
3825     if (float16_is_zero(f)) {
3826         s->float_exception_flags |= float_flag_divbyzero;
3827         return float16_set_sign(float16_infinity, sign);
3828     }
3829 
3830     /* frec7(sNaN) = canonical NaN */
3831     if (float16_is_signaling_nan(f, s)) {
3832         s->float_exception_flags |= float_flag_invalid;
3833         return float16_default_nan(s);
3834     }
3835 
3836     /* frec7(qNaN) = canonical NaN */
3837     if (float16_is_quiet_nan(f, s)) {
3838         return float16_default_nan(s);
3839     }
3840 
3841     /* +-normal, +-subnormal */
3842     uint64_t val = frec7(f, exp_size, frac_size, s);
3843     return make_float16(val);
3844 }
3845 
3846 static float32 frec7_s(float32 f, float_status *s)
3847 {
3848     int exp_size = 8, frac_size = 23;
3849     bool sign = float32_is_neg(f);
3850 
3851     /* frec7(+-inf) = +-0 */
3852     if (float32_is_infinity(f)) {
3853         return float32_set_sign(float32_zero, sign);
3854     }
3855 
3856     /* frec7(+-0) = +-inf */
3857     if (float32_is_zero(f)) {
3858         s->float_exception_flags |= float_flag_divbyzero;
3859         return float32_set_sign(float32_infinity, sign);
3860     }
3861 
3862     /* frec7(sNaN) = canonical NaN */
3863     if (float32_is_signaling_nan(f, s)) {
3864         s->float_exception_flags |= float_flag_invalid;
3865         return float32_default_nan(s);
3866     }
3867 
3868     /* frec7(qNaN) = canonical NaN */
3869     if (float32_is_quiet_nan(f, s)) {
3870         return float32_default_nan(s);
3871     }
3872 
3873     /* +-normal, +-subnormal */
3874     uint64_t val = frec7(f, exp_size, frac_size, s);
3875     return make_float32(val);
3876 }
3877 
3878 static float64 frec7_d(float64 f, float_status *s)
3879 {
3880     int exp_size = 11, frac_size = 52;
3881     bool sign = float64_is_neg(f);
3882 
3883     /* frec7(+-inf) = +-0 */
3884     if (float64_is_infinity(f)) {
3885         return float64_set_sign(float64_zero, sign);
3886     }
3887 
3888     /* frec7(+-0) = +-inf */
3889     if (float64_is_zero(f)) {
3890         s->float_exception_flags |= float_flag_divbyzero;
3891         return float64_set_sign(float64_infinity, sign);
3892     }
3893 
3894     /* frec7(sNaN) = canonical NaN */
3895     if (float64_is_signaling_nan(f, s)) {
3896         s->float_exception_flags |= float_flag_invalid;
3897         return float64_default_nan(s);
3898     }
3899 
3900     /* frec7(qNaN) = canonical NaN */
3901     if (float64_is_quiet_nan(f, s)) {
3902         return float64_default_nan(s);
3903     }
3904 
3905     /* +-normal, +-subnormal */
3906     uint64_t val = frec7(f, exp_size, frac_size, s);
3907     return make_float64(val);
3908 }
3909 
3910 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3911 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3912 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3913 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
3914 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
3915 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
3916 
3917 /* Vector Floating-Point MIN/MAX Instructions */
3918 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3919 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3920 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3921 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
3922 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
3923 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
3924 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3925 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3926 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3927 GEN_VEXT_VF(vfmin_vf_h, 2)
3928 GEN_VEXT_VF(vfmin_vf_w, 4)
3929 GEN_VEXT_VF(vfmin_vf_d, 8)
3930 
3931 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3932 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3933 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3934 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
3935 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
3936 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
3937 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3938 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3939 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3940 GEN_VEXT_VF(vfmax_vf_h, 2)
3941 GEN_VEXT_VF(vfmax_vf_w, 4)
3942 GEN_VEXT_VF(vfmax_vf_d, 8)
3943 
3944 /* Vector Floating-Point Sign-Injection Instructions */
3945 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3946 {
3947     return deposit64(b, 0, 15, a);
3948 }
3949 
3950 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3951 {
3952     return deposit64(b, 0, 31, a);
3953 }
3954 
3955 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3956 {
3957     return deposit64(b, 0, 63, a);
3958 }
3959 
3960 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3961 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3962 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3963 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
3964 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
3965 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
3966 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3967 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3968 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3969 GEN_VEXT_VF(vfsgnj_vf_h, 2)
3970 GEN_VEXT_VF(vfsgnj_vf_w, 4)
3971 GEN_VEXT_VF(vfsgnj_vf_d, 8)
3972 
3973 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3974 {
3975     return deposit64(~b, 0, 15, a);
3976 }
3977 
3978 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3979 {
3980     return deposit64(~b, 0, 31, a);
3981 }
3982 
3983 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3984 {
3985     return deposit64(~b, 0, 63, a);
3986 }
3987 
3988 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
3989 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
3990 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
3991 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
3992 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
3993 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
3994 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
3995 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
3996 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
3997 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
3998 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
3999 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4000 
4001 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4002 {
4003     return deposit64(b ^ a, 0, 15, a);
4004 }
4005 
4006 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4007 {
4008     return deposit64(b ^ a, 0, 31, a);
4009 }
4010 
4011 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4012 {
4013     return deposit64(b ^ a, 0, 63, a);
4014 }
4015 
4016 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4017 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4018 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4019 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4020 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4021 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4022 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4023 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4024 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4025 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4026 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4027 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4028 
4029 /* Vector Floating-Point Compare Instructions */
4030 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4031 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4032                   CPURISCVState *env, uint32_t desc)          \
4033 {                                                             \
4034     uint32_t vm = vext_vm(desc);                              \
4035     uint32_t vl = env->vl;                                    \
4036     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4037     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4038     uint32_t vma = vext_vma(desc);                            \
4039     uint32_t i;                                               \
4040                                                               \
4041     VSTART_CHECK_EARLY_EXIT(env);                             \
4042                                                               \
4043     for (i = env->vstart; i < vl; i++) {                      \
4044         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4045         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4046         if (!vm && !vext_elem_mask(v0, i)) {                  \
4047             /* set masked-off elements to 1s */               \
4048             if (vma) {                                        \
4049                 vext_set_elem_mask(vd, i, 1);                 \
4050             }                                                 \
4051             continue;                                         \
4052         }                                                     \
4053         vext_set_elem_mask(vd, i,                             \
4054                            DO_OP(s2, s1, &env->fp_status));   \
4055     }                                                         \
4056     env->vstart = 0;                                          \
4057     /*
4058      * mask destination register are always tail-agnostic
4059      * set tail elements to 1s
4060      */                                                       \
4061     if (vta_all_1s) {                                         \
4062         for (; i < total_elems; i++) {                        \
4063             vext_set_elem_mask(vd, i, 1);                     \
4064         }                                                     \
4065     }                                                         \
4066 }
4067 
4068 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4069 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4070 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4071 
4072 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4073 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4074                   CPURISCVState *env, uint32_t desc)                \
4075 {                                                                   \
4076     uint32_t vm = vext_vm(desc);                                    \
4077     uint32_t vl = env->vl;                                          \
4078     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4079     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4080     uint32_t vma = vext_vma(desc);                                  \
4081     uint32_t i;                                                     \
4082                                                                     \
4083     VSTART_CHECK_EARLY_EXIT(env);                                   \
4084                                                                     \
4085     for (i = env->vstart; i < vl; i++) {                            \
4086         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4087         if (!vm && !vext_elem_mask(v0, i)) {                        \
4088             /* set masked-off elements to 1s */                     \
4089             if (vma) {                                              \
4090                 vext_set_elem_mask(vd, i, 1);                       \
4091             }                                                       \
4092             continue;                                               \
4093         }                                                           \
4094         vext_set_elem_mask(vd, i,                                   \
4095                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4096     }                                                               \
4097     env->vstart = 0;                                                \
4098     /*
4099      * mask destination register are always tail-agnostic
4100      * set tail elements to 1s
4101      */                                                             \
4102     if (vta_all_1s) {                                               \
4103         for (; i < total_elems; i++) {                              \
4104             vext_set_elem_mask(vd, i, 1);                           \
4105         }                                                           \
4106     }                                                               \
4107 }
4108 
4109 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4110 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4111 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4112 
4113 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4114 {
4115     FloatRelation compare = float16_compare_quiet(a, b, s);
4116     return compare != float_relation_equal;
4117 }
4118 
4119 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4120 {
4121     FloatRelation compare = float32_compare_quiet(a, b, s);
4122     return compare != float_relation_equal;
4123 }
4124 
4125 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4126 {
4127     FloatRelation compare = float64_compare_quiet(a, b, s);
4128     return compare != float_relation_equal;
4129 }
4130 
4131 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4132 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4133 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4134 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4135 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4136 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4137 
4138 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4139 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4140 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4141 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4142 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4143 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4144 
4145 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4146 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4147 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4148 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4149 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4150 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4151 
4152 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4153 {
4154     FloatRelation compare = float16_compare(a, b, s);
4155     return compare == float_relation_greater;
4156 }
4157 
4158 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4159 {
4160     FloatRelation compare = float32_compare(a, b, s);
4161     return compare == float_relation_greater;
4162 }
4163 
4164 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4165 {
4166     FloatRelation compare = float64_compare(a, b, s);
4167     return compare == float_relation_greater;
4168 }
4169 
4170 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4171 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4172 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4173 
4174 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4175 {
4176     FloatRelation compare = float16_compare(a, b, s);
4177     return compare == float_relation_greater ||
4178            compare == float_relation_equal;
4179 }
4180 
4181 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4182 {
4183     FloatRelation compare = float32_compare(a, b, s);
4184     return compare == float_relation_greater ||
4185            compare == float_relation_equal;
4186 }
4187 
4188 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4189 {
4190     FloatRelation compare = float64_compare(a, b, s);
4191     return compare == float_relation_greater ||
4192            compare == float_relation_equal;
4193 }
4194 
4195 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4196 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4197 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4198 
4199 /* Vector Floating-Point Classify Instruction */
4200 target_ulong fclass_h(uint64_t frs1)
4201 {
4202     float16 f = frs1;
4203     bool sign = float16_is_neg(f);
4204 
4205     if (float16_is_infinity(f)) {
4206         return sign ? 1 << 0 : 1 << 7;
4207     } else if (float16_is_zero(f)) {
4208         return sign ? 1 << 3 : 1 << 4;
4209     } else if (float16_is_zero_or_denormal(f)) {
4210         return sign ? 1 << 2 : 1 << 5;
4211     } else if (float16_is_any_nan(f)) {
4212         float_status s = { }; /* for snan_bit_is_one */
4213         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4214     } else {
4215         return sign ? 1 << 1 : 1 << 6;
4216     }
4217 }
4218 
4219 target_ulong fclass_s(uint64_t frs1)
4220 {
4221     float32 f = frs1;
4222     bool sign = float32_is_neg(f);
4223 
4224     if (float32_is_infinity(f)) {
4225         return sign ? 1 << 0 : 1 << 7;
4226     } else if (float32_is_zero(f)) {
4227         return sign ? 1 << 3 : 1 << 4;
4228     } else if (float32_is_zero_or_denormal(f)) {
4229         return sign ? 1 << 2 : 1 << 5;
4230     } else if (float32_is_any_nan(f)) {
4231         float_status s = { }; /* for snan_bit_is_one */
4232         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4233     } else {
4234         return sign ? 1 << 1 : 1 << 6;
4235     }
4236 }
4237 
4238 target_ulong fclass_d(uint64_t frs1)
4239 {
4240     float64 f = frs1;
4241     bool sign = float64_is_neg(f);
4242 
4243     if (float64_is_infinity(f)) {
4244         return sign ? 1 << 0 : 1 << 7;
4245     } else if (float64_is_zero(f)) {
4246         return sign ? 1 << 3 : 1 << 4;
4247     } else if (float64_is_zero_or_denormal(f)) {
4248         return sign ? 1 << 2 : 1 << 5;
4249     } else if (float64_is_any_nan(f)) {
4250         float_status s = { }; /* for snan_bit_is_one */
4251         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4252     } else {
4253         return sign ? 1 << 1 : 1 << 6;
4254     }
4255 }
4256 
4257 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4258 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4259 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4260 GEN_VEXT_V(vfclass_v_h, 2)
4261 GEN_VEXT_V(vfclass_v_w, 4)
4262 GEN_VEXT_V(vfclass_v_d, 8)
4263 
4264 /* Vector Floating-Point Merge Instruction */
4265 
4266 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4267 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4268                   CPURISCVState *env, uint32_t desc)          \
4269 {                                                             \
4270     uint32_t vm = vext_vm(desc);                              \
4271     uint32_t vl = env->vl;                                    \
4272     uint32_t esz = sizeof(ETYPE);                             \
4273     uint32_t total_elems =                                    \
4274         vext_get_total_elems(env, desc, esz);                 \
4275     uint32_t vta = vext_vta(desc);                            \
4276     uint32_t i;                                               \
4277                                                               \
4278     VSTART_CHECK_EARLY_EXIT(env);                             \
4279                                                               \
4280     for (i = env->vstart; i < vl; i++) {                      \
4281         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4282         *((ETYPE *)vd + H(i)) =                               \
4283             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4284     }                                                         \
4285     env->vstart = 0;                                          \
4286     /* set tail elements to 1s */                             \
4287     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4288 }
4289 
4290 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4291 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4292 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4293 
4294 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4295 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4296 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4297 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4298 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4299 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4300 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4301 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4302 
4303 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4304 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4305 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4306 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4307 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4308 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4309 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4310 
4311 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4312 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4313 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4314 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4315 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4316 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4317 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4318 
4319 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4320 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4321 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4322 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4323 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4324 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4325 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4326 
4327 /* Widening Floating-Point/Integer Type-Convert Instructions */
4328 /* (TD, T2, TX2) */
4329 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4330 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4331 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4332 /*
4333  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4334  */
4335 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4336 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4337 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4338 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4339 
4340 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4341 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4342 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4343 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4344 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4345 
4346 /*
4347  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4348  */
4349 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4350 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4351 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4352 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4353 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4354 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4355 
4356 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4357 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4358 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4359 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4360 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4361 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4362 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4363 
4364 /*
4365  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4366  */
4367 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4368 {
4369     return float16_to_float32(a, true, s);
4370 }
4371 
4372 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4373 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4374 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4375 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4376 
4377 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4378 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4379 
4380 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4381 /* (TD, T2, TX2) */
4382 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4383 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4384 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4385 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4386 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4387 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4388 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4389 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4390 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4391 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4392 
4393 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4394 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4395 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4396 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4397 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4398 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4399 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4400 
4401 /*
4402  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4403  */
4404 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4405 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4406 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4407 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4408 
4409 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4410 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4411 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4412 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4413 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4414 
4415 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4416 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4417 {
4418     return float32_to_float16(a, true, s);
4419 }
4420 
4421 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4422 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4423 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4424 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4425 
4426 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4427 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4428 
/*
 * Vector Reduction Operations
 */
/* Vector Single-Width Integer Reduction Instructions */

/*
 * Emit HELPER(NAME), an integer reduction:
 *
 *     vd[0] = OP(... OP(OP(vs1[0], vs2[vstart]), vs2[vstart + 1]) ...)
 *
 * folding in every element of vs2 below vl that is active (vm set, or mask
 * bit i of v0 set).  TD/HD give the accumulator and destination element
 * type and index fix-up, TS2/HS2 those of the vs2 elements; each vs2
 * element is cast to TD before OP is applied.
 *
 * When vl is 0 the helper returns without touching vd: the RVV
 * specification states that a reduction with vl=0 performs no operation
 * and does not update the destination register.  Otherwise everything in
 * vd past element 0 is handed to vext_set_elems_1s() as tail.
 */
#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t esz = sizeof(TD);                            \
    uint32_t vlenb = simd_maxsz(desc);                    \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t i;                                           \
    TD s1 =  *((TD *)vs1 + HD(0));                        \
                                                          \
    /* vl == 0: no operation, vd must stay unmodified */  \
    if (vl == 0) {                                        \
        env->vstart = 0;                                  \
        return;                                           \
    }                                                     \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            continue;                                     \
        }                                                 \
        s1 = OP(s1, (TD)s2);                              \
    }                                                     \
    *((TD *)vd + HD(0)) = s1;                             \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, esz, vlenb);               \
}
4458 
/*
 * Integer reduction helpers.  Each line instantiates GEN_VEXT_RED for one
 * element width (_b/_h/_w/_d = 8/16/32/64-bit); the signedness of the C
 * types selects signed or unsigned behaviour of the fold macro.
 */

/* vd[0] = sum(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)

/* vd[0] = maxu(vs1[0], vs2[*]) - unsigned element types */
GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)

/* vd[0] = max(vs1[0], vs2[*]) - signed element types */
GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)

/* vd[0] = minu(vs1[0], vs2[*]) - unsigned element types */
GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)

/* vd[0] = min(vs1[0], vs2[*]) - signed element types */
GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)

/* vd[0] = and(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)

/* vd[0] = or(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)

/* vd[0] = xor(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)

/* Vector Widening Integer Reduction Instructions */
/*
 * Signed sum reduction into a double-width accumulator: the int cast in
 * GEN_VEXT_RED sign-extends each vs2 element to the wider TD type.
 */
GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)

/*
 * Unsigned sum reduction into a double-width accumulator: the cast
 * zero-extends each vs2 element to the wider TD type.
 */
GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4517 
4518 /* Vector Single-Width Floating-Point Reduction Instructions */
/*
 * Generate the helper for one floating-point reduction instruction.
 *
 *   NAME    - helper name
 *   TD      - accumulator type (raw bit pattern); also the type read from
 *             vs1[0] and written to vd[0]
 *   TS2     - element type of the vs2 source vector (raw bit pattern)
 *   HD, HS2 - index-adjust macros applied to TD / TS2 element indices
 *   OP      - function called as OP(acc, elem, &env->fp_status)
 *
 * The accumulator starts as vs1[0]; every active vs2 element in
 * [vstart, vl) is folded in, in increasing element order; the result is
 * stored to vd[0].  The bytes after that first element, up to vlenb, are
 * handed to vext_set_elems_1s() together with the tail-agnostic flag.
 *
 * A reduction with vl == 0 must not update vd at all (neither the scalar
 * result nor the tail), so the helper leaves through the same
 * VSTART_CHECK_EARLY_EXIT() guard the other helpers in this file use
 * before anything is written.
 */
#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
                  void *vs2, CPURISCVState *env,           \
                  uint32_t desc)                           \
{                                                          \
    uint32_t vm = vext_vm(desc);                           \
    uint32_t vl = env->vl;                                 \
    uint32_t esz = sizeof(TD);                             \
    uint32_t vlenb = simd_maxsz(desc);                     \
    uint32_t vta = vext_vta(desc);                         \
    uint32_t i;                                            \
    TD s1 =  *((TD *)vs1 + HD(0));                         \
                                                           \
    /* nothing to reduce: vd must stay untouched */        \
    VSTART_CHECK_EARLY_EXIT(env);                          \
                                                           \
    for (i = env->vstart; i < vl; i++) {                   \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
        if (!vm && !vext_elem_mask(v0, i)) {               \
            /* inactive elements do not contribute */      \
            continue;                                      \
        }                                                  \
        s1 = OP(s1, (TD)s2, &env->fp_status);              \
    }                                                      \
    *((TD *)vd + HD(0)) = s1;                              \
    env->vstart = 0;                                       \
    /* set tail elements to 1s */                          \
    vext_set_elems_1s(vd, vta, esz, vlenb);                \
}
4544 
/*
 * Single-width floating-point reduction helpers, one per element width
 * (_h/_w/_d = 16/32/64-bit).  Elements are carried as unsigned integers of
 * the same width and handed to the softfloat routine named last.
 */

/*
 * Unordered sum.  Generated with the same float*_add fold as the ordered
 * sum below, so here it is evaluated in element order as well.
 */
GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Ordered sum */
GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Maximum value, folded with float*_maximum_number */
GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
              float16_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
              float32_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
              float64_maximum_number)

/* Minimum value, folded with float*_minimum_number */
GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
              float16_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
              float32_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
              float64_minimum_number)
4570 
4571 /* Vector Widening Floating-Point Add Instructions */
4572 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4573 {
4574     return float32_add(a, float16_to_float32(b, true, s), s);
4575 }
4576 
4577 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4578 {
4579     return float64_add(a, float32_to_float64(b, s), s);
4580 }
4581 
/* Vector Widening Floating-Point Reduction Instructions */
/*
 * Ordered/unordered reduce: 2*SEW = 2*SEW + sum(promote(SEW)).
 * The accumulator (TD) is twice as wide as the vs2 elements (TS2); the
 * fwadd16/fwadd32 step widens each element before adding it.  The ordered
 * and unordered variants are generated from the same step function.
 */
GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4588 
4589 /*
4590  * Vector Mask Operations
4591  */
4592 /* Vector Mask-Register Logical Instructions */
/*
 * Generate the helper for one mask-register logical instruction.
 *
 *   NAME - helper name
 *   OP   - two-operand macro, invoked as OP(vs2 bit, vs1 bit)
 *
 * For every position in [vstart, vl) the destination mask bit becomes
 * OP of the matching vs2 and vs1 bits.  v0 is accepted but never read.
 * When the descriptor requests it (vext_vta_all_1s), the bits from vl up
 * to vlenb * 8 are set to 1 afterwards.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t body_end = env->vl;                          \
    uint32_t reg_bits = riscv_cpu_cfg(env)->vlenb << 3;   \
    uint32_t tail_1s = vext_vta_all_1s(desc);             \
    uint32_t pos;                                         \
                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                         \
                                                          \
    pos = env->vstart;                                    \
    while (pos < body_end) {                              \
        int from_vs1 = vext_elem_mask(vs1, pos);          \
        int from_vs2 = vext_elem_mask(vs2, pos);          \
                                                          \
        vext_set_elem_mask(vd, pos,                       \
                           OP(from_vs2, from_vs1));       \
        pos++;                                            \
    }                                                     \
    env->vstart = 0;                                      \
    /* mask destination registers are always */           \
    /* tail-agnostic: set tail elements to 1s */          \
    while (tail_1s && pos < reg_bits) {                   \
        vext_set_elem_mask(vd, pos, 1);                   \
        pos++;                                            \
    }                                                     \
}
4622 
/*
 * Boolean operators for the mask-logical helpers.  Negation uses '!', so
 * the negated forms yield 0 or 1; the call sites pass single mask bits.
 * Every argument is parenthesized so that a compound expression passed as
 * N or M is evaluated as a unit instead of being re-associated by the
 * operator precedence inside the macro.
 */
#define DO_NAND(N, M)  (!((N) & (M)))
#define DO_ANDNOT(N, M)  ((N) & !(M))
#define DO_NOR(N, M)  (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)  (!((N) ^ (M)))
4628 
/*
 * Mask-register logical helpers.  GEN_VEXT_MASK_VV calls OP(vs2 bit,
 * vs1 bit), so the asymmetric forms negate the vs1 operand:
 * vmandn is vs2 & !vs1 and vmorn is vs2 | !vs1.
 */
GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4637 
4638 /* Vector count population in mask vcpop */
4639 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4640                              uint32_t desc)
4641 {
4642     target_ulong cnt = 0;
4643     uint32_t vm = vext_vm(desc);
4644     uint32_t vl = env->vl;
4645     int i;
4646 
4647     for (i = env->vstart; i < vl; i++) {
4648         if (vm || vext_elem_mask(v0, i)) {
4649             if (vext_elem_mask(vs2, i)) {
4650                 cnt++;
4651             }
4652         }
4653     }
4654     env->vstart = 0;
4655     return cnt;
4656 }
4657 
4658 /* vfirst find-first-set mask bit */
4659 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4660                               uint32_t desc)
4661 {
4662     uint32_t vm = vext_vm(desc);
4663     uint32_t vl = env->vl;
4664     int i;
4665 
4666     for (i = env->vstart; i < vl; i++) {
4667         if (vm || vext_elem_mask(v0, i)) {
4668             if (vext_elem_mask(vs2, i)) {
4669                 return i;
4670             }
4671         }
4672     }
4673     env->vstart = 0;
4674     return -1LL;
4675 }
4676 
/*
 * Selects which destination bits vmsetm() sets relative to the first set
 * bit found in the source mask.
 */
enum set_mask_type {
    ONLY_FIRST    = 1,  /* only the first set position gets a 1 */
    INCLUDE_FIRST = 2,  /* positions up to and including the first set one */
    BEFORE_FIRST  = 3,  /* positions strictly before the first set one */
};
4682 
4683 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4684                    uint32_t desc, enum set_mask_type type)
4685 {
4686     uint32_t vm = vext_vm(desc);
4687     uint32_t vl = env->vl;
4688     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4689     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4690     uint32_t vma = vext_vma(desc);
4691     int i;
4692     bool first_mask_bit = false;
4693 
4694     for (i = env->vstart; i < vl; i++) {
4695         if (!vm && !vext_elem_mask(v0, i)) {
4696             /* set masked-off elements to 1s */
4697             if (vma) {
4698                 vext_set_elem_mask(vd, i, 1);
4699             }
4700             continue;
4701         }
4702         /* write a zero to all following active elements */
4703         if (first_mask_bit) {
4704             vext_set_elem_mask(vd, i, 0);
4705             continue;
4706         }
4707         if (vext_elem_mask(vs2, i)) {
4708             first_mask_bit = true;
4709             if (type == BEFORE_FIRST) {
4710                 vext_set_elem_mask(vd, i, 0);
4711             } else {
4712                 vext_set_elem_mask(vd, i, 1);
4713             }
4714         } else {
4715             if (type == ONLY_FIRST) {
4716                 vext_set_elem_mask(vd, i, 0);
4717             } else {
4718                 vext_set_elem_mask(vd, i, 1);
4719             }
4720         }
4721     }
4722     env->vstart = 0;
4723     /*
4724      * mask destination register are always tail-agnostic
4725      * set tail elements to 1s
4726      */
4727     if (vta_all_1s) {
4728         for (; i < total_elems; i++) {
4729             vext_set_elem_mask(vd, i, 1);
4730         }
4731     }
4732 }
4733 
/*
 * vmsbf.m helper: runs vmsetm() in BEFORE_FIRST mode, i.e. active
 * positions before the first set vs2 bit get 1, that bit and everything
 * after it get 0.
 */
void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
}
4739 
/*
 * vmsif.m helper: runs vmsetm() in INCLUDE_FIRST mode, i.e. active
 * positions up to and including the first set vs2 bit get 1, everything
 * after it gets 0.
 */
void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
}
4745 
/*
 * vmsof.m helper: runs vmsetm() in ONLY_FIRST mode, i.e. only the first
 * set vs2 bit among the active positions gets 1, every other active
 * position gets 0.
 */
void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
}
4751 
/* Vector Iota Instruction */
/*
 * Generate the viota.m helper for one destination element type.
 *
 *   NAME  - helper name
 *   ETYPE - destination element type
 *   H     - index-adjust macro for ETYPE-sized elements
 *
 * Each active destination element i in [vstart, vl) receives the number
 * of set vs2 mask bits seen at the active positions before i.  Inactive
 * elements are handed to vext_set_elems_1s() with the mask-agnostic flag
 * and the tail [vl, total_elems) with the tail-agnostic flag.
 *
 * With no body elements (vl == 0) vd must not be updated at all, tail
 * included, so the helper leaves through the same
 * VSTART_CHECK_EARLY_EXIT() guard the other helpers in this file use.
 */
#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t sum = 0;                                                     \
    int i;                                                                \
                                                                          \
    /* no body elements: neither vd nor its tail may be written */        \
    VSTART_CHECK_EARLY_EXIT(env);                                         \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        /* store the running count first, then account for this bit */    \
        *((ETYPE *)vd + H(i)) = sum;                                      \
        if (vext_elem_mask(vs2, i)) {                                     \
            sum++;                                                        \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
4781 
/* viota.m helpers, one per destination element width (8/16/32/64-bit) */
GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4786 
/* Vector Element Index Instruction */
/*
 * Generate the vid.v helper for one destination element type.
 *
 *   NAME  - helper name
 *   ETYPE - destination element type
 *   H     - index-adjust macro for ETYPE-sized elements
 *
 * Every active element i in [vstart, vl) is written with its own index
 * (converted to ETYPE).  Inactive elements are handed to
 * vext_set_elems_1s() with the mask-agnostic flag and the tail
 * [vl, total_elems) with the tail-agnostic flag.
 */
#define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
{                                                                         \
    ETYPE *dst = vd;                                                      \
    uint32_t unmasked = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t tail_agn = vext_vta(desc);                                   \
    uint32_t mask_agn = vext_vma(desc);                                   \
    int idx;                                                              \
                                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                                         \
                                                                          \
    for (idx = env->vstart; idx < vl; idx++) {                            \
        if (unmasked || vext_elem_mask(v0, idx)) {                        \
            dst[H(idx)] = idx;                                            \
        } else {                                                          \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, mask_agn, idx * esz, (idx + 1) * esz);  \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, tail_agn, vl * esz, total_elems * esz);         \
}
4813 
/* vid.v helpers, one per destination element width (8/16/32/64-bit) */
GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4818 
4819 /*
4820  * Vector Permutation Instructions
4821  */
4822 
4823 /* Vector Slide Instructions */
/*
 * Generate the vslideup.vx helper for one element type.
 *
 *   NAME  - helper name
 *   ETYPE - element type of vd and vs2
 *   H     - index-adjust macro for ETYPE-sized elements
 *
 * Active destination elements i in [max(vstart, s1), vl) receive
 * vs2[i - s1]; elements below that starting point are not written.
 * Inactive elements in the range are handed to vext_set_elems_1s() with
 * the mask-agnostic flag and the tail [vl, total_elems) with the
 * tail-agnostic flag.
 */
#define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    ETYPE *dst = vd;                                                      \
    ETYPE *src = vs2;                                                     \
    uint32_t unmasked = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t tail_agn = vext_vta(desc);                                   \
    uint32_t mask_agn = vext_vma(desc);                                   \
    target_ulong idx;                                                     \
                                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                                         \
                                                                          \
    /* begin at the larger of the resume point and the slide offset */    \
    for (idx = MAX(env->vstart, s1); idx < vl; idx++) {                   \
        if (unmasked || vext_elem_mask(v0, idx)) {                        \
            dst[H(idx)] = src[H(idx - s1)];                               \
        } else {                                                          \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, mask_agn, idx * esz, (idx + 1) * esz);  \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, tail_agn, vl * esz, total_elems * esz);         \
}
4851 
/*
 * vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i]
 * One helper per element width (8/16/32/64-bit).
 */
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4857 
/*
 * Generate the vslidedown.vx helper for one element type.
 *
 *   NAME  - helper name
 *   ETYPE - element type of vd and vs2
 *   H     - index-adjust macro for ETYPE-sized elements
 *
 * The body [vstart, vl) is processed in two parts:
 *   - elements whose source index i + s1 is still below vlmax are copied
 *     from vs2[i + s1]; inactive ones are handed to vext_set_elems_1s()
 *     with the mask-agnostic flag;
 *   - the remaining elements up to vl are zeroed when active and left
 *     untouched when inactive.
 * The tail [vl, total_elems) is handed to vext_set_elems_1s() with the
 * tail-agnostic flag.
 */
#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
    ETYPE *dst = vd;                                                      \
    ETYPE *src = vs2;                                                     \
    uint32_t unmasked = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t tail_agn = vext_vta(desc);                                   \
    uint32_t mask_agn = vext_vma(desc);                                   \
    target_ulong in_range, copy_end, idx;                                 \
                                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                                         \
                                                                          \
    /* elements with a source below vlmax, clipped to vl */               \
    in_range = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                      \
    /* the zero-fill part never starts before the resume point */         \
    copy_end = MAX(in_range, env->vstart);                                \
                                                                          \
    for (idx = env->vstart; idx < copy_end; ++idx) {                      \
        if (unmasked || vext_elem_mask(v0, idx)) {                        \
            dst[H(idx)] = src[H(idx + s1)];                               \
        } else {                                                          \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, mask_agn, idx * esz, (idx + 1) * esz);  \
        }                                                                 \
    }                                                                     \
                                                                          \
    for (idx = copy_end; idx < vl; ++idx) {                               \
        if (!unmasked && !vext_elem_mask(v0, idx)) {                      \
            continue;                                                     \
        }                                                                 \
        /* source index is out of range: result is zero */                \
        dst[H(idx)] = 0;                                                  \
    }                                                                     \
                                                                          \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, tail_agn, vl * esz, total_elems * esz);         \
}
4894 
/*
 * vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1]
 * One helper per element width (8/16/32/64-bit).
 */
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4900 
/*
 * Generate the static worker vslide1up_<BITWIDTH>() shared by the integer
 * and floating-point slide1up helpers.
 *
 *   BITWIDTH - element width in bits; elements are uint<BITWIDTH>_t
 *   H        - index-adjust macro for elements of that width
 *
 * For every active element i in [vstart, vl): element 0 receives the
 * scalar s1 (converted to the element type), any other element receives
 * vs2[i - 1].  Inactive elements are handed to vext_set_elems_1s() with
 * the mask-agnostic flag and the tail [vl, total_elems) with the
 * tail-agnostic flag.
 */
#define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                                 void *vs2, CPURISCVState *env,             \
                                 uint32_t desc)                             \
{                                                                           \
    typedef uint##BITWIDTH##_t ETYPE;                                       \
    ETYPE *dst = vd;                                                        \
    ETYPE *src = vs2;                                                       \
    uint32_t unmasked = vext_vm(desc);                                      \
    uint32_t vl = env->vl;                                                  \
    uint32_t esz = sizeof(ETYPE);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
    uint32_t tail_agn = vext_vta(desc);                                     \
    uint32_t mask_agn = vext_vma(desc);                                     \
    uint32_t idx;                                                           \
                                                                            \
    VSTART_CHECK_EARLY_EXIT(env);                                           \
                                                                            \
    for (idx = env->vstart; idx < vl; idx++) {                              \
        if (!unmasked && !vext_elem_mask(v0, idx)) {                        \
            /* set masked-off elements to 1s */                             \
            vext_set_elems_1s(vd, mask_agn, idx * esz, (idx + 1) * esz);    \
            continue;                                                       \
        }                                                                   \
        /* the scalar enters at the bottom, the rest moves up by one */     \
        dst[H(idx)] = (idx == 0) ? s1 : src[H(idx - 1)];                    \
    }                                                                       \
    env->vstart = 0;                                                        \
    /* set tail elements to 1s */                                           \
    vext_set_elems_1s(vd, tail_agn, vl * esz, total_elems * esz);           \
}
4933 
/* static workers vslide1up_8/16/32/64(), one per element width */
GEN_VEXT_VSLIE1UP(8,  H1)
GEN_VEXT_VSLIE1UP(16, H2)
GEN_VEXT_VSLIE1UP(32, H4)
GEN_VEXT_VSLIE1UP(64, H8)
4938 
/*
 * Generate the vslide1up.vx helper for one element width: a thin wrapper
 * that forwards every argument to the static worker
 * vslide1up_<BITWIDTH>().  The scalar arrives as target_ulong and is
 * passed on to the worker's uint64_t parameter.
 */
#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
}

/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4951 
/*
 * Generate the static worker vslide1down_<BITWIDTH>() shared by the
 * integer and floating-point slide1down helpers.
 *
 *   BITWIDTH - element width in bits; elements are uint<BITWIDTH>_t
 *   H        - index-adjust macro for elements of that width
 *
 * For every active element i in [vstart, vl): element vl - 1 receives the
 * scalar s1 (converted to the element type), any other element receives
 * vs2[i + 1].  Inactive elements are handed to vext_set_elems_1s() with
 * the mask-agnostic flag and the tail [vl, total_elems) with the
 * tail-agnostic flag.
 */
#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                                   void *vs2, CPURISCVState *env,             \
                                   uint32_t desc)                             \
{                                                                             \
    typedef uint##BITWIDTH##_t ETYPE;                                         \
    ETYPE *dst = vd;                                                          \
    ETYPE *src = vs2;                                                         \
    uint32_t unmasked = vext_vm(desc);                                        \
    uint32_t vl = env->vl;                                                    \
    uint32_t esz = sizeof(ETYPE);                                             \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
    uint32_t tail_agn = vext_vta(desc);                                       \
    uint32_t mask_agn = vext_vma(desc);                                       \
    uint32_t idx;                                                             \
                                                                              \
    VSTART_CHECK_EARLY_EXIT(env);                                             \
                                                                              \
    for (idx = env->vstart; idx < vl; idx++) {                                \
        if (!unmasked && !vext_elem_mask(v0, idx)) {                          \
            /* set masked-off elements to 1s */                               \
            vext_set_elems_1s(vd, mask_agn, idx * esz, (idx + 1) * esz);      \
            continue;                                                         \
        }                                                                     \
        /* the scalar enters at the top, the rest moves down by one */        \
        dst[H(idx)] = (idx == vl - 1) ? s1 : src[H(idx + 1)];                 \
    }                                                                         \
    env->vstart = 0;                                                          \
    /* set tail elements to 1s */                                             \
    vext_set_elems_1s(vd, tail_agn, vl * esz, total_elems * esz);             \
}
4984 
/* static workers vslide1down_8/16/32/64(), one per element width */
GEN_VEXT_VSLIDE1DOWN(8,  H1)
GEN_VEXT_VSLIDE1DOWN(16, H2)
GEN_VEXT_VSLIDE1DOWN(32, H4)
GEN_VEXT_VSLIDE1DOWN(64, H8)
4989 
/*
 * Generate the vslide1down.vx helper for one element width: a thin
 * wrapper that forwards every argument to the static worker
 * vslide1down_<BITWIDTH>().  The scalar arrives as target_ulong and is
 * passed on to the worker's uint64_t parameter.
 */
#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
}

/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5002 
/* Vector Floating-Point Slide Instructions */
/*
 * Generate the vfslide1up.vf helper for one element width.  It reuses the
 * integer worker vslide1up_<BITWIDTH>(): the scalar is passed as a raw
 * uint64_t and no floating-point operation is performed here.
 */
#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
}

/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5015 
/*
 * Generate the vfslide1down.vf helper for one element width.  It reuses
 * the integer worker vslide1down_<BITWIDTH>(): the scalar is passed as a
 * raw uint64_t and no floating-point operation is performed here.
 */
#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
}

/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5027 
/* Vector Register Gather Instruction */
/*
 * Generate a vector-indexed register gather helper.
 *
 *   NAME     - helper name
 *   TS1      - element type of the index vector vs1
 *   TS2      - element type of the data vector vs2 and of vd
 *   HS1, HS2 - index-adjust macros for TS1 / TS2 sized elements
 *
 * Every active element i in [vstart, vl) reads its index from vs1[i] and
 * receives vs2[index], or 0 when the index is not below vlmax.  Inactive
 * elements are handed to vext_set_elems_1s() with the mask-agnostic flag
 * and the tail [vl, total_elems) with the tail-agnostic flag.
 */
#define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
    TS1 *indices = vs1;                                                   \
    TS2 *table = vs2;                                                     \
    TS2 *dst = vd;                                                        \
    uint32_t unmasked = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS2);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t tail_agn = vext_vta(desc);                                   \
    uint32_t mask_agn = vext_vma(desc);                                   \
    uint32_t n;                                                           \
                                                                          \
    VSTART_CHECK_EARLY_EXIT(env);                                         \
                                                                          \
    for (n = env->vstart; n < vl; n++) {                                  \
        uint64_t sel;                                                     \
                                                                          \
        if (!unmasked && !vext_elem_mask(v0, n)) {                        \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, mask_agn, n * esz, (n + 1) * esz);      \
            continue;                                                     \
        }                                                                 \
        sel = indices[HS1(n)];                                            \
        /* out-of-range selectors yield zero */                           \
        dst[HS2(n)] = (sel < vlmax) ? table[HS2(sel)] : 0;                \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, tail_agn, vl * esz, total_elems * esz);         \
}
5062 
5063 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5064 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5065 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5066 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5067 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5068 
5069 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5070 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5071 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5072 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5073 
5074 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5075 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5076                   CPURISCVState *env, uint32_t desc)                      \
5077 {                                                                         \
5078     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5079     uint32_t vm = vext_vm(desc);                                          \
5080     uint32_t vl = env->vl;                                                \
5081     uint32_t esz = sizeof(ETYPE);                                         \
5082     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5083     uint32_t vta = vext_vta(desc);                                        \
5084     uint32_t vma = vext_vma(desc);                                        \
5085     uint64_t index = s1;                                                  \
5086     uint32_t i;                                                           \
5087                                                                           \
5088     VSTART_CHECK_EARLY_EXIT(env);                                         \
5089                                                                           \
5090     for (i = env->vstart; i < vl; i++) {                                  \
5091         if (!vm && !vext_elem_mask(v0, i)) {                              \
5092             /* set masked-off elements to 1s */                           \
5093             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5094             continue;                                                     \
5095         }                                                                 \
5096         if (index >= vlmax) {                                             \
5097             *((ETYPE *)vd + H(i)) = 0;                                    \
5098         } else {                                                          \
5099             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5100         }                                                                 \
5101     }                                                                     \
5102     env->vstart = 0;                                                      \
5103     /* set tail elements to 1s */                                         \
5104     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5105 }
5106 
5107 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5108 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5109 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5110 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5111 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5112 
5113 /* Vector Compress Instruction */
5114 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5115 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5116                   CPURISCVState *env, uint32_t desc)                      \
5117 {                                                                         \
5118     uint32_t vl = env->vl;                                                \
5119     uint32_t esz = sizeof(ETYPE);                                         \
5120     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5121     uint32_t vta = vext_vta(desc);                                        \
5122     uint32_t num = 0, i;                                                  \
5123                                                                           \
5124     for (i = env->vstart; i < vl; i++) {                                  \
5125         if (!vext_elem_mask(vs1, i)) {                                    \
5126             continue;                                                     \
5127         }                                                                 \
5128         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5129         num++;                                                            \
5130     }                                                                     \
5131     env->vstart = 0;                                                      \
5132     /* set tail elements to 1s */                                         \
5133     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5134 }
5135 
5136 /* Compress into vd elements of vs2 where vs1 is enabled */
5137 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5138 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5139 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5140 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5141 
5142 /* Vector Whole Register Move */
5143 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5144 {
5145     /* EEW = SEW */
5146     uint32_t maxsz = simd_maxsz(desc);
5147     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5148     uint32_t startb = env->vstart * sewb;
5149     uint32_t i = startb;
5150 
5151     if (startb >= maxsz) {
5152         env->vstart = 0;
5153         return;
5154     }
5155 
5156     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5157         uint32_t j = ROUND_UP(i, 8);
5158         memcpy((uint8_t *)vd + H1(j - 1),
5159                (uint8_t *)vs2 + H1(j - 1),
5160                j - i);
5161         i = j;
5162     }
5163 
5164     memcpy((uint8_t *)vd + H1(i),
5165            (uint8_t *)vs2 + H1(i),
5166            maxsz - i);
5167 
5168     env->vstart = 0;
5169 }
5170 
5171 /* Vector Integer Extension */
5172 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5173 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5174                   CPURISCVState *env, uint32_t desc)             \
5175 {                                                                \
5176     uint32_t vl = env->vl;                                       \
5177     uint32_t vm = vext_vm(desc);                                 \
5178     uint32_t esz = sizeof(ETYPE);                                \
5179     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5180     uint32_t vta = vext_vta(desc);                               \
5181     uint32_t vma = vext_vma(desc);                               \
5182     uint32_t i;                                                  \
5183                                                                  \
5184     VSTART_CHECK_EARLY_EXIT(env);                                \
5185                                                                  \
5186     for (i = env->vstart; i < vl; i++) {                         \
5187         if (!vm && !vext_elem_mask(v0, i)) {                     \
5188             /* set masked-off elements to 1s */                  \
5189             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5190             continue;                                            \
5191         }                                                        \
5192         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5193     }                                                            \
5194     env->vstart = 0;                                             \
5195     /* set tail elements to 1s */                                \
5196     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5197 }
5198 
5199 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5200 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5201 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5202 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5203 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5204 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5205 
5206 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5207 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5208 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5209 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5210 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5211 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5212