xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 98f40dd2)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg-gvec-desc.h"
29 #include "internals.h"
30 #include "vector_internals.h"
31 #include <math.h>
32 
33 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
34                             target_ulong s2)
35 {
36     int vlmax, vl;
37     RISCVCPU *cpu = env_archcpu(env);
38     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
39     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
40     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
41     int xlen = riscv_cpu_xlen(env);
42     bool vill = (s2 >> (xlen - 1)) & 0x1;
43     target_ulong reserved = s2 &
44                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
45                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
46 
47     if (lmul & 4) {
48         /* Fractional LMUL - check LMUL * VLEN >= SEW */
49         if (lmul == 4 ||
50             cpu->cfg.vlen >> (8 - lmul) < sew) {
51             vill = true;
52         }
53     }
54 
55     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
56         /* only set vill bit. */
57         env->vill = 1;
58         env->vtype = 0;
59         env->vl = 0;
60         env->vstart = 0;
61         return 0;
62     }
63 
64     vlmax = vext_get_vlmax(cpu, s2);
65     if (s1 <= vlmax) {
66         vl = s1;
67     } else {
68         vl = vlmax;
69     }
70     env->vl = vl;
71     env->vtype = s2;
72     env->vstart = 0;
73     env->vill = 0;
74     return vl;
75 }
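
/*
 * Worked example for the fractional-LMUL check above (illustrative values,
 * assuming VLEN = 128): with vsew = 3 (SEW = 64) and vlmul = 5 (LMUL = 1/8),
 * 128 >> (8 - 5) = 16 < 64, so vill is set and vl/vtype are cleared.  With
 * vlmul = 7 (LMUL = 1/2) the fractional-LMUL check passes, since
 * 128 >> 1 = 64 >= 64, i.e. LMUL * VLEN >= SEW holds.
 */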
76 
77 /*
78  * Get the maximum number of elements that can be operated on.
79  *
80  * log2_esz: log2 of element size in bytes.
81  */
82 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
83 {
84     /*
85      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
86      * so vlen in bytes (vlenb) is encoded as maxsz.
87      */
88     uint32_t vlenb = simd_maxsz(desc);
89 
90     /* Return VLMAX */
91     int scale = vext_lmul(desc) - log2_esz;
92     return scale < 0 ? vlenb >> -scale : vlenb << scale;
93 }
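
/*
 * Illustration (assuming VLEN = 128, i.e. vlenb = 16): for 32-bit elements
 * (log2_esz = 2) and LMUL = 1 (vext_lmul() == 0), scale = -2 and
 * VLMAX = 16 >> 2 = 4.  For LMUL = 2 the result is 8, and for LMUL = 1/2
 * (vext_lmul() == -1) it is 2, matching VLMAX = LMUL * VLEN / SEW.
 */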
94 
95 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
96 {
97     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
98 }
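
/*
 * adjust_addr() applies the currently active pointer-masking state
 * (cur_pmmask/cur_pmbase) to a virtual address before it is used for an
 * access, so every element address below goes through it.
 */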
99 
100 /*
101  * This function checks watchpoints before the actual load operation.
102  *
103  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
104  * In user mode, there is no watchpoint support now.
105  *
106  * It will trigger an exception if there is no mapping in the TLB
107  * and the page table walk can't fill the TLB entry. Then the guest
108  * software can return here after processing the exception, or never return.
109  */
110 static void probe_pages(CPURISCVState *env, target_ulong addr,
111                         target_ulong len, uintptr_t ra,
112                         MMUAccessType access_type)
113 {
114     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
115     target_ulong curlen = MIN(pagelen, len);
116 
117     probe_access(env, adjust_addr(env, addr), curlen, access_type,
118                  cpu_mmu_index(env, false), ra);
119     if (len > curlen) {
120         addr += curlen;
121         curlen = len - curlen;
122         probe_access(env, adjust_addr(env, addr), curlen, access_type,
123                      cpu_mmu_index(env, false), ra);
124     }
125 }
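
/*
 * Example of the page split above (assuming 4 KiB target pages): probing
 * 16 bytes at addr = 0x1ffa probes 6 bytes on the first page and then the
 * remaining 10 bytes starting at 0x2000, so an unmapped page on either side
 * of the boundary faults before any element is transferred.
 */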
126 
127 static inline void vext_set_elem_mask(void *v0, int index,
128                                       uint8_t value)
129 {
130     int idx = index / 64;
131     int pos = index % 64;
132     uint64_t old = ((uint64_t *)v0)[idx];
133     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
134 }
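
/*
 * Mask registers hold one bit per element, packed into host 64-bit words:
 * e.g. element 70 lives in word 1, bit 6 of v0, which deposit64() updates
 * without disturbing the neighboring mask bits.
 */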
135 
136 /* element operations for load and store */
137 typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
138                                uint32_t idx, void *vd, uintptr_t retaddr);
139 
140 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
141 static void NAME(CPURISCVState *env, abi_ptr addr,         \
142                  uint32_t idx, void *vd, uintptr_t retaddr)\
143 {                                                          \
144     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
145     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
146 }
147 
148 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
149 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
150 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
151 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
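
/*
 * For reference, GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl) above expands
 * roughly to:
 *
 *   static void lde_w(CPURISCVState *env, abi_ptr addr,
 *                     uint32_t idx, void *vd, uintptr_t retaddr)
 *   {
 *       int32_t *cur = ((int32_t *)vd + H4(idx));
 *       *cur = cpu_ldl_data_ra(env, addr, retaddr);
 *   }
 *
 * i.e. one guest load per element, with H4() fixing up the element index
 * on big-endian hosts.
 */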
152 
153 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
154 static void NAME(CPURISCVState *env, abi_ptr addr,         \
155                  uint32_t idx, void *vd, uintptr_t retaddr)\
156 {                                                          \
157     ETYPE data = *((ETYPE *)vd + H(idx));                  \
158     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
159 }
160 
161 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
162 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
163 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
164 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
165 
166 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
167                                    uint32_t desc, uint32_t nf,
168                                    uint32_t esz, uint32_t max_elems)
169 {
170     uint32_t vta = vext_vta(desc);
171     int k;
172 
173     if (vta == 0) {
174         return;
175     }
176 
177     for (k = 0; k < nf; ++k) {
178         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
179                           (k * max_elems + max_elems) * esz);
180     }
181 }
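
/*
 * Example (illustrative values): with nf = 2, max_elems = 4, esz = 4 and
 * vl = 3, a tail-agnostic setting fills bytes [12, 16) of the first field
 * and bytes [28, 32) of the second field with all-ones, leaving the body
 * elements untouched.
 */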
182 
183 /*
184  * stride: access vector elements from strided memory
185  */
186 static void
187 vext_ldst_stride(void *vd, void *v0, target_ulong base,
188                  target_ulong stride, CPURISCVState *env,
189                  uint32_t desc, uint32_t vm,
190                  vext_ldst_elem_fn *ldst_elem,
191                  uint32_t log2_esz, uintptr_t ra)
192 {
193     uint32_t i, k;
194     uint32_t nf = vext_nf(desc);
195     uint32_t max_elems = vext_max_elems(desc, log2_esz);
196     uint32_t esz = 1 << log2_esz;
197     uint32_t vma = vext_vma(desc);
198 
199     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
200         k = 0;
201         while (k < nf) {
202             if (!vm && !vext_elem_mask(v0, i)) {
203                 /* set masked-off elements to 1s */
204                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
205                                   (i + k * max_elems + 1) * esz);
206                 k++;
207                 continue;
208             }
209             target_ulong addr = base + stride * i + (k << log2_esz);
210             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
211             k++;
212         }
213     }
214     env->vstart = 0;
215 
216     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
217 }
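
/*
 * For a strided segment access, field k of element i is transferred at
 * base + stride * i + k * esz.  E.g. a two-field 32-bit segment load with
 * stride = 16 reads element i's fields from base + 16 * i and
 * base + 16 * i + 4 (illustrative values).
 */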
218 
219 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
220 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
221                   target_ulong stride, CPURISCVState *env,              \
222                   uint32_t desc)                                        \
223 {                                                                       \
224     uint32_t vm = vext_vm(desc);                                        \
225     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
226                      ctzl(sizeof(ETYPE)), GETPC());                     \
227 }
228 
229 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
230 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
231 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
232 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
233 
234 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
235 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
236                   target_ulong stride, CPURISCVState *env,              \
237                   uint32_t desc)                                        \
238 {                                                                       \
239     uint32_t vm = vext_vm(desc);                                        \
240     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
241                      ctzl(sizeof(ETYPE)), GETPC());                     \
242 }
243 
244 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
245 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
246 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
247 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
248 
249 /*
250  * unit-stride: access elements stored contiguously in memory
251  */
252 
253 /* unmasked unit-stride load and store operation */
254 static void
255 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
256              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
257              uintptr_t ra)
258 {
259     uint32_t i, k;
260     uint32_t nf = vext_nf(desc);
261     uint32_t max_elems = vext_max_elems(desc, log2_esz);
262     uint32_t esz = 1 << log2_esz;
263 
264     /* load/store bytes from/to guest memory */
265     for (i = env->vstart; i < evl; i++, env->vstart++) {
266         k = 0;
267         while (k < nf) {
268             target_ulong addr = base + ((i * nf + k) << log2_esz);
269             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
270             k++;
271         }
272     }
273     env->vstart = 0;
274 
275     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
276 }
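
/*
 * For unit-stride accesses, field k of element i is transferred at
 * base + (i * nf + k) * esz, so e.g. a three-field 32-bit segment load
 * (nf = 3) de-interleaves consecutive 12-byte records into three register
 * groups (illustrative layout).
 */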
277 
278 /*
279  * A masked unit-stride load or store operation is a special case of a
280  * strided operation with stride = NF * sizeof(ETYPE).
281  */
282 
283 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
284 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
285                          CPURISCVState *env, uint32_t desc)             \
286 {                                                                       \
287     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
288     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
289                      ctzl(sizeof(ETYPE)), GETPC());                     \
290 }                                                                       \
291                                                                         \
292 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
293                   CPURISCVState *env, uint32_t desc)                    \
294 {                                                                       \
295     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
296                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
297 }
298 
299 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
300 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
301 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
302 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
303 
304 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
305 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
306                          CPURISCVState *env, uint32_t desc)              \
307 {                                                                        \
308     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
309     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
310                      ctzl(sizeof(ETYPE)), GETPC());                      \
311 }                                                                        \
312                                                                          \
313 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
314                   CPURISCVState *env, uint32_t desc)                     \
315 {                                                                        \
316     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
317                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
318 }
319 
320 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
321 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
322 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
323 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
324 
325 /*
326  * unit stride mask load and store, EEW = 1
327  */
328 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
329                     CPURISCVState *env, uint32_t desc)
330 {
331     /* evl = ceil(vl/8) */
332     uint8_t evl = (env->vl + 7) >> 3;
333     vext_ldst_us(vd, base, env, desc, lde_b,
334                  0, evl, GETPC());
335 }
336 
337 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
338                     CPURISCVState *env, uint32_t desc)
339 {
340     /* evl = ceil(vl/8) */
341     uint8_t evl = (env->vl + 7) >> 3;
342     vext_ldst_us(vd, base, env, desc, ste_b,
343                  0, evl, GETPC());
344 }
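
/*
 * The mask occupies one bit per element, so evl above is the number of
 * mask bytes to transfer: e.g. vl = 17 gives evl = (17 + 7) >> 3 = 3 and
 * three bytes are loaded or stored.
 */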
345 
346 /*
347  * index: access vector elements from indexed memory
348  */
349 typedef target_ulong vext_get_index_addr(target_ulong base,
350         uint32_t idx, void *vs2);
351 
352 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
353 static target_ulong NAME(target_ulong base,            \
354                          uint32_t idx, void *vs2)      \
355 {                                                      \
356     return (base + *((ETYPE *)vs2 + H(idx)));          \
357 }
358 
359 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
360 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
361 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
362 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
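
/*
 * The index elements are read as unsigned values and added to the base
 * address: e.g. with 16-bit indices (idx_h), vs2[i] = 0x40 and
 * base = 0x1000, element i is accessed at 0x1040 (illustrative values).
 */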
363 
364 static inline void
365 vext_ldst_index(void *vd, void *v0, target_ulong base,
366                 void *vs2, CPURISCVState *env, uint32_t desc,
367                 vext_get_index_addr get_index_addr,
368                 vext_ldst_elem_fn *ldst_elem,
369                 uint32_t log2_esz, uintptr_t ra)
370 {
371     uint32_t i, k;
372     uint32_t nf = vext_nf(desc);
373     uint32_t vm = vext_vm(desc);
374     uint32_t max_elems = vext_max_elems(desc, log2_esz);
375     uint32_t esz = 1 << log2_esz;
376     uint32_t vma = vext_vma(desc);
377 
378     /* load/store bytes from/to guest memory */
379     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
380         k = 0;
381         while (k < nf) {
382             if (!vm && !vext_elem_mask(v0, i)) {
383                 /* set masked-off elements to 1s */
384                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
385                                   (i + k * max_elems + 1) * esz);
386                 k++;
387                 continue;
388             }
389             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
390             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
391             k++;
392         }
393     }
394     env->vstart = 0;
395 
396     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
397 }
398 
399 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
400 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
401                   void *vs2, CPURISCVState *env, uint32_t desc)            \
402 {                                                                          \
403     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
404                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
405 }
406 
407 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
408 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
409 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
410 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
411 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
412 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
413 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
414 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
415 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
416 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
417 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
418 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
419 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
420 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
421 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
422 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
423 
424 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
425 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
426                   void *vs2, CPURISCVState *env, uint32_t desc)  \
427 {                                                                \
428     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
429                     STORE_FN, ctzl(sizeof(ETYPE)),               \
430                     GETPC());                                    \
431 }
432 
433 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
434 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
435 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
436 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
437 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
438 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
439 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
440 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
441 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
442 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
443 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
444 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
445 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
446 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
447 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
448 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
449 
450 /*
451  * unit-stride fault-only-first load instructions
452  */
453 static inline void
454 vext_ldff(void *vd, void *v0, target_ulong base,
455           CPURISCVState *env, uint32_t desc,
456           vext_ldst_elem_fn *ldst_elem,
457           uint32_t log2_esz, uintptr_t ra)
458 {
459     void *host;
460     uint32_t i, k, vl = 0;
461     uint32_t nf = vext_nf(desc);
462     uint32_t vm = vext_vm(desc);
463     uint32_t max_elems = vext_max_elems(desc, log2_esz);
464     uint32_t esz = 1 << log2_esz;
465     uint32_t vma = vext_vma(desc);
466     target_ulong addr, offset, remain;
467 
468     /* probe every access */
469     for (i = env->vstart; i < env->vl; i++) {
470         if (!vm && !vext_elem_mask(v0, i)) {
471             continue;
472         }
473         addr = adjust_addr(env, base + i * (nf << log2_esz));
474         if (i == 0) {
475             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
476         } else {
477             /* if it triggers an exception, no need to check watchpoint */
478             remain = nf << log2_esz;
479             while (remain > 0) {
480                 offset = -(addr | TARGET_PAGE_MASK);
481                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
482                                          cpu_mmu_index(env, false));
483                 if (host) {
484 #ifdef CONFIG_USER_ONLY
485                     if (!page_check_range(addr, offset, PAGE_READ)) {
486                         vl = i;
487                         goto ProbeSuccess;
488                     }
489 #else
490                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
491 #endif
492                 } else {
493                     vl = i;
494                     goto ProbeSuccess;
495                 }
496                 if (remain <= offset) {
497                     break;
498                 }
499                 remain -= offset;
500                 addr = adjust_addr(env, addr + offset);
501             }
502         }
503     }
504 ProbeSuccess:
505     /* load bytes from guest memory */
506     if (vl != 0) {
507         env->vl = vl;
508     }
509     for (i = env->vstart; i < env->vl; i++) {
510         k = 0;
511         while (k < nf) {
512             if (!vm && !vext_elem_mask(v0, i)) {
513                 /* set masked-off elements to 1s */
514                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
515                                   (i + k * max_elems + 1) * esz);
516                 k++;
517                 continue;
518             }
519             target_ulong addr = base + ((i * nf + k) << log2_esz);
520             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
521             k++;
522         }
523     }
524     env->vstart = 0;
525 
526     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
527 }
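
/*
 * Fault-only-first semantics, as implemented above: element 0 is probed
 * normally and may trap, but if a later active element's page turns out to
 * be unmapped (or fails the user-mode page check), vl is truncated to that
 * element index instead of raising an exception, and only the surviving
 * elements are then loaded.
 */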
528 
529 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
530 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
531                   CPURISCVState *env, uint32_t desc)      \
532 {                                                         \
533     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
534               ctzl(sizeof(ETYPE)), GETPC());              \
535 }
536 
537 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
538 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
539 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
540 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
541 
542 #define DO_SWAP(N, M) (M)
543 #define DO_AND(N, M)  (N & M)
544 #define DO_XOR(N, M)  (N ^ M)
545 #define DO_OR(N, M)   (N | M)
546 #define DO_ADD(N, M)  (N + M)
547 
548 /* Signed min/max */
549 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
550 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
551 
552 /*
553  * load and store whole register instructions
554  */
555 static void
556 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
557                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
558 {
559     uint32_t i, k, off, pos;
560     uint32_t nf = vext_nf(desc);
561     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
562     uint32_t max_elems = vlenb >> log2_esz;
563 
564     k = env->vstart / max_elems;
565     off = env->vstart % max_elems;
566 
567     if (off) {
568         /* load/store the remaining elements of the segment pointed to by vstart */
569         for (pos = off; pos < max_elems; pos++, env->vstart++) {
570             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
571             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
572                       ra);
573         }
574         k++;
575     }
576 
577     /* load/store elements for the remaining segments */
578     for (; k < nf; k++) {
579         for (i = 0; i < max_elems; i++, env->vstart++) {
580             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
581             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
582         }
583     }
584 
585     env->vstart = 0;
586 }
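
/*
 * Whole-register accesses ignore vl and always move nf * vlenb bytes:
 * e.g. with VLEN = 128 (vlenb = 16), vl2re32_v transfers 32 bytes and
 * vl8re8_v transfers 128 bytes (illustrative VLEN).  A non-zero vstart
 * resumes in the middle of the current register of the group.
 */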
587 
588 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
589 void HELPER(NAME)(void *vd, target_ulong base,       \
590                   CPURISCVState *env, uint32_t desc) \
591 {                                                    \
592     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
593                     ctzl(sizeof(ETYPE)), GETPC());   \
594 }
595 
596 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
597 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
598 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
599 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
600 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
601 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
602 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
603 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
604 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
605 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
606 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
607 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
608 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
609 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
610 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
611 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
612 
613 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
614 void HELPER(NAME)(void *vd, target_ulong base,       \
615                   CPURISCVState *env, uint32_t desc) \
616 {                                                    \
617     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
618                     ctzl(sizeof(ETYPE)), GETPC());   \
619 }
620 
621 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
622 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
623 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
624 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
625 
626 /*
627  * Vector Integer Arithmetic Instructions
628  */
629 
630 /* (TD, T1, T2, TX1, TX2) */
631 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
632 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
633 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
634 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
635 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
636 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
637 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
638 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
639 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
640 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
641 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
642 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
643 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
644 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
645 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
646 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
647 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
648 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
649 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
650 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
651 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
652 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
653 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
654 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
655 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
656 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
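
/*
 * In the (TD, T1, T2, TX1, TX2) tuples above, TD is the destination element
 * type, T1/T2 are the operand types as read from the source registers, and
 * TX1/TX2 are the types they are converted to before applying the operation.
 * E.g. WOP_SSS_B describes a widening op: two int8_t sources sign-extended
 * to int16_t, producing an int16_t result.
 */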
657 
658 #define DO_SUB(N, M) (N - M)
659 #define DO_RSUB(N, M) (M - N)
660 
661 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
662 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
663 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
664 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
665 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
666 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
667 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
668 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
669 
670 GEN_VEXT_VV(vadd_vv_b, 1)
671 GEN_VEXT_VV(vadd_vv_h, 2)
672 GEN_VEXT_VV(vadd_vv_w, 4)
673 GEN_VEXT_VV(vadd_vv_d, 8)
674 GEN_VEXT_VV(vsub_vv_b, 1)
675 GEN_VEXT_VV(vsub_vv_h, 2)
676 GEN_VEXT_VV(vsub_vv_w, 4)
677 GEN_VEXT_VV(vsub_vv_d, 8)
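
/*
 * Each RVVCALL(OPIVV2, ...) line above generates a per-element worker
 * (roughly: do_vadd_vv_b() computes vd[i] = vs2[i] + vs1[i] on int8_t
 * elements, with H1 fixing the host-endian index), and GEN_VEXT_VV() wraps
 * that worker in the common loop that applies the v0 mask, the
 * mask-agnostic (vma) and tail-agnostic (vta) policies, and resets vstart;
 * see vector_internals.h.
 */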
678 
679 
680 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
681 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
682 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
683 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
684 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
685 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
686 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
687 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
688 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
689 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
690 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
691 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
692 
693 GEN_VEXT_VX(vadd_vx_b, 1)
694 GEN_VEXT_VX(vadd_vx_h, 2)
695 GEN_VEXT_VX(vadd_vx_w, 4)
696 GEN_VEXT_VX(vadd_vx_d, 8)
697 GEN_VEXT_VX(vsub_vx_b, 1)
698 GEN_VEXT_VX(vsub_vx_h, 2)
699 GEN_VEXT_VX(vsub_vx_w, 4)
700 GEN_VEXT_VX(vsub_vx_d, 8)
701 GEN_VEXT_VX(vrsub_vx_b, 1)
702 GEN_VEXT_VX(vrsub_vx_h, 2)
703 GEN_VEXT_VX(vrsub_vx_w, 4)
704 GEN_VEXT_VX(vrsub_vx_d, 8)
705 
706 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
707 {
708     intptr_t oprsz = simd_oprsz(desc);
709     intptr_t i;
710 
711     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
712         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
713     }
714 }
715 
716 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
717 {
718     intptr_t oprsz = simd_oprsz(desc);
719     intptr_t i;
720 
721     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
722         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
723     }
724 }
725 
726 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
727 {
728     intptr_t oprsz = simd_oprsz(desc);
729     intptr_t i;
730 
731     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
732         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
733     }
734 }
735 
736 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
737 {
738     intptr_t oprsz = simd_oprsz(desc);
739     intptr_t i;
740 
741     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
742         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
743     }
744 }
745 
746 /* Vector Widening Integer Add/Subtract */
747 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
748 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
749 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
750 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
751 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
752 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
753 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
754 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
755 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
756 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
757 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
758 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
759 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
760 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
761 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
762 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
763 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
764 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
765 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
766 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
767 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
768 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
769 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
770 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
771 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
772 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
773 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
774 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
775 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
776 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
777 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
778 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
779 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
780 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
781 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
782 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
783 GEN_VEXT_VV(vwaddu_vv_b, 2)
784 GEN_VEXT_VV(vwaddu_vv_h, 4)
785 GEN_VEXT_VV(vwaddu_vv_w, 8)
786 GEN_VEXT_VV(vwsubu_vv_b, 2)
787 GEN_VEXT_VV(vwsubu_vv_h, 4)
788 GEN_VEXT_VV(vwsubu_vv_w, 8)
789 GEN_VEXT_VV(vwadd_vv_b, 2)
790 GEN_VEXT_VV(vwadd_vv_h, 4)
791 GEN_VEXT_VV(vwadd_vv_w, 8)
792 GEN_VEXT_VV(vwsub_vv_b, 2)
793 GEN_VEXT_VV(vwsub_vv_h, 4)
794 GEN_VEXT_VV(vwsub_vv_w, 8)
795 GEN_VEXT_VV(vwaddu_wv_b, 2)
796 GEN_VEXT_VV(vwaddu_wv_h, 4)
797 GEN_VEXT_VV(vwaddu_wv_w, 8)
798 GEN_VEXT_VV(vwsubu_wv_b, 2)
799 GEN_VEXT_VV(vwsubu_wv_h, 4)
800 GEN_VEXT_VV(vwsubu_wv_w, 8)
801 GEN_VEXT_VV(vwadd_wv_b, 2)
802 GEN_VEXT_VV(vwadd_wv_h, 4)
803 GEN_VEXT_VV(vwadd_wv_w, 8)
804 GEN_VEXT_VV(vwsub_wv_b, 2)
805 GEN_VEXT_VV(vwsub_wv_h, 4)
806 GEN_VEXT_VV(vwsub_wv_w, 8)
807 
808 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
809 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
810 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
811 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
812 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
813 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
814 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
815 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
816 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
817 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
818 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
819 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
820 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
821 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
822 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
823 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
824 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
825 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
826 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
827 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
828 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
829 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
830 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
831 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
832 GEN_VEXT_VX(vwaddu_vx_b, 2)
833 GEN_VEXT_VX(vwaddu_vx_h, 4)
834 GEN_VEXT_VX(vwaddu_vx_w, 8)
835 GEN_VEXT_VX(vwsubu_vx_b, 2)
836 GEN_VEXT_VX(vwsubu_vx_h, 4)
837 GEN_VEXT_VX(vwsubu_vx_w, 8)
838 GEN_VEXT_VX(vwadd_vx_b, 2)
839 GEN_VEXT_VX(vwadd_vx_h, 4)
840 GEN_VEXT_VX(vwadd_vx_w, 8)
841 GEN_VEXT_VX(vwsub_vx_b, 2)
842 GEN_VEXT_VX(vwsub_vx_h, 4)
843 GEN_VEXT_VX(vwsub_vx_w, 8)
844 GEN_VEXT_VX(vwaddu_wx_b, 2)
845 GEN_VEXT_VX(vwaddu_wx_h, 4)
846 GEN_VEXT_VX(vwaddu_wx_w, 8)
847 GEN_VEXT_VX(vwsubu_wx_b, 2)
848 GEN_VEXT_VX(vwsubu_wx_h, 4)
849 GEN_VEXT_VX(vwsubu_wx_w, 8)
850 GEN_VEXT_VX(vwadd_wx_b, 2)
851 GEN_VEXT_VX(vwadd_wx_h, 4)
852 GEN_VEXT_VX(vwadd_wx_w, 8)
853 GEN_VEXT_VX(vwsub_wx_b, 2)
854 GEN_VEXT_VX(vwsub_wx_h, 4)
855 GEN_VEXT_VX(vwsub_wx_w, 8)
856 
857 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
858 #define DO_VADC(N, M, C) (N + M + C)
859 #define DO_VSBC(N, M, C) (N - M - C)
860 
861 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
862 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
863                   CPURISCVState *env, uint32_t desc)          \
864 {                                                             \
865     uint32_t vl = env->vl;                                    \
866     uint32_t esz = sizeof(ETYPE);                             \
867     uint32_t total_elems =                                    \
868         vext_get_total_elems(env, desc, esz);                 \
869     uint32_t vta = vext_vta(desc);                            \
870     uint32_t i;                                               \
871                                                               \
872     for (i = env->vstart; i < vl; i++) {                      \
873         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
874         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
875         ETYPE carry = vext_elem_mask(v0, i);                  \
876                                                               \
877         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
878     }                                                         \
879     env->vstart = 0;                                          \
880     /* set tail elements to 1s */                             \
881     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
882 }
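
/*
 * Note that vadc/vsbc always consume the carry/borrow from mask register
 * v0 and always write every body element: e.g. for uint8_t, s2 = 0xff,
 * s1 = 0x00 and carry = 1 produce 0x00, with the carry-out discarded
 * (vmadc below is used to recover it).
 */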
883 
884 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
885 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
886 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
887 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
888 
889 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
890 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
891 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
892 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
893 
894 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
895 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
896                   CPURISCVState *env, uint32_t desc)                     \
897 {                                                                        \
898     uint32_t vl = env->vl;                                               \
899     uint32_t esz = sizeof(ETYPE);                                        \
900     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
901     uint32_t vta = vext_vta(desc);                                       \
902     uint32_t i;                                                          \
903                                                                          \
904     for (i = env->vstart; i < vl; i++) {                                 \
905         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
906         ETYPE carry = vext_elem_mask(v0, i);                             \
907                                                                          \
908         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
909     }                                                                    \
910     env->vstart = 0;                                                     \
911     /* set tail elements to 1s */                                        \
912     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
913 }
914 
915 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
916 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
917 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
918 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
919 
920 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
921 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
922 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
923 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
924 
925 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
926                           (__typeof(N))(N + M) < N)
927 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
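
/*
 * DO_MADC computes the carry-out of N + M + C purely from unsigned
 * wraparound: without a carry-in the sum overflows iff (N + M) < N, and
 * with a carry-in iff (N + M + 1) <= N.  E.g. for uint8_t, N = 200 and
 * M = 100 wrap to 44 < 200, so the carry-out is 1.  DO_MSBC is the
 * corresponding borrow-out of N - M - C.
 */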
928 
929 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
930 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
931                   CPURISCVState *env, uint32_t desc)          \
932 {                                                             \
933     uint32_t vl = env->vl;                                    \
934     uint32_t vm = vext_vm(desc);                              \
935     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
936     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
937     uint32_t i;                                               \
938                                                               \
939     for (i = env->vstart; i < vl; i++) {                      \
940         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
941         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
942         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
943         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
944     }                                                         \
945     env->vstart = 0;                                          \
946     /*
947      * mask destination register is always tail-agnostic
948      * set tail elements to 1s
949      */                                                       \
950     if (vta_all_1s) {                                         \
951         for (; i < total_elems; i++) {                        \
952             vext_set_elem_mask(vd, i, 1);                     \
953         }                                                     \
954     }                                                         \
955 }
956 
957 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
958 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
959 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
960 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
961 
962 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
963 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
964 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
965 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
966 
967 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
968 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
969                   void *vs2, CPURISCVState *env, uint32_t desc) \
970 {                                                               \
971     uint32_t vl = env->vl;                                      \
972     uint32_t vm = vext_vm(desc);                                \
973     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
974     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
975     uint32_t i;                                                 \
976                                                                 \
977     for (i = env->vstart; i < vl; i++) {                        \
978         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
979         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
980         vext_set_elem_mask(vd, i,                               \
981                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
982     }                                                           \
983     env->vstart = 0;                                            \
984     /*
985      * mask destination register is always tail-agnostic
986      * set tail elements to 1s
987      */                                                         \
988     if (vta_all_1s) {                                           \
989         for (; i < total_elems; i++) {                          \
990             vext_set_elem_mask(vd, i, 1);                       \
991         }                                                       \
992     }                                                           \
993 }
994 
995 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
996 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
997 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
998 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
999 
1000 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1001 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1002 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1003 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1004 
1005 /* Vector Bitwise Logical Instructions */
1006 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1007 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1008 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1009 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1010 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1011 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1012 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1013 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1014 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1015 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1016 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1017 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1018 GEN_VEXT_VV(vand_vv_b, 1)
1019 GEN_VEXT_VV(vand_vv_h, 2)
1020 GEN_VEXT_VV(vand_vv_w, 4)
1021 GEN_VEXT_VV(vand_vv_d, 8)
1022 GEN_VEXT_VV(vor_vv_b, 1)
1023 GEN_VEXT_VV(vor_vv_h, 2)
1024 GEN_VEXT_VV(vor_vv_w, 4)
1025 GEN_VEXT_VV(vor_vv_d, 8)
1026 GEN_VEXT_VV(vxor_vv_b, 1)
1027 GEN_VEXT_VV(vxor_vv_h, 2)
1028 GEN_VEXT_VV(vxor_vv_w, 4)
1029 GEN_VEXT_VV(vxor_vv_d, 8)
1030 
1031 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1032 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1033 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1034 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1035 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1036 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1037 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1038 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1039 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1040 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1041 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1042 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1043 GEN_VEXT_VX(vand_vx_b, 1)
1044 GEN_VEXT_VX(vand_vx_h, 2)
1045 GEN_VEXT_VX(vand_vx_w, 4)
1046 GEN_VEXT_VX(vand_vx_d, 8)
1047 GEN_VEXT_VX(vor_vx_b, 1)
1048 GEN_VEXT_VX(vor_vx_h, 2)
1049 GEN_VEXT_VX(vor_vx_w, 4)
1050 GEN_VEXT_VX(vor_vx_d, 8)
1051 GEN_VEXT_VX(vxor_vx_b, 1)
1052 GEN_VEXT_VX(vxor_vx_h, 2)
1053 GEN_VEXT_VX(vxor_vx_w, 4)
1054 GEN_VEXT_VX(vxor_vx_d, 8)
1055 
1056 /* Vector Single-Width Bit Shift Instructions */
1057 #define DO_SLL(N, M)  (N << (M))
1058 #define DO_SRL(N, M)  (N >> (M))
1059 
1060 /* generate the helpers for shift instructions with two vector operands */
1061 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1062 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1063                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1064 {                                                                         \
1065     uint32_t vm = vext_vm(desc);                                          \
1066     uint32_t vl = env->vl;                                                \
1067     uint32_t esz = sizeof(TS1);                                           \
1068     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1069     uint32_t vta = vext_vta(desc);                                        \
1070     uint32_t vma = vext_vma(desc);                                        \
1071     uint32_t i;                                                           \
1072                                                                           \
1073     for (i = env->vstart; i < vl; i++) {                                  \
1074         if (!vm && !vext_elem_mask(v0, i)) {                              \
1075             /* set masked-off elements to 1s */                           \
1076             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1077             continue;                                                     \
1078         }                                                                 \
1079         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1080         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1081         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1082     }                                                                     \
1083     env->vstart = 0;                                                      \
1084     /* set tail elements to 1s */                                         \
1085     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1086 }
1087 
1088 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1089 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1090 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1091 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1092 
1093 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1094 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1095 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1096 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1097 
1098 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1099 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1100 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1101 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
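
/*
 * vsra reuses DO_SRL but declares the shifted operand (TS2) as a signed
 * type, so s2 >> shift is a signed shift; this relies on the compiler
 * implementing signed right shifts arithmetically, as the compilers QEMU
 * supports do.  The MASK argument (0x7/0xf/0x1f/0x3f) keeps only the low
 * log2(SEW) bits of the shift amount, i.e. shifts are modulo SEW.
 */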
1102 
1103 /*
1104  * generate the helpers for shift instructions with one vector and one scalar
1105  */
1106 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1107 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1108                   void *vs2, CPURISCVState *env,            \
1109                   uint32_t desc)                            \
1110 {                                                           \
1111     uint32_t vm = vext_vm(desc);                            \
1112     uint32_t vl = env->vl;                                  \
1113     uint32_t esz = sizeof(TD);                              \
1114     uint32_t total_elems =                                  \
1115         vext_get_total_elems(env, desc, esz);               \
1116     uint32_t vta = vext_vta(desc);                          \
1117     uint32_t vma = vext_vma(desc);                          \
1118     uint32_t i;                                             \
1119                                                             \
1120     for (i = env->vstart; i < vl; i++) {                    \
1121         if (!vm && !vext_elem_mask(v0, i)) {                \
1122             /* set masked-off elements to 1s */             \
1123             vext_set_elems_1s(vd, vma, i * esz,             \
1124                               (i + 1) * esz);               \
1125             continue;                                       \
1126         }                                                   \
1127         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1128         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1129     }                                                       \
1130     env->vstart = 0;                                        \
1131     /* set tail elements to 1s */                           \
1132     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1133 }
1134 
1135 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1136 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1137 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1138 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1139 
1140 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1141 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1142 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1143 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1144 
1145 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1146 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1147 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1148 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1149 
1150 /* Vector Narrowing Integer Right Shift Instructions */
1151 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1152 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1153 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1154 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1155 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1156 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1157 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1158 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1159 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1160 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1161 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1162 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
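
/*
 * For the narrowing shifts the vs2 operand type (TS2) is 2*SEW wide while
 * the result is stored as SEW (TS1), and the shift-amount mask covers
 * log2(2*SEW) bits: e.g. vnsrl_wv_b shifts a uint16_t source by up to 15
 * and stores the low byte of the result.
 */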
1163 
1164 /* Vector Integer Comparison Instructions */
1165 #define DO_MSEQ(N, M) (N == M)
1166 #define DO_MSNE(N, M) (N != M)
1167 #define DO_MSLT(N, M) (N < M)
1168 #define DO_MSLE(N, M) (N <= M)
1169 #define DO_MSGT(N, M) (N > M)
1170 
1171 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1172 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1173                   CPURISCVState *env, uint32_t desc)          \
1174 {                                                             \
1175     uint32_t vm = vext_vm(desc);                              \
1176     uint32_t vl = env->vl;                                    \
1177     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1178     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1179     uint32_t vma = vext_vma(desc);                            \
1180     uint32_t i;                                               \
1181                                                               \
1182     for (i = env->vstart; i < vl; i++) {                      \
1183         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1184         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1185         if (!vm && !vext_elem_mask(v0, i)) {                  \
1186             /* set masked-off elements to 1s */               \
1187             if (vma) {                                        \
1188                 vext_set_elem_mask(vd, i, 1);                 \
1189             }                                                 \
1190             continue;                                         \
1191         }                                                     \
1192         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1193     }                                                         \
1194     env->vstart = 0;                                          \
1195     /*
1196      * mask destination register is always tail-agnostic
1197      * set tail elements to 1s
1198      */                                                       \
1199     if (vta_all_1s) {                                         \
1200         for (; i < total_elems; i++) {                        \
1201             vext_set_elem_mask(vd, i, 1);                     \
1202         }                                                     \
1203     }                                                         \
1204 }
1205 
1206 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1207 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1208 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1209 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1210 
1211 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1212 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1213 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1214 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1215 
1216 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1217 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1218 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1219 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1220 
1221 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1222 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1223 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1224 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1225 
1226 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1227 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1228 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1229 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1230 
1231 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1232 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1233 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1234 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1235 
1236 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1237 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1238                   CPURISCVState *env, uint32_t desc)                \
1239 {                                                                   \
1240     uint32_t vm = vext_vm(desc);                                    \
1241     uint32_t vl = env->vl;                                          \
1242     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1243     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1244     uint32_t vma = vext_vma(desc);                                  \
1245     uint32_t i;                                                     \
1246                                                                     \
1247     for (i = env->vstart; i < vl; i++) {                            \
1248         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1249         if (!vm && !vext_elem_mask(v0, i)) {                        \
1250             /* set masked-off elements to 1s */                     \
1251             if (vma) {                                              \
1252                 vext_set_elem_mask(vd, i, 1);                       \
1253             }                                                       \
1254             continue;                                               \
1255         }                                                           \
1256         vext_set_elem_mask(vd, i,                                   \
1257                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1258     }                                                               \
1259     env->vstart = 0;                                                \
1260     /*
1261      * mask destination registers are always tail-agnostic
1262      * set tail elements to 1s
1263      */                                                             \
1264     if (vta_all_1s) {                                               \
1265         for (; i < total_elems; i++) {                              \
1266             vext_set_elem_mask(vd, i, 1);                           \
1267         }                                                           \
1268     }                                                               \
1269 }
1270 
1271 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1272 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1273 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1274 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1275 
1276 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1277 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1278 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1279 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1280 
1281 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1282 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1283 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1284 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1285 
1286 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1287 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1288 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1289 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1290 
1291 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1292 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1293 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1294 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1295 
1296 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1297 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1298 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1299 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1300 
1301 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1302 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1303 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1304 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1305 
1306 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1307 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1308 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1309 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1310 
1311 /* Vector Integer Min/Max Instructions */
1312 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1313 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1314 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1315 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1316 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1317 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1318 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1319 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1320 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1321 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1322 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1323 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1324 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1325 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1326 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1327 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1328 GEN_VEXT_VV(vminu_vv_b, 1)
1329 GEN_VEXT_VV(vminu_vv_h, 2)
1330 GEN_VEXT_VV(vminu_vv_w, 4)
1331 GEN_VEXT_VV(vminu_vv_d, 8)
1332 GEN_VEXT_VV(vmin_vv_b, 1)
1333 GEN_VEXT_VV(vmin_vv_h, 2)
1334 GEN_VEXT_VV(vmin_vv_w, 4)
1335 GEN_VEXT_VV(vmin_vv_d, 8)
1336 GEN_VEXT_VV(vmaxu_vv_b, 1)
1337 GEN_VEXT_VV(vmaxu_vv_h, 2)
1338 GEN_VEXT_VV(vmaxu_vv_w, 4)
1339 GEN_VEXT_VV(vmaxu_vv_d, 8)
1340 GEN_VEXT_VV(vmax_vv_b, 1)
1341 GEN_VEXT_VV(vmax_vv_h, 2)
1342 GEN_VEXT_VV(vmax_vv_w, 4)
1343 GEN_VEXT_VV(vmax_vv_d, 8)
1344 
1345 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1346 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1347 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1348 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1349 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1350 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1351 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1352 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1353 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1354 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1355 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1356 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1357 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1358 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1359 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1360 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1361 GEN_VEXT_VX(vminu_vx_b, 1)
1362 GEN_VEXT_VX(vminu_vx_h, 2)
1363 GEN_VEXT_VX(vminu_vx_w, 4)
1364 GEN_VEXT_VX(vminu_vx_d, 8)
1365 GEN_VEXT_VX(vmin_vx_b, 1)
1366 GEN_VEXT_VX(vmin_vx_h, 2)
1367 GEN_VEXT_VX(vmin_vx_w, 4)
1368 GEN_VEXT_VX(vmin_vx_d, 8)
1369 GEN_VEXT_VX(vmaxu_vx_b, 1)
1370 GEN_VEXT_VX(vmaxu_vx_h, 2)
1371 GEN_VEXT_VX(vmaxu_vx_w, 4)
1372 GEN_VEXT_VX(vmaxu_vx_d, 8)
1373 GEN_VEXT_VX(vmax_vx_b, 1)
1374 GEN_VEXT_VX(vmax_vx_h, 2)
1375 GEN_VEXT_VX(vmax_vx_w, 4)
1376 GEN_VEXT_VX(vmax_vx_d, 8)
1377 
1378 /* Vector Single-Width Integer Multiply Instructions */
1379 #define DO_MUL(N, M) (N * M)
1380 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1381 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1382 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1383 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1384 GEN_VEXT_VV(vmul_vv_b, 1)
1385 GEN_VEXT_VV(vmul_vv_h, 2)
1386 GEN_VEXT_VV(vmul_vv_w, 4)
1387 GEN_VEXT_VV(vmul_vv_d, 8)
1388 
1389 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1390 {
1391     return (int16_t)s2 * (int16_t)s1 >> 8;
1392 }
1393 
1394 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1395 {
1396     return (int32_t)s2 * (int32_t)s1 >> 16;
1397 }
1398 
1399 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1400 {
1401     return (int64_t)s2 * (int64_t)s1 >> 32;
1402 }
1403 
1404 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1405 {
1406     uint64_t hi_64, lo_64;
1407 
1408     muls64(&lo_64, &hi_64, s1, s2);
1409     return hi_64;
1410 }
1411 
1412 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1413 {
1414     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1415 }
1416 
1417 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1418 {
1419     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1420 }
1421 
1422 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1423 {
1424     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1425 }
1426 
1427 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1428 {
1429     uint64_t hi_64, lo_64;
1430 
1431     mulu64(&lo_64, &hi_64, s2, s1);
1432     return hi_64;
1433 }
1434 
1435 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1436 {
1437     return (int16_t)s2 * (uint16_t)s1 >> 8;
1438 }
1439 
1440 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1441 {
1442     return (int32_t)s2 * (uint32_t)s1 >> 16;
1443 }
1444 
1445 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1446 {
1447     return (int64_t)s2 * (uint64_t)s1 >> 32;
1448 }
1449 
1450 /*
1451  * Let  A  = the bit pattern of the signed operand, as an unsigned value,
1452  *      B  = the unsigned operand,
1453  *      P  = mulu64(A, B), the full 128-bit unsigned product,
1454  *      SP = the signed product we want.
1455  *
1456  * If the signed operand is negative, its signed value is A - 2 ** 64,
1457  * so
1458  *      SP = (A - 2 ** 64) * B
1459  *         = A * B - 2 ** 64 * B
1460  *         = P - 2 ** 64 * B
1461  * otherwise
1462  *      SP = P
1463  *
1464  * The low 64 bits of SP and P are identical; only the high half needs
1465  * a correction:
1466  *      HI_P -= (signed operand < 0 ? B : 0)
1467  */
1468 
1469 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1470 {
1471     uint64_t hi_64, lo_64;
1472 
1473     mulu64(&lo_64, &hi_64, s2, s1);
1474 
1475     hi_64 -= s2 < 0 ? s1 : 0;
1476     return hi_64;
1477 }
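/*
 * Illustrative cross-check only (not part of the helper set): a reference
 * mulhsu using the compiler-provided __int128 type and its arithmetic
 * right shift of negative values, as offered by the supported GCC/Clang
 * hosts.  For example, do_mulhsu_d(-2, 3): mulu64() sees
 * 0xfffffffffffffffe * 3, whose high half is 2; subtracting s1 gives -1,
 * the high half of the signed product -6, matching the reference below.
 */
#ifdef __SIZEOF_INT128__
static inline int64_t ref_mulhsu_d(int64_t s2, uint64_t s1)
{
    __int128 prod = (__int128)s2 * (__int128)s1;    /* exact signed product */
    return (int64_t)(prod >> 64);                   /* high 64 bits */
}
#endif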
1478 
1479 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1480 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1481 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1482 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1483 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1484 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1485 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1486 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1487 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1488 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1489 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1490 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1491 GEN_VEXT_VV(vmulh_vv_b, 1)
1492 GEN_VEXT_VV(vmulh_vv_h, 2)
1493 GEN_VEXT_VV(vmulh_vv_w, 4)
1494 GEN_VEXT_VV(vmulh_vv_d, 8)
1495 GEN_VEXT_VV(vmulhu_vv_b, 1)
1496 GEN_VEXT_VV(vmulhu_vv_h, 2)
1497 GEN_VEXT_VV(vmulhu_vv_w, 4)
1498 GEN_VEXT_VV(vmulhu_vv_d, 8)
1499 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1500 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1501 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1502 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1503 
1504 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1505 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1506 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1507 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1508 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1509 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1510 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1511 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1512 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1513 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1514 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1515 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1516 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1517 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1518 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1519 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1520 GEN_VEXT_VX(vmul_vx_b, 1)
1521 GEN_VEXT_VX(vmul_vx_h, 2)
1522 GEN_VEXT_VX(vmul_vx_w, 4)
1523 GEN_VEXT_VX(vmul_vx_d, 8)
1524 GEN_VEXT_VX(vmulh_vx_b, 1)
1525 GEN_VEXT_VX(vmulh_vx_h, 2)
1526 GEN_VEXT_VX(vmulh_vx_w, 4)
1527 GEN_VEXT_VX(vmulh_vx_d, 8)
1528 GEN_VEXT_VX(vmulhu_vx_b, 1)
1529 GEN_VEXT_VX(vmulhu_vx_h, 2)
1530 GEN_VEXT_VX(vmulhu_vx_w, 4)
1531 GEN_VEXT_VX(vmulhu_vx_d, 8)
1532 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1533 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1534 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1535 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1536 
1537 /* Vector Integer Divide Instructions */
1538 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1539 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1540 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1541         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1542 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1543         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
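/*
 * These encode the RVV divide semantics: division never traps.  At SEW=8,
 * for example:
 *   vdivu: x / 0  -> 0xff (all ones)        vremu: x % 0  -> x
 *   vdiv:  x / 0  -> -1                     vrem:  x % 0  -> x
 *   vdiv:  -128 / -1 -> -128 (overflow)     vrem:  -128 % -1 -> 0
 */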
1544 
1545 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1546 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1547 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1548 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1549 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1550 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1551 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1552 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1553 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1554 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1555 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1556 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1557 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1558 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1559 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1560 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1561 GEN_VEXT_VV(vdivu_vv_b, 1)
1562 GEN_VEXT_VV(vdivu_vv_h, 2)
1563 GEN_VEXT_VV(vdivu_vv_w, 4)
1564 GEN_VEXT_VV(vdivu_vv_d, 8)
1565 GEN_VEXT_VV(vdiv_vv_b, 1)
1566 GEN_VEXT_VV(vdiv_vv_h, 2)
1567 GEN_VEXT_VV(vdiv_vv_w, 4)
1568 GEN_VEXT_VV(vdiv_vv_d, 8)
1569 GEN_VEXT_VV(vremu_vv_b, 1)
1570 GEN_VEXT_VV(vremu_vv_h, 2)
1571 GEN_VEXT_VV(vremu_vv_w, 4)
1572 GEN_VEXT_VV(vremu_vv_d, 8)
1573 GEN_VEXT_VV(vrem_vv_b, 1)
1574 GEN_VEXT_VV(vrem_vv_h, 2)
1575 GEN_VEXT_VV(vrem_vv_w, 4)
1576 GEN_VEXT_VV(vrem_vv_d, 8)
1577 
1578 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1579 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1580 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1581 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1582 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1583 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1584 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1585 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1586 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1587 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1588 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1589 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1590 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1591 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1592 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1593 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1594 GEN_VEXT_VX(vdivu_vx_b, 1)
1595 GEN_VEXT_VX(vdivu_vx_h, 2)
1596 GEN_VEXT_VX(vdivu_vx_w, 4)
1597 GEN_VEXT_VX(vdivu_vx_d, 8)
1598 GEN_VEXT_VX(vdiv_vx_b, 1)
1599 GEN_VEXT_VX(vdiv_vx_h, 2)
1600 GEN_VEXT_VX(vdiv_vx_w, 4)
1601 GEN_VEXT_VX(vdiv_vx_d, 8)
1602 GEN_VEXT_VX(vremu_vx_b, 1)
1603 GEN_VEXT_VX(vremu_vx_h, 2)
1604 GEN_VEXT_VX(vremu_vx_w, 4)
1605 GEN_VEXT_VX(vremu_vx_d, 8)
1606 GEN_VEXT_VX(vrem_vx_b, 1)
1607 GEN_VEXT_VX(vrem_vx_h, 2)
1608 GEN_VEXT_VX(vrem_vx_w, 4)
1609 GEN_VEXT_VX(vrem_vx_d, 8)
1610 
1611 /* Vector Widening Integer Multiply Instructions */
1612 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1613 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1614 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1615 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1616 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1617 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1618 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1619 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1620 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1621 GEN_VEXT_VV(vwmul_vv_b, 2)
1622 GEN_VEXT_VV(vwmul_vv_h, 4)
1623 GEN_VEXT_VV(vwmul_vv_w, 8)
1624 GEN_VEXT_VV(vwmulu_vv_b, 2)
1625 GEN_VEXT_VV(vwmulu_vv_h, 4)
1626 GEN_VEXT_VV(vwmulu_vv_w, 8)
1627 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1628 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1629 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1630 
1631 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1632 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1633 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1634 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1635 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1636 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1637 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1638 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1639 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1640 GEN_VEXT_VX(vwmul_vx_b, 2)
1641 GEN_VEXT_VX(vwmul_vx_h, 4)
1642 GEN_VEXT_VX(vwmul_vx_w, 8)
1643 GEN_VEXT_VX(vwmulu_vx_b, 2)
1644 GEN_VEXT_VX(vwmulu_vx_h, 4)
1645 GEN_VEXT_VX(vwmulu_vx_w, 8)
1646 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1647 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1648 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1649 
1650 /* Vector Single-Width Integer Multiply-Add Instructions */
1651 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1652 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1653 {                                                                  \
1654     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1655     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1656     TD d = *((TD *)vd + HD(i));                                    \
1657     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1658 }
1659 
1660 #define DO_MACC(N, M, D) (M * N + D)
1661 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1662 #define DO_MADD(N, M, D) (M * D + N)
1663 #define DO_NMSUB(N, M, D) (-(M * D) + N)
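/*
 * With OPIVV3/OPIVX3 calling OP(s2, s1, d), the macros above implement,
 * per element (s1 is vs1[i] for the .vv forms and rs1 for the .vx forms):
 *   DO_MACC:  vd = (vs2 * s1) + vd
 *   DO_NMSAC: vd = -(vs2 * s1) + vd
 *   DO_MADD:  vd = (vd * s1) + vs2
 *   DO_NMSUB: vd = -(vd * s1) + vs2
 */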
1664 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1665 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1666 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1667 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1668 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1669 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1670 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1671 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1672 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1673 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1674 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1675 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1676 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1677 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1678 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1679 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1680 GEN_VEXT_VV(vmacc_vv_b, 1)
1681 GEN_VEXT_VV(vmacc_vv_h, 2)
1682 GEN_VEXT_VV(vmacc_vv_w, 4)
1683 GEN_VEXT_VV(vmacc_vv_d, 8)
1684 GEN_VEXT_VV(vnmsac_vv_b, 1)
1685 GEN_VEXT_VV(vnmsac_vv_h, 2)
1686 GEN_VEXT_VV(vnmsac_vv_w, 4)
1687 GEN_VEXT_VV(vnmsac_vv_d, 8)
1688 GEN_VEXT_VV(vmadd_vv_b, 1)
1689 GEN_VEXT_VV(vmadd_vv_h, 2)
1690 GEN_VEXT_VV(vmadd_vv_w, 4)
1691 GEN_VEXT_VV(vmadd_vv_d, 8)
1692 GEN_VEXT_VV(vnmsub_vv_b, 1)
1693 GEN_VEXT_VV(vnmsub_vv_h, 2)
1694 GEN_VEXT_VV(vnmsub_vv_w, 4)
1695 GEN_VEXT_VV(vnmsub_vv_d, 8)
1696 
1697 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1698 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1699 {                                                                   \
1700     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1701     TD d = *((TD *)vd + HD(i));                                     \
1702     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1703 }
1704 
1705 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1706 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1707 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1708 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1709 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1710 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1711 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1712 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1713 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1714 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1715 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1716 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1717 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1718 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1719 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1720 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1721 GEN_VEXT_VX(vmacc_vx_b, 1)
1722 GEN_VEXT_VX(vmacc_vx_h, 2)
1723 GEN_VEXT_VX(vmacc_vx_w, 4)
1724 GEN_VEXT_VX(vmacc_vx_d, 8)
1725 GEN_VEXT_VX(vnmsac_vx_b, 1)
1726 GEN_VEXT_VX(vnmsac_vx_h, 2)
1727 GEN_VEXT_VX(vnmsac_vx_w, 4)
1728 GEN_VEXT_VX(vnmsac_vx_d, 8)
1729 GEN_VEXT_VX(vmadd_vx_b, 1)
1730 GEN_VEXT_VX(vmadd_vx_h, 2)
1731 GEN_VEXT_VX(vmadd_vx_w, 4)
1732 GEN_VEXT_VX(vmadd_vx_d, 8)
1733 GEN_VEXT_VX(vnmsub_vx_b, 1)
1734 GEN_VEXT_VX(vnmsub_vx_h, 2)
1735 GEN_VEXT_VX(vnmsub_vx_w, 4)
1736 GEN_VEXT_VX(vnmsub_vx_d, 8)
1737 
1738 /* Vector Widening Integer Multiply-Add Instructions */
1739 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1740 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1741 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1742 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1743 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1744 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1745 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1746 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1747 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1748 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1749 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1750 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1751 GEN_VEXT_VV(vwmacc_vv_b, 2)
1752 GEN_VEXT_VV(vwmacc_vv_h, 4)
1753 GEN_VEXT_VV(vwmacc_vv_w, 8)
1754 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1755 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1756 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1757 
1758 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1759 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1760 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1761 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1762 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1763 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1764 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1765 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1766 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1767 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1768 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1769 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1770 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1771 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1772 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1773 GEN_VEXT_VX(vwmacc_vx_b, 2)
1774 GEN_VEXT_VX(vwmacc_vx_h, 4)
1775 GEN_VEXT_VX(vwmacc_vx_w, 8)
1776 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1777 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1778 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1779 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1780 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1781 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1782 
1783 /* Vector Integer Merge and Move Instructions */
1784 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1785 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1786                   uint32_t desc)                                     \
1787 {                                                                    \
1788     uint32_t vl = env->vl;                                           \
1789     uint32_t esz = sizeof(ETYPE);                                    \
1790     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1791     uint32_t vta = vext_vta(desc);                                   \
1792     uint32_t i;                                                      \
1793                                                                      \
1794     for (i = env->vstart; i < vl; i++) {                             \
1795         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1796         *((ETYPE *)vd + H(i)) = s1;                                  \
1797     }                                                                \
1798     env->vstart = 0;                                                 \
1799     /* set tail elements to 1s */                                    \
1800     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1801 }
1802 
1803 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1804 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1805 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1806 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1807 
1808 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1809 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1810                   uint32_t desc)                                     \
1811 {                                                                    \
1812     uint32_t vl = env->vl;                                           \
1813     uint32_t esz = sizeof(ETYPE);                                    \
1814     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1815     uint32_t vta = vext_vta(desc);                                   \
1816     uint32_t i;                                                      \
1817                                                                      \
1818     for (i = env->vstart; i < vl; i++) {                             \
1819         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1820     }                                                                \
1821     env->vstart = 0;                                                 \
1822     /* set tail elements to 1s */                                    \
1823     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1824 }
1825 
1826 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1827 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1828 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1829 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1830 
1831 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1832 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1833                   CPURISCVState *env, uint32_t desc)                 \
1834 {                                                                    \
1835     uint32_t vl = env->vl;                                           \
1836     uint32_t esz = sizeof(ETYPE);                                    \
1837     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1838     uint32_t vta = vext_vta(desc);                                   \
1839     uint32_t i;                                                      \
1840                                                                      \
1841     for (i = env->vstart; i < vl; i++) {                             \
1842         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1843         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1844     }                                                                \
1845     env->vstart = 0;                                                 \
1846     /* set tail elements to 1s */                                    \
1847     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1848 }
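/*
 * vmerge.vvm: vd[i] = v0.mask[i] ? vs1[i] : vs2[i]; the .vxm form below
 * substitutes x[rs1] for vs1[i].
 */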
1849 
1850 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1851 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1852 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1853 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1854 
1855 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1856 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1857                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1858 {                                                                    \
1859     uint32_t vl = env->vl;                                           \
1860     uint32_t esz = sizeof(ETYPE);                                    \
1861     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1862     uint32_t vta = vext_vta(desc);                                   \
1863     uint32_t i;                                                      \
1864                                                                      \
1865     for (i = env->vstart; i < vl; i++) {                             \
1866         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
1867         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
1868                    (ETYPE)(target_long)s1);                          \
1869         *((ETYPE *)vd + H(i)) = d;                                   \
1870     }                                                                \
1871     env->vstart = 0;                                                 \
1872     /* set tail elements to 1s */                                    \
1873     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1874 }
1875 
1876 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
1877 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
1878 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
1879 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1880 
1881 /*
1882  * Vector Fixed-Point Arithmetic Instructions
1883  */
1884 
1885 /* Vector Single-Width Saturating Add and Subtract */
1886 
1887 /*
1888  * Fixed-point instructions take a rounding mode (vxrm) and may saturate
1889  * (vxsat), so define common macros for fixed point here.
1890  */
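/*
 * The scaffolding below is shared by all of them: a per-element
 * opivv2_rm_fn/opivx2_rm_fn does the arithmetic, vext_vv_rm_1 and
 * vext_vx_rm_1 walk the active elements for one rounding mode,
 * vext_vv_rm_2 and vext_vx_rm_2 dispatch on env->vxrm and handle the
 * tail, and GEN_VEXT_VV_RM/GEN_VEXT_VX_RM emit the helper entry points.
 */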
1891 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
1892                           CPURISCVState *env, int vxrm);
1893 
1894 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
1895 static inline void                                                  \
1896 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
1897           CPURISCVState *env, int vxrm)                             \
1898 {                                                                   \
1899     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
1900     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1901     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
1902 }
1903 
1904 static inline void
1905 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1906              CPURISCVState *env,
1907              uint32_t vl, uint32_t vm, int vxrm,
1908              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
1909 {
1910     for (uint32_t i = env->vstart; i < vl; i++) {
1911         if (!vm && !vext_elem_mask(v0, i)) {
1912             /* set masked-off elements to 1s */
1913             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
1914             continue;
1915         }
1916         fn(vd, vs1, vs2, i, env, vxrm);
1917     }
1918     env->vstart = 0;
1919 }
1920 
1921 static inline void
1922 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
1923              CPURISCVState *env,
1924              uint32_t desc,
1925              opivv2_rm_fn *fn, uint32_t esz)
1926 {
1927     uint32_t vm = vext_vm(desc);
1928     uint32_t vl = env->vl;
1929     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
1930     uint32_t vta = vext_vta(desc);
1931     uint32_t vma = vext_vma(desc);
1932 
1933     switch (env->vxrm) {
1934     case 0: /* rnu */
1935         vext_vv_rm_1(vd, v0, vs1, vs2,
1936                      env, vl, vm, 0, fn, vma, esz);
1937         break;
1938     case 1: /* rne */
1939         vext_vv_rm_1(vd, v0, vs1, vs2,
1940                      env, vl, vm, 1, fn, vma, esz);
1941         break;
1942     case 2: /* rdn */
1943         vext_vv_rm_1(vd, v0, vs1, vs2,
1944                      env, vl, vm, 2, fn, vma, esz);
1945         break;
1946     default: /* rod */
1947         vext_vv_rm_1(vd, v0, vs1, vs2,
1948                      env, vl, vm, 3, fn, vma, esz);
1949         break;
1950     }
1951     /* set tail elements to 1s */
1952     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
1953 }
1954 
1955 /* generate helpers for fixed point instructions with OPIVV format */
1956 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
1957 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
1958                   CPURISCVState *env, uint32_t desc)            \
1959 {                                                               \
1960     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
1961                  do_##NAME, ESZ);                               \
1962 }
1963 
1964 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
1965                              uint8_t b)
1966 {
1967     uint8_t res = a + b;
1968     if (res < a) {
1969         res = UINT8_MAX;
1970         env->vxsat = 0x1;
1971     }
1972     return res;
1973 }
1974 
1975 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
1976                                uint16_t b)
1977 {
1978     uint16_t res = a + b;
1979     if (res < a) {
1980         res = UINT16_MAX;
1981         env->vxsat = 0x1;
1982     }
1983     return res;
1984 }
1985 
1986 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
1987                                uint32_t b)
1988 {
1989     uint32_t res = a + b;
1990     if (res < a) {
1991         res = UINT32_MAX;
1992         env->vxsat = 0x1;
1993     }
1994     return res;
1995 }
1996 
1997 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
1998                                uint64_t b)
1999 {
2000     uint64_t res = a + b;
2001     if (res < a) {
2002         res = UINT64_MAX;
2003         env->vxsat = 0x1;
2004     }
2005     return res;
2006 }
2007 
2008 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2009 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2010 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2011 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2012 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2013 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2014 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2015 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2016 
2017 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2018                           CPURISCVState *env, int vxrm);
2019 
2020 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2021 static inline void                                                  \
2022 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2023           CPURISCVState *env, int vxrm)                             \
2024 {                                                                   \
2025     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2026     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2027 }
2028 
2029 static inline void
2030 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2031              CPURISCVState *env,
2032              uint32_t vl, uint32_t vm, int vxrm,
2033              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2034 {
2035     for (uint32_t i = env->vstart; i < vl; i++) {
2036         if (!vm && !vext_elem_mask(v0, i)) {
2037             /* set masked-off elements to 1s */
2038             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2039             continue;
2040         }
2041         fn(vd, s1, vs2, i, env, vxrm);
2042     }
2043     env->vstart = 0;
2044 }
2045 
2046 static inline void
2047 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2048              CPURISCVState *env,
2049              uint32_t desc,
2050              opivx2_rm_fn *fn, uint32_t esz)
2051 {
2052     uint32_t vm = vext_vm(desc);
2053     uint32_t vl = env->vl;
2054     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2055     uint32_t vta = vext_vta(desc);
2056     uint32_t vma = vext_vma(desc);
2057 
2058     switch (env->vxrm) {
2059     case 0: /* rnu */
2060         vext_vx_rm_1(vd, v0, s1, vs2,
2061                      env, vl, vm, 0, fn, vma, esz);
2062         break;
2063     case 1: /* rne */
2064         vext_vx_rm_1(vd, v0, s1, vs2,
2065                      env, vl, vm, 1, fn, vma, esz);
2066         break;
2067     case 2: /* rdn */
2068         vext_vx_rm_1(vd, v0, s1, vs2,
2069                      env, vl, vm, 2, fn, vma, esz);
2070         break;
2071     default: /* rod */
2072         vext_vx_rm_1(vd, v0, s1, vs2,
2073                      env, vl, vm, 3, fn, vma, esz);
2074         break;
2075     }
2076     /* set tail elements to 1s */
2077     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2078 }
2079 
2080 /* generate helpers for fixed point instructions with OPIVX format */
2081 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2082 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2083                   void *vs2, CPURISCVState *env,          \
2084                   uint32_t desc)                          \
2085 {                                                         \
2086     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2087                  do_##NAME, ESZ);                         \
2088 }
2089 
2090 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2091 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2092 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2093 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2094 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2095 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2096 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2097 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2098 
2099 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2100 {
2101     int8_t res = a + b;
2102     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2103         res = a > 0 ? INT8_MAX : INT8_MIN;
2104         env->vxsat = 0x1;
2105     }
2106     return res;
2107 }
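/*
 * (res ^ a) & (res ^ b) has its sign bit set exactly when a and b share a
 * sign but res does not, i.e. when the signed addition overflowed; the
 * same test is used for each element width below.
 */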
2108 
2109 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2110                              int16_t b)
2111 {
2112     int16_t res = a + b;
2113     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2114         res = a > 0 ? INT16_MAX : INT16_MIN;
2115         env->vxsat = 0x1;
2116     }
2117     return res;
2118 }
2119 
2120 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2121                              int32_t b)
2122 {
2123     int32_t res = a + b;
2124     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2125         res = a > 0 ? INT32_MAX : INT32_MIN;
2126         env->vxsat = 0x1;
2127     }
2128     return res;
2129 }
2130 
2131 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2132                              int64_t b)
2133 {
2134     int64_t res = a + b;
2135     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2136         res = a > 0 ? INT64_MAX : INT64_MIN;
2137         env->vxsat = 0x1;
2138     }
2139     return res;
2140 }
2141 
2142 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2143 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2144 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2145 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2146 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2147 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2148 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2149 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2150 
2151 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2152 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2153 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2154 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2155 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2156 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2157 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2158 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2159 
2160 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2161                              uint8_t b)
2162 {
2163     uint8_t res = a - b;
2164     if (res > a) {
2165         res = 0;
2166         env->vxsat = 0x1;
2167     }
2168     return res;
2169 }
2170 
2171 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2172                                uint16_t b)
2173 {
2174     uint16_t res = a - b;
2175     if (res > a) {
2176         res = 0;
2177         env->vxsat = 0x1;
2178     }
2179     return res;
2180 }
2181 
2182 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2183                                uint32_t b)
2184 {
2185     uint32_t res = a - b;
2186     if (res > a) {
2187         res = 0;
2188         env->vxsat = 0x1;
2189     }
2190     return res;
2191 }
2192 
2193 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2194                                uint64_t b)
2195 {
2196     uint64_t res = a - b;
2197     if (res > a) {
2198         res = 0;
2199         env->vxsat = 0x1;
2200     }
2201     return res;
2202 }
2203 
2204 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2205 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2206 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2207 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2208 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2209 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2210 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2211 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2212 
2213 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2214 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2215 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2216 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2217 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2218 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2219 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2220 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2221 
2222 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2223 {
2224     int8_t res = a - b;
2225     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2226         res = a >= 0 ? INT8_MAX : INT8_MIN;
2227         env->vxsat = 0x1;
2228     }
2229     return res;
2230 }
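/*
 * For subtraction the test changes to (res ^ a) & (a ^ b): overflow can
 * only happen when a and b have different signs and the result's sign
 * differs from a's.
 */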
2231 
2232 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2233                              int16_t b)
2234 {
2235     int16_t res = a - b;
2236     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2237         res = a >= 0 ? INT16_MAX : INT16_MIN;
2238         env->vxsat = 0x1;
2239     }
2240     return res;
2241 }
2242 
2243 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2244                              int32_t b)
2245 {
2246     int32_t res = a - b;
2247     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2248         res = a >= 0 ? INT32_MAX : INT32_MIN;
2249         env->vxsat = 0x1;
2250     }
2251     return res;
2252 }
2253 
2254 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2255                              int64_t b)
2256 {
2257     int64_t res = a - b;
2258     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2259         res = a >= 0 ? INT64_MAX : INT64_MIN;
2260         env->vxsat = 0x1;
2261     }
2262     return res;
2263 }
2264 
2265 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2266 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2267 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2268 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2269 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2270 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2271 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2272 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2273 
2274 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2275 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2276 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2277 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2278 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2279 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2280 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2281 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2282 
2283 /* Vector Single-Width Averaging Add and Subtract */
2284 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2285 {
2286     uint8_t d = extract64(v, shift, 1);
2287     uint8_t d1;
2288     uint64_t D1, D2;
2289 
2290     if (shift == 0 || shift > 64) {
2291         return 0;
2292     }
2293 
2294     d1 = extract64(v, shift - 1, 1);
2295     D1 = extract64(v, 0, shift);
2296     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2297         return d1;
2298     } else if (vxrm == 1) { /* round-to-nearest-even */
2299         if (shift > 1) {
2300             D2 = extract64(v, 0, shift - 1);
2301             return d1 & ((D2 != 0) | d);
2302         } else {
2303             return d1 & d;
2304         }
2305     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2306         return !d & (D1 != 0);
2307     }
2308     return 0; /* round-down (truncate) */
2309 }
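/*
 * Worked example (illustrative only; the helper name below is ours):
 * rounding 11 >> 2, i.e. 2.75.  Here d (bit 2) is 0, d1 (bit 1) is 1 and
 * the discarded bits D1 are 3, so the rounded result is
 *   rnu -> 3, rne -> 3, rdn -> 2, rod -> 3.
 */
static inline uint8_t round_example_11_srl_2(int vxrm)
{
    return (uint8_t)((11 >> 2) + get_round(vxrm, 11, 2));
}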
2310 
2311 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2312                              int32_t b)
2313 {
2314     int64_t res = (int64_t)a + b;
2315     uint8_t round = get_round(vxrm, res, 1);
2316 
2317     return (res >> 1) + round;
2318 }
2319 
2320 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2321                              int64_t b)
2322 {
2323     int64_t res = a + b;
2324     uint8_t round = get_round(vxrm, res, 1);
2325     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2326 
2327     /* With signed overflow, bit 64 is inverse of bit 63. */
2328     return ((res >> 1) ^ over) + round;
2329 }
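/*
 * For example (illustrative): a = b = INT64_MAX.  The 64-bit sum wraps to
 * -2 and 'over' is set, so (res >> 1) ^ over flips the sign bit back and
 * gives INT64_MAX; the discarded bit is 0, so every rounding mode returns
 * the exact average.
 */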
2330 
2331 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2332 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2333 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2334 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2335 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2336 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2337 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2338 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2339 
2340 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2341 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2342 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2343 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2344 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2345 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2346 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2347 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2348 
2349 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2350                                uint32_t a, uint32_t b)
2351 {
2352     uint64_t res = (uint64_t)a + b;
2353     uint8_t round = get_round(vxrm, res, 1);
2354 
2355     return (res >> 1) + round;
2356 }
2357 
2358 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2359                                uint64_t a, uint64_t b)
2360 {
2361     uint64_t res = a + b;
2362     uint8_t round = get_round(vxrm, res, 1);
2363     uint64_t over = (uint64_t)(res < a) << 63;
2364 
2365     return ((res >> 1) | over) + round;
2366 }
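/*
 * Here 'over' re-inserts the carry out of the 64-bit addition as bit 63
 * of the halved result, so the average is exact even when a + b wraps.
 */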
2367 
2368 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2369 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2370 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2371 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2372 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2373 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2374 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2375 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2376 
2377 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2378 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2379 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2380 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2381 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2382 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2383 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2384 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2385 
2386 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2387                              int32_t b)
2388 {
2389     int64_t res = (int64_t)a - b;
2390     uint8_t round = get_round(vxrm, res, 1);
2391 
2392     return (res >> 1) + round;
2393 }
2394 
2395 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2396                              int64_t b)
2397 {
2398     int64_t res = (int64_t)a - b;
2399     uint8_t round = get_round(vxrm, res, 1);
2400     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2401 
2402     /* With signed overflow, bit 64 is inverse of bit 63. */
2403     return ((res >> 1) ^ over) + round;
2404 }
2405 
2406 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2407 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2408 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2409 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2410 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2411 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2412 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2413 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2414 
2415 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2416 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2417 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2418 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2419 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2420 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2421 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2422 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2423 
2424 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2425                                uint32_t a, uint32_t b)
2426 {
2427     int64_t res = (int64_t)a - b;
2428     uint8_t round = get_round(vxrm, res, 1);
2429 
2430     return (res >> 1) + round;
2431 }
2432 
2433 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2434                                uint64_t a, uint64_t b)
2435 {
2436     uint64_t res = (uint64_t)a - b;
2437     uint8_t round = get_round(vxrm, res, 1);
2438     uint64_t over = (uint64_t)(res > a) << 63;
2439 
2440     return ((res >> 1) | over) + round;
2441 }
2442 
2443 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2444 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2445 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2446 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2447 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2448 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2449 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2450 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2451 
2452 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2453 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2454 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2455 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2456 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2457 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2458 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2459 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2460 
2461 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2462 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2463 {
2464     uint8_t round;
2465     int16_t res;
2466 
2467     res = (int16_t)a * (int16_t)b;
2468     round = get_round(vxrm, res, 7);
2469     res = (res >> 7) + round;
2470 
2471     if (res > INT8_MAX) {
2472         env->vxsat = 0x1;
2473         return INT8_MAX;
2474     } else if (res < INT8_MIN) {
2475         env->vxsat = 0x1;
2476         return INT8_MIN;
2477     } else {
2478         return res;
2479     }
2480 }
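/*
 * For example (illustrative): vsmul8 with a = b = -128, i.e.
 * (-1.0) * (-1.0) in Q0.7.  The 16-bit product is 16384; shifted right by
 * 7 (the rounding increment is 0 in every mode) this is 128, which
 * exceeds INT8_MAX, so the result saturates to 127 (about 0.992) and
 * vxsat is set.
 */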
2481 
2482 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2483 {
2484     uint8_t round;
2485     int32_t res;
2486 
2487     res = (int32_t)a * (int32_t)b;
2488     round = get_round(vxrm, res, 15);
2489     res = (res >> 15) + round;
2490 
2491     if (res > INT16_MAX) {
2492         env->vxsat = 0x1;
2493         return INT16_MAX;
2494     } else if (res < INT16_MIN) {
2495         env->vxsat = 0x1;
2496         return INT16_MIN;
2497     } else {
2498         return res;
2499     }
2500 }
2501 
2502 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2503 {
2504     uint8_t round;
2505     int64_t res;
2506 
2507     res = (int64_t)a * (int64_t)b;
2508     round = get_round(vxrm, res, 31);
2509     res = (res >> 31) + round;
2510 
2511     if (res > INT32_MAX) {
2512         env->vxsat = 0x1;
2513         return INT32_MAX;
2514     } else if (res < INT32_MIN) {
2515         env->vxsat = 0x1;
2516         return INT32_MIN;
2517     } else {
2518         return res;
2519     }
2520 }
2521 
2522 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2523 {
2524     uint8_t round;
2525     uint64_t hi_64, lo_64;
2526     int64_t res;
2527 
2528     if (a == INT64_MIN && b == INT64_MIN) {
2529         env->vxsat = 1;
2530         return INT64_MAX;
2531     }
2532 
2533     muls64(&lo_64, &hi_64, a, b);
2534     round = get_round(vxrm, lo_64, 63);
2535     /*
2536      * Cannot overflow: the 128-bit product always has at least
2537      * 2 sign bits here, so the left shift below loses nothing.
2538      */
2539     res = (hi_64 << 1) | (lo_64 >> 63);
2540     if (round) {
2541         if (res == INT64_MAX) {
2542             env->vxsat = 1;
2543         } else {
2544             res += 1;
2545         }
2546     }
2547     return res;
2548 }
2549 
2550 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2551 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2552 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2553 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2554 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2555 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2556 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2557 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2558 
2559 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2560 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2561 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2562 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2563 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2564 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2565 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2566 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2567 
2568 /* Vector Single-Width Scaling Shift Instructions */
2569 static inline uint8_t
2570 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2571 {
2572     uint8_t round, shift = b & 0x7;
2573     uint8_t res;
2574 
2575     round = get_round(vxrm, a, shift);
2576     res = (a >> shift) + round;
2577     return res;
2578 }
2579 static inline uint16_t
2580 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2581 {
2582     uint8_t round, shift = b & 0xf;
2583 
2584     round = get_round(vxrm, a, shift);
2585     return (a >> shift) + round;
2586 }
2587 static inline uint32_t
2588 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2589 {
2590     uint8_t round, shift = b & 0x1f;
2591 
2592     round = get_round(vxrm, a, shift);
2593     return (a >> shift) + round;
2594 }
2595 static inline uint64_t
2596 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2597 {
2598     uint8_t round, shift = b & 0x3f;
2599 
2600     round = get_round(vxrm, a, shift);
2601     return (a >> shift) + round;
2602 }
2603 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2604 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2605 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2606 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2607 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2608 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2609 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2610 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2611 
2612 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2613 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2614 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2615 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2616 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2617 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2618 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2619 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2620 
2621 static inline int8_t
2622 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2623 {
2624     uint8_t round, shift = b & 0x7;
2625 
2626     round = get_round(vxrm, a, shift);
2627     return (a >> shift) + round;
2628 }
2629 static inline int16_t
2630 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2631 {
2632     uint8_t round, shift = b & 0xf;
2633 
2634     round = get_round(vxrm, a, shift);
2635     return (a >> shift) + round;
2636 }
2637 static inline int32_t
2638 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2639 {
2640     uint8_t round, shift = b & 0x1f;
2641 
2642     round = get_round(vxrm, a, shift);
2643     return (a >> shift) + round;
2644 }
2645 static inline int64_t
2646 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2647 {
2648     uint8_t round, shift = b & 0x3f;
2649 
2650     round = get_round(vxrm, a, shift);
2651     return (a >> shift) + round;
2652 }
2653 
2654 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2655 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2656 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2657 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2658 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2659 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2660 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2661 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2662 
2663 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2664 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2665 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2666 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2667 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2668 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2669 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2670 GEN_VEXT_VX_RM(vssra_vx_d, 8)
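
/*
 * Illustrative sketch (hypothetical name): the rounding increment that
 * the scaling shifts add is the "roundoff" term from the vector spec,
 * selected by vxrm (0..3 = rnu/rne/rdn/rod).  The helpers above get the
 * same increment from get_round(), defined earlier in this file; this
 * standalone model only spells the four rules out for a 64-bit value.
 */
static inline uint64_t example_roundoff_u64(int vxrm, uint64_t v,
                                            unsigned shift)
{
    uint64_t d, d_minus_1, lsbs;

    if (shift == 0) {
        return 0;                             /* nothing is shifted out    */
    }
    d = (v >> shift) & 1;                     /* LSB of the kept part      */
    d_minus_1 = (v >> (shift - 1)) & 1;       /* MSB of the discarded part */
    lsbs = v & ((UINT64_C(1) << (shift - 1)) - 1); /* rest of discarded part */

    switch (vxrm) {
    case 0: /* rnu: round-to-nearest-up */
        return d_minus_1;
    case 1: /* rne: round-to-nearest-even */
        return d_minus_1 & (lsbs != 0 || d);
    case 3: /* rod: round-to-odd, "jam" lost bits into the LSB */
        return !d && (d_minus_1 || lsbs != 0);
    default: /* rdn: round-down, i.e. truncate */
        return 0;
    }
}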
2671 
2672 /* Vector Narrowing Fixed-Point Clip Instructions */
2673 static inline int8_t
2674 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2675 {
2676     uint8_t round, shift = b & 0xf;
2677     int16_t res;
2678 
2679     round = get_round(vxrm, a, shift);
2680     res = (a >> shift) + round;
2681     if (res > INT8_MAX) {
2682         env->vxsat = 0x1;
2683         return INT8_MAX;
2684     } else if (res < INT8_MIN) {
2685         env->vxsat = 0x1;
2686         return INT8_MIN;
2687     } else {
2688         return res;
2689     }
2690 }
2691 
2692 static inline int16_t
2693 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2694 {
2695     uint8_t round, shift = b & 0x1f;
2696     int32_t res;
2697 
2698     round = get_round(vxrm, a, shift);
2699     res = (a >> shift) + round;
2700     if (res > INT16_MAX) {
2701         env->vxsat = 0x1;
2702         return INT16_MAX;
2703     } else if (res < INT16_MIN) {
2704         env->vxsat = 0x1;
2705         return INT16_MIN;
2706     } else {
2707         return res;
2708     }
2709 }
2710 
2711 static inline int32_t
2712 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2713 {
2714     uint8_t round, shift = b & 0x3f;
2715     int64_t res;
2716 
2717     round = get_round(vxrm, a, shift);
2718     res = (a >> shift) + round;
2719     if (res > INT32_MAX) {
2720         env->vxsat = 0x1;
2721         return INT32_MAX;
2722     } else if (res < INT32_MIN) {
2723         env->vxsat = 0x1;
2724         return INT32_MIN;
2725     } else {
2726         return res;
2727     }
2728 }
2729 
2730 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2731 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2732 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2733 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2734 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2735 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2736 
2737 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2738 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2739 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2740 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2741 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2742 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2743 
2744 static inline uint8_t
2745 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2746 {
2747     uint8_t round, shift = b & 0xf;
2748     uint16_t res;
2749 
2750     round = get_round(vxrm, a, shift);
2751     res = (a >> shift) + round;
2752     if (res > UINT8_MAX) {
2753         env->vxsat = 0x1;
2754         return UINT8_MAX;
2755     } else {
2756         return res;
2757     }
2758 }
2759 
2760 static inline uint16_t
2761 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2762 {
2763     uint8_t round, shift = b & 0x1f;
2764     uint32_t res;
2765 
2766     round = get_round(vxrm, a, shift);
2767     res = (a >> shift) + round;
2768     if (res > UINT16_MAX) {
2769         env->vxsat = 0x1;
2770         return UINT16_MAX;
2771     } else {
2772         return res;
2773     }
2774 }
2775 
2776 static inline uint32_t
2777 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2778 {
2779     uint8_t round, shift = b & 0x3f;
2780     uint64_t res;
2781 
2782     round = get_round(vxrm, a, shift);
2783     res = (a >> shift) + round;
2784     if (res > UINT32_MAX) {
2785         env->vxsat = 0x1;
2786         return UINT32_MAX;
2787     } else {
2788         return res;
2789     }
2790 }
2791 
2792 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2793 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2794 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2795 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2796 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2797 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2798 
2799 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2800 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2801 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2802 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2803 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2804 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
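
/*
 * Illustrative sketch (hypothetical name): one vnclipu element step for
 * SEW=8 with round-to-nearest-up hard-coded, mirroring vnclipu8() above:
 * shift the 2*SEW source right, add the rounding increment, then
 * saturate into the narrow unsigned range.
 */
static inline uint8_t example_vnclipu8_rnu(uint16_t a, uint8_t shamt, bool *sat)
{
    unsigned shift = shamt & 0xf;                  /* low log2(2*SEW) bits */
    unsigned round = shift ? (a >> (shift - 1)) & 1 : 0;
    uint32_t res = ((uint32_t)a >> shift) + round; /* shift, then round up */

    if (res > UINT8_MAX) {                         /* narrowing saturation */
        *sat = true;
        return UINT8_MAX;
    }
    return (uint8_t)res;
}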
2805 
2806 /*
2807  * Vector Floating-Point Arithmetic Instructions
2808  */
2809 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2810 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2811 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2812                       CPURISCVState *env)                      \
2813 {                                                              \
2814     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2815     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2816     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2817 }
2818 
2819 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
2820 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2821                   void *vs2, CPURISCVState *env,          \
2822                   uint32_t desc)                          \
2823 {                                                         \
2824     uint32_t vm = vext_vm(desc);                          \
2825     uint32_t vl = env->vl;                                \
2826     uint32_t total_elems =                                \
2827         vext_get_total_elems(env, desc, ESZ);             \
2828     uint32_t vta = vext_vta(desc);                        \
2829     uint32_t vma = vext_vma(desc);                        \
2830     uint32_t i;                                           \
2831                                                           \
2832     for (i = env->vstart; i < vl; i++) {                  \
2833         if (!vm && !vext_elem_mask(v0, i)) {              \
2834             /* set masked-off elements to 1s */           \
2835             vext_set_elems_1s(vd, vma, i * ESZ,           \
2836                               (i + 1) * ESZ);             \
2837             continue;                                     \
2838         }                                                 \
2839         do_##NAME(vd, vs1, vs2, i, env);                  \
2840     }                                                     \
2841     env->vstart = 0;                                      \
2842     /* set tail elements to 1s */                         \
2843     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2844                       total_elems * ESZ);                 \
2845 }
2846 
2847 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2848 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2849 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2850 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
2851 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
2852 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
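
/*
 * Illustrative sketch of the masking and tail policy that the
 * GEN_VEXT_VV_ENV loop above implements, written out on a plain byte
 * array instead of vector register state (all names hypothetical):
 * masked-off body elements are filled with 1s when VMA is agnostic and
 * left untouched otherwise, and tail elements are filled with 1s when
 * VTA is agnostic.
 */
static inline void example_mask_tail_policy(uint8_t *vd, const bool *mask,
                                            const uint8_t *vs1,
                                            const uint8_t *vs2,
                                            uint32_t vstart, uint32_t vl,
                                            uint32_t vlmax, bool vm,
                                            bool vma, bool vta)
{
    uint32_t i;

    for (i = vstart; i < vl; i++) {
        if (!vm && !mask[i]) {
            if (vma) {
                vd[i] = 0xff;          /* mask-agnostic: overwrite with 1s */
            }
            continue;                  /* mask-undisturbed: keep old value */
        }
        vd[i] = vs1[i] + vs2[i];       /* stand-in for the real FP op      */
    }
    if (vta) {
        for (i = vl; i < vlmax; i++) {
            vd[i] = 0xff;              /* tail-agnostic: overwrite with 1s */
        }
    }
}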
2853 
2854 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2855 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2856                       CPURISCVState *env)                      \
2857 {                                                              \
2858     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2859     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2860 }
2861 
2862 #define GEN_VEXT_VF(NAME, ESZ)                            \
2863 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2864                   void *vs2, CPURISCVState *env,          \
2865                   uint32_t desc)                          \
2866 {                                                         \
2867     uint32_t vm = vext_vm(desc);                          \
2868     uint32_t vl = env->vl;                                \
2869     uint32_t total_elems =                                \
2870         vext_get_total_elems(env, desc, ESZ);             \
2871     uint32_t vta = vext_vta(desc);                        \
2872     uint32_t vma = vext_vma(desc);                        \
2873     uint32_t i;                                           \
2874                                                           \
2875     for (i = env->vstart; i < vl; i++) {                  \
2876         if (!vm && !vext_elem_mask(v0, i)) {              \
2877             /* set masked-off elements to 1s */           \
2878             vext_set_elems_1s(vd, vma, i * ESZ,           \
2879                               (i + 1) * ESZ);             \
2880             continue;                                     \
2881         }                                                 \
2882         do_##NAME(vd, s1, vs2, i, env);                   \
2883     }                                                     \
2884     env->vstart = 0;                                      \
2885     /* set tail elements to 1s */                         \
2886     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2887                       total_elems * ESZ);                 \
2888 }
2889 
2890 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
2891 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
2892 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
2893 GEN_VEXT_VF(vfadd_vf_h, 2)
2894 GEN_VEXT_VF(vfadd_vf_w, 4)
2895 GEN_VEXT_VF(vfadd_vf_d, 8)
2896 
2897 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
2898 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
2899 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
2900 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
2901 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
2902 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
2903 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
2904 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
2905 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
2906 GEN_VEXT_VF(vfsub_vf_h, 2)
2907 GEN_VEXT_VF(vfsub_vf_w, 4)
2908 GEN_VEXT_VF(vfsub_vf_d, 8)
2909 
2910 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
2911 {
2912     return float16_sub(b, a, s);
2913 }
2914 
2915 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
2916 {
2917     return float32_sub(b, a, s);
2918 }
2919 
2920 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
2921 {
2922     return float64_sub(b, a, s);
2923 }
2924 
2925 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
2926 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
2927 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
2928 GEN_VEXT_VF(vfrsub_vf_h, 2)
2929 GEN_VEXT_VF(vfrsub_vf_w, 4)
2930 GEN_VEXT_VF(vfrsub_vf_d, 8)
2931 
2932 /* Vector Widening Floating-Point Add/Subtract Instructions */
2933 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
2934 {
2935     return float32_add(float16_to_float32(a, true, s),
2936                        float16_to_float32(b, true, s), s);
2937 }
2938 
2939 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
2940 {
2941     return float64_add(float32_to_float64(a, s),
2942                        float32_to_float64(b, s), s);
2944 }
2945 
2946 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
2947 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
2948 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
2949 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
2950 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
2951 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
2952 GEN_VEXT_VF(vfwadd_vf_h, 4)
2953 GEN_VEXT_VF(vfwadd_vf_w, 8)
2954 
2955 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
2956 {
2957     return float32_sub(float16_to_float32(a, true, s),
2958                        float16_to_float32(b, true, s), s);
2959 }
2960 
2961 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
2962 {
2963     return float64_sub(float32_to_float64(a, s),
2964                        float32_to_float64(b, s), s);
2966 }
2967 
2968 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
2969 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
2970 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
2971 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
2972 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
2973 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
2974 GEN_VEXT_VF(vfwsub_vf_h, 4)
2975 GEN_VEXT_VF(vfwsub_vf_w, 8)
2976 
2977 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
2978 {
2979     return float32_add(a, float16_to_float32(b, true, s), s);
2980 }
2981 
2982 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
2983 {
2984     return float64_add(a, float32_to_float64(b, s), s);
2985 }
2986 
2987 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
2988 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
2989 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
2990 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
2991 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
2992 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
2993 GEN_VEXT_VF(vfwadd_wf_h, 4)
2994 GEN_VEXT_VF(vfwadd_wf_w, 8)
2995 
2996 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
2997 {
2998     return float32_sub(a, float16_to_float32(b, true, s), s);
2999 }
3000 
3001 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3002 {
3003     return float64_sub(a, float32_to_float64(b, s), s);
3004 }
3005 
3006 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3007 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3008 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3009 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3010 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3011 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3012 GEN_VEXT_VF(vfwsub_wf_h, 4)
3013 GEN_VEXT_VF(vfwsub_wf_w, 8)
3014 
3015 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3016 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3017 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3018 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3019 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3020 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3021 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3022 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3023 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3024 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3025 GEN_VEXT_VF(vfmul_vf_h, 2)
3026 GEN_VEXT_VF(vfmul_vf_w, 4)
3027 GEN_VEXT_VF(vfmul_vf_d, 8)
3028 
3029 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3030 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3031 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3032 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3033 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3034 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3035 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3036 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3037 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3038 GEN_VEXT_VF(vfdiv_vf_h, 2)
3039 GEN_VEXT_VF(vfdiv_vf_w, 4)
3040 GEN_VEXT_VF(vfdiv_vf_d, 8)
3041 
3042 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3043 {
3044     return float16_div(b, a, s);
3045 }
3046 
3047 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3048 {
3049     return float32_div(b, a, s);
3050 }
3051 
3052 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3053 {
3054     return float64_div(b, a, s);
3055 }
3056 
3057 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3058 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3059 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3060 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3061 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3062 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3063 
3064 /* Vector Widening Floating-Point Multiply */
3065 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3066 {
3067     return float32_mul(float16_to_float32(a, true, s),
3068                        float16_to_float32(b, true, s), s);
3069 }
3070 
3071 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3072 {
3073     return float64_mul(float32_to_float64(a, s),
3074                        float32_to_float64(b, s), s);
3076 }

3077 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3078 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3079 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3080 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3081 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3082 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3083 GEN_VEXT_VF(vfwmul_vf_h, 4)
3084 GEN_VEXT_VF(vfwmul_vf_w, 8)
3085 
3086 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3087 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3088 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3089                       CPURISCVState *env)                          \
3090 {                                                                  \
3091     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3092     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3093     TD d = *((TD *)vd + HD(i));                                    \
3094     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3095 }
3096 
3097 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3098 {
3099     return float16_muladd(a, b, d, 0, s);
3100 }
3101 
3102 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3103 {
3104     return float32_muladd(a, b, d, 0, s);
3105 }
3106 
3107 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3108 {
3109     return float64_muladd(a, b, d, 0, s);
3110 }
3111 
3112 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3113 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3114 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3115 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3116 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3117 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3118 
3119 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3120 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3121                       CPURISCVState *env)                         \
3122 {                                                                 \
3123     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3124     TD d = *((TD *)vd + HD(i));                                   \
3125     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3126 }
3127 
3128 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3129 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3130 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3131 GEN_VEXT_VF(vfmacc_vf_h, 2)
3132 GEN_VEXT_VF(vfmacc_vf_w, 4)
3133 GEN_VEXT_VF(vfmacc_vf_d, 8)
3134 
3135 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3136 {
3137     return float16_muladd(a, b, d, float_muladd_negate_c |
3138                                    float_muladd_negate_product, s);
3139 }
3140 
3141 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3142 {
3143     return float32_muladd(a, b, d, float_muladd_negate_c |
3144                                    float_muladd_negate_product, s);
3145 }
3146 
3147 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3148 {
3149     return float64_muladd(a, b, d, float_muladd_negate_c |
3150                                    float_muladd_negate_product, s);
3151 }
3152 
3153 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3154 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3155 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3156 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3157 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3158 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3159 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3160 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3161 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3162 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3163 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3164 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3165 
3166 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3167 {
3168     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3169 }
3170 
3171 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3172 {
3173     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3174 }
3175 
3176 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3177 {
3178     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3179 }
3180 
3181 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3182 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3183 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3184 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3185 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3186 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3187 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3188 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3189 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3190 GEN_VEXT_VF(vfmsac_vf_h, 2)
3191 GEN_VEXT_VF(vfmsac_vf_w, 4)
3192 GEN_VEXT_VF(vfmsac_vf_d, 8)
3193 
3194 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3195 {
3196     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3197 }
3198 
3199 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3200 {
3201     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3202 }
3203 
3204 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3205 {
3206     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3207 }
3208 
3209 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3210 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3211 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3212 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3213 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3214 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3215 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3216 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3217 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3218 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3219 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3220 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3221 
3222 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3223 {
3224     return float16_muladd(d, b, a, 0, s);
3225 }
3226 
3227 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3228 {
3229     return float32_muladd(d, b, a, 0, s);
3230 }
3231 
3232 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3233 {
3234     return float64_muladd(d, b, a, 0, s);
3235 }
3236 
3237 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3238 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3239 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3240 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3241 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3242 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3243 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3244 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3245 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3246 GEN_VEXT_VF(vfmadd_vf_h, 2)
3247 GEN_VEXT_VF(vfmadd_vf_w, 4)
3248 GEN_VEXT_VF(vfmadd_vf_d, 8)
3249 
3250 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3251 {
3252     return float16_muladd(d, b, a, float_muladd_negate_c |
3253                                    float_muladd_negate_product, s);
3254 }
3255 
3256 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3257 {
3258     return float32_muladd(d, b, a, float_muladd_negate_c |
3259                                    float_muladd_negate_product, s);
3260 }
3261 
3262 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3263 {
3264     return float64_muladd(d, b, a, float_muladd_negate_c |
3265                                    float_muladd_negate_product, s);
3266 }
3267 
3268 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3269 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3270 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3271 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3272 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3273 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3274 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3275 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3276 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3277 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3278 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3279 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3280 
3281 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3282 {
3283     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3284 }
3285 
3286 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3287 {
3288     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3289 }
3290 
3291 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3292 {
3293     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3294 }
3295 
3296 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3297 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3298 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3299 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3300 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3301 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3302 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3303 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3304 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3305 GEN_VEXT_VF(vfmsub_vf_h, 2)
3306 GEN_VEXT_VF(vfmsub_vf_w, 4)
3307 GEN_VEXT_VF(vfmsub_vf_d, 8)
3308 
3309 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3310 {
3311     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3312 }
3313 
3314 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3315 {
3316     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3317 }
3318 
3319 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3320 {
3321     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3322 }
3323 
3324 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3325 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3326 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3327 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3328 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3329 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3330 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3331 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3332 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3333 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3334 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3335 GEN_VEXT_VF(vfnmsub_vf_d, 8)
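
/*
 * Summary of the eight single-width FMA flavours implemented above
 * (vd is the destination/accumulator, vs1/rs1 and vs2 the sources):
 *
 *   vfmacc:   vd =  (vs1 * vs2) + vd     flags: 0
 *   vfnmacc:  vd = -(vs1 * vs2) - vd     flags: negate_product | negate_c
 *   vfmsac:   vd =  (vs1 * vs2) - vd     flags: negate_c
 *   vfnmsac:  vd = -(vs1 * vs2) + vd     flags: negate_product
 *   vfmadd:   vd =  (vd * vs1) + vs2     flags: 0
 *   vfnmadd:  vd = -(vd * vs1) - vs2     flags: negate_product | negate_c
 *   vfmsub:   vd =  (vd * vs1) - vs2     flags: negate_c
 *   vfnmsub:  vd = -(vd * vs1) + vs2     flags: negate_product
 *
 * The two groups differ only in which operand is multiplied and which is
 * added: the *macc/*msac helpers pass (a, b, d) to float*_muladd() while
 * the *madd/*msub helpers pass (d, b, a).  Illustrative sketch
 * (hypothetical name):
 */
static inline float32 example_fma_operand_order(float32 vs2, float32 vs1,
                                                float32 vd, bool madd_form,
                                                float_status *s)
{
    if (madd_form) {
        /* vfmadd-style: multiply the accumulator, add vs2 */
        return float32_muladd(vd, vs1, vs2, 0, s);
    }
    /* vfmacc-style: multiply the two sources, add the accumulator */
    return float32_muladd(vs2, vs1, vd, 0, s);
}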
3336 
3337 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3338 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3339 {
3340     return float32_muladd(float16_to_float32(a, true, s),
3341                           float16_to_float32(b, true, s), d, 0, s);
3342 }
3343 
3344 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3345 {
3346     return float64_muladd(float32_to_float64(a, s),
3347                           float32_to_float64(b, s), d, 0, s);
3348 }
3349 
3350 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3351 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3352 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3353 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3354 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3355 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3356 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3357 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3358 
3359 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3360 {
3361     return float32_muladd(bfloat16_to_float32(a, s),
3362                           bfloat16_to_float32(b, s), d, 0, s);
3363 }
3364 
3365 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3366 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3367 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3368 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3369 
3370 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3371 {
3372     return float32_muladd(float16_to_float32(a, true, s),
3373                           float16_to_float32(b, true, s), d,
3374                           float_muladd_negate_c | float_muladd_negate_product,
3375                           s);
3376 }
3377 
3378 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3379 {
3380     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3381                           d, float_muladd_negate_c |
3382                              float_muladd_negate_product, s);
3383 }
3384 
3385 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3386 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3387 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3388 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3389 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3390 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3391 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3392 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3393 
3394 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3395 {
3396     return float32_muladd(float16_to_float32(a, true, s),
3397                           float16_to_float32(b, true, s), d,
3398                           float_muladd_negate_c, s);
3399 }
3400 
3401 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3402 {
3403     return float64_muladd(float32_to_float64(a, s),
3404                           float32_to_float64(b, s), d,
3405                           float_muladd_negate_c, s);
3406 }
3407 
3408 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3409 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3410 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3411 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3412 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3413 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3414 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3415 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3416 
3417 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3418 {
3419     return float32_muladd(float16_to_float32(a, true, s),
3420                           float16_to_float32(b, true, s), d,
3421                           float_muladd_negate_product, s);
3422 }
3423 
3424 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3425 {
3426     return float64_muladd(float32_to_float64(a, s),
3427                           float32_to_float64(b, s), d,
3428                           float_muladd_negate_product, s);
3429 }
3430 
3431 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3432 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3433 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3434 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3435 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3436 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3437 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3438 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3439 
3440 /* Vector Floating-Point Square-Root Instruction */
3441 /* (TD, T2, TX2) */
3442 #define OP_UU_H uint16_t, uint16_t, uint16_t
3443 #define OP_UU_W uint32_t, uint32_t, uint32_t
3444 #define OP_UU_D uint64_t, uint64_t, uint64_t
3445 
3446 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3447 static void do_##NAME(void *vd, void *vs2, int i,      \
3448                       CPURISCVState *env)              \
3449 {                                                      \
3450     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3451     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3452 }
3453 
3454 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3455 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3456                   CPURISCVState *env, uint32_t desc)   \
3457 {                                                      \
3458     uint32_t vm = vext_vm(desc);                       \
3459     uint32_t vl = env->vl;                             \
3460     uint32_t total_elems =                             \
3461         vext_get_total_elems(env, desc, ESZ);          \
3462     uint32_t vta = vext_vta(desc);                     \
3463     uint32_t vma = vext_vma(desc);                     \
3464     uint32_t i;                                        \
3465                                                        \
3466     if (vl == 0) {                                     \
3467         return;                                        \
3468     }                                                  \
3469     for (i = env->vstart; i < vl; i++) {               \
3470         if (!vm && !vext_elem_mask(v0, i)) {           \
3471             /* set masked-off elements to 1s */        \
3472             vext_set_elems_1s(vd, vma, i * ESZ,        \
3473                               (i + 1) * ESZ);          \
3474             continue;                                  \
3475         }                                              \
3476         do_##NAME(vd, vs2, i, env);                    \
3477     }                                                  \
3478     env->vstart = 0;                                   \
3479     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3480                       total_elems * ESZ);              \
3481 }
3482 
3483 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3484 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3485 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3486 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3487 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3488 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3489 
3490 /*
3491  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3492  *
3493  * Adapted from riscv-v-spec recip.c:
3494  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3495  */
3496 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3497 {
3498     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3499     uint64_t exp = extract64(f, frac_size, exp_size);
3500     uint64_t frac = extract64(f, 0, frac_size);
3501 
3502     const uint8_t lookup_table[] = {
3503         52, 51, 50, 48, 47, 46, 44, 43,
3504         42, 41, 40, 39, 38, 36, 35, 34,
3505         33, 32, 31, 30, 30, 29, 28, 27,
3506         26, 25, 24, 23, 23, 22, 21, 20,
3507         19, 19, 18, 17, 16, 16, 15, 14,
3508         14, 13, 12, 12, 11, 10, 10, 9,
3509         9, 8, 7, 7, 6, 6, 5, 4,
3510         4, 3, 3, 2, 2, 1, 1, 0,
3511         127, 125, 123, 121, 119, 118, 116, 114,
3512         113, 111, 109, 108, 106, 105, 103, 102,
3513         100, 99, 97, 96, 95, 93, 92, 91,
3514         90, 88, 87, 86, 85, 84, 83, 82,
3515         80, 79, 78, 77, 76, 75, 74, 73,
3516         72, 71, 70, 70, 69, 68, 67, 66,
3517         65, 64, 63, 63, 62, 61, 60, 59,
3518         59, 58, 57, 56, 56, 55, 54, 53
3519     };
3520     const int precision = 7;
3521 
3522     if (exp == 0 && frac != 0) { /* subnormal */
3523         /* Normalize the subnormal. */
3524         while (extract64(frac, frac_size - 1, 1) == 0) {
3525             exp--;
3526             frac <<= 1;
3527         }
3528 
3529         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3530     }
3531 
3532     int idx = ((exp & 1) << (precision - 1)) |
3533               (frac >> (frac_size - precision + 1));
3534     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3535                         (frac_size - precision);
3536     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3537 
3538     uint64_t val = 0;
3539     val = deposit64(val, 0, frac_size, out_frac);
3540     val = deposit64(val, frac_size, exp_size, out_exp);
3541     val = deposit64(val, frac_size + exp_size, 1, sign);
3542     return val;
3543 }
3544 
3545 static float16 frsqrt7_h(float16 f, float_status *s)
3546 {
3547     int exp_size = 5, frac_size = 10;
3548     bool sign = float16_is_neg(f);
3549 
3550     /*
3551      * frsqrt7(sNaN) = canonical NaN
3552      * frsqrt7(-inf) = canonical NaN
3553      * frsqrt7(-normal) = canonical NaN
3554      * frsqrt7(-subnormal) = canonical NaN
3555      */
3556     if (float16_is_signaling_nan(f, s) ||
3557         (float16_is_infinity(f) && sign) ||
3558         (float16_is_normal(f) && sign) ||
3559         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3560         s->float_exception_flags |= float_flag_invalid;
3561         return float16_default_nan(s);
3562     }
3563 
3564     /* frsqrt7(qNaN) = canonical NaN */
3565     if (float16_is_quiet_nan(f, s)) {
3566         return float16_default_nan(s);
3567     }
3568 
3569     /* frsqrt7(+-0) = +-inf */
3570     if (float16_is_zero(f)) {
3571         s->float_exception_flags |= float_flag_divbyzero;
3572         return float16_set_sign(float16_infinity, sign);
3573     }
3574 
3575     /* frsqrt7(+inf) = +0 */
3576     if (float16_is_infinity(f) && !sign) {
3577         return float16_set_sign(float16_zero, sign);
3578     }
3579 
3580     /* +normal, +subnormal */
3581     uint64_t val = frsqrt7(f, exp_size, frac_size);
3582     return make_float16(val);
3583 }
3584 
3585 static float32 frsqrt7_s(float32 f, float_status *s)
3586 {
3587     int exp_size = 8, frac_size = 23;
3588     bool sign = float32_is_neg(f);
3589 
3590     /*
3591      * frsqrt7(sNaN) = canonical NaN
3592      * frsqrt7(-inf) = canonical NaN
3593      * frsqrt7(-normal) = canonical NaN
3594      * frsqrt7(-subnormal) = canonical NaN
3595      */
3596     if (float32_is_signaling_nan(f, s) ||
3597         (float32_is_infinity(f) && sign) ||
3598         (float32_is_normal(f) && sign) ||
3599         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3600         s->float_exception_flags |= float_flag_invalid;
3601         return float32_default_nan(s);
3602     }
3603 
3604     /* frsqrt7(qNaN) = canonical NaN */
3605     if (float32_is_quiet_nan(f, s)) {
3606         return float32_default_nan(s);
3607     }
3608 
3609     /* frsqrt7(+-0) = +-inf */
3610     if (float32_is_zero(f)) {
3611         s->float_exception_flags |= float_flag_divbyzero;
3612         return float32_set_sign(float32_infinity, sign);
3613     }
3614 
3615     /* frsqrt7(+inf) = +0 */
3616     if (float32_is_infinity(f) && !sign) {
3617         return float32_set_sign(float32_zero, sign);
3618     }
3619 
3620     /* +normal, +subnormal */
3621     uint64_t val = frsqrt7(f, exp_size, frac_size);
3622     return make_float32(val);
3623 }
3624 
3625 static float64 frsqrt7_d(float64 f, float_status *s)
3626 {
3627     int exp_size = 11, frac_size = 52;
3628     bool sign = float64_is_neg(f);
3629 
3630     /*
3631      * frsqrt7(sNaN) = canonical NaN
3632      * frsqrt7(-inf) = canonical NaN
3633      * frsqrt7(-normal) = canonical NaN
3634      * frsqrt7(-subnormal) = canonical NaN
3635      */
3636     if (float64_is_signaling_nan(f, s) ||
3637         (float64_is_infinity(f) && sign) ||
3638         (float64_is_normal(f) && sign) ||
3639         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3640         s->float_exception_flags |= float_flag_invalid;
3641         return float64_default_nan(s);
3642     }
3643 
3644     /* frsqrt7(qNaN) = canonical NaN */
3645     if (float64_is_quiet_nan(f, s)) {
3646         return float64_default_nan(s);
3647     }
3648 
3649     /* frsqrt7(+-0) = +-inf */
3650     if (float64_is_zero(f)) {
3651         s->float_exception_flags |= float_flag_divbyzero;
3652         return float64_set_sign(float64_infinity, sign);
3653     }
3654 
3655     /* frsqrt7(+inf) = +0 */
3656     if (float64_is_infinity(f) && !sign) {
3657         return float64_set_sign(float64_zero, sign);
3658     }
3659 
3660     /* +normal, +subnormal */
3661     uint64_t val = frsqrt7(f, exp_size, frac_size);
3662     return make_float64(val);
3663 }
3664 
3665 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3666 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3667 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3668 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3669 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3670 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
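
/*
 * Illustrative sketch (hypothetical name, debugging aid only): for a
 * positive normal input the 7-bit estimate tracks 1/sqrt(x) closely.
 * For an input of 1.0f, frsqrt7_s() returns 0.99609375 (exponent 126,
 * fraction 127 << 16), a relative error of about 0.4%, consistent with
 * the roughly 2^-7 accuracy the estimate is designed for.
 */
static inline bool example_frsqrt7_close(float x, float_status *s)
{
    union { float f; uint32_t u; } in, out;

    in.f = x;                            /* reinterpret as float32 bits */
    out.u = frsqrt7_s(in.u, s);
    return fabsf(out.f * sqrtf(x) - 1.0f) < 1.0f / 64;
}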
3671 
3672 /*
3673  * Vector Floating-Point Reciprocal Estimate Instruction
3674  *
3675  * Adapted from riscv-v-spec recip.c:
3676  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3677  */
3678 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3679                       float_status *s)
3680 {
3681     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3682     uint64_t exp = extract64(f, frac_size, exp_size);
3683     uint64_t frac = extract64(f, 0, frac_size);
3684 
3685     const uint8_t lookup_table[] = {
3686         127, 125, 123, 121, 119, 117, 116, 114,
3687         112, 110, 109, 107, 105, 104, 102, 100,
3688         99, 97, 96, 94, 93, 91, 90, 88,
3689         87, 85, 84, 83, 81, 80, 79, 77,
3690         76, 75, 74, 72, 71, 70, 69, 68,
3691         66, 65, 64, 63, 62, 61, 60, 59,
3692         58, 57, 56, 55, 54, 53, 52, 51,
3693         50, 49, 48, 47, 46, 45, 44, 43,
3694         42, 41, 40, 40, 39, 38, 37, 36,
3695         35, 35, 34, 33, 32, 31, 31, 30,
3696         29, 28, 28, 27, 26, 25, 25, 24,
3697         23, 23, 22, 21, 21, 20, 19, 19,
3698         18, 17, 17, 16, 15, 15, 14, 14,
3699         13, 12, 12, 11, 11, 10, 9, 9,
3700         8, 8, 7, 7, 6, 5, 5, 4,
3701         4, 3, 3, 2, 2, 1, 1, 0
3702     };
3703     const int precision = 7;
3704 
3705     if (exp == 0 && frac != 0) { /* subnormal */
3706         /* Normalize the subnormal. */
3707         while (extract64(frac, frac_size - 1, 1) == 0) {
3708             exp--;
3709             frac <<= 1;
3710         }
3711 
3712         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3713 
3714         if (exp != 0 && exp != UINT64_MAX) {
3715             /*
3716              * Overflow to inf or max value of same sign,
3717              * depending on sign and rounding mode.
3718              */
3719             s->float_exception_flags |= (float_flag_inexact |
3720                                          float_flag_overflow);
3721 
3722             if ((s->float_rounding_mode == float_round_to_zero) ||
3723                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3724                 ((s->float_rounding_mode == float_round_up) && sign)) {
3725                 /* Return greatest/negative finite value. */
3726                 return (sign << (exp_size + frac_size)) |
3727                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3728             } else {
3729                 /* Return +-inf. */
3730                 return (sign << (exp_size + frac_size)) |
3731                        MAKE_64BIT_MASK(frac_size, exp_size);
3732             }
3733         }
3734     }
3735 
3736     int idx = frac >> (frac_size - precision);
3737     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3738                         (frac_size - precision);
3739     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3740 
3741     if (out_exp == 0 || out_exp == UINT64_MAX) {
3742         /*
3743          * The result is subnormal, but don't raise the underflow exception,
3744          * because there's no additional loss of precision.
3745          */
3746         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3747         if (out_exp == UINT64_MAX) {
3748             out_frac >>= 1;
3749             out_exp = 0;
3750         }
3751     }
3752 
3753     uint64_t val = 0;
3754     val = deposit64(val, 0, frac_size, out_frac);
3755     val = deposit64(val, frac_size, exp_size, out_exp);
3756     val = deposit64(val, frac_size + exp_size, 1, sign);
3757     return val;
3758 }
3759 
3760 static float16 frec7_h(float16 f, float_status *s)
3761 {
3762     int exp_size = 5, frac_size = 10;
3763     bool sign = float16_is_neg(f);
3764 
3765     /* frec7(+-inf) = +-0 */
3766     if (float16_is_infinity(f)) {
3767         return float16_set_sign(float16_zero, sign);
3768     }
3769 
3770     /* frec7(+-0) = +-inf */
3771     if (float16_is_zero(f)) {
3772         s->float_exception_flags |= float_flag_divbyzero;
3773         return float16_set_sign(float16_infinity, sign);
3774     }
3775 
3776     /* frec7(sNaN) = canonical NaN */
3777     if (float16_is_signaling_nan(f, s)) {
3778         s->float_exception_flags |= float_flag_invalid;
3779         return float16_default_nan(s);
3780     }
3781 
3782     /* frec7(qNaN) = canonical NaN */
3783     if (float16_is_quiet_nan(f, s)) {
3784         return float16_default_nan(s);
3785     }
3786 
3787     /* +-normal, +-subnormal */
3788     uint64_t val = frec7(f, exp_size, frac_size, s);
3789     return make_float16(val);
3790 }
3791 
3792 static float32 frec7_s(float32 f, float_status *s)
3793 {
3794     int exp_size = 8, frac_size = 23;
3795     bool sign = float32_is_neg(f);
3796 
3797     /* frec7(+-inf) = +-0 */
3798     if (float32_is_infinity(f)) {
3799         return float32_set_sign(float32_zero, sign);
3800     }
3801 
3802     /* frec7(+-0) = +-inf */
3803     if (float32_is_zero(f)) {
3804         s->float_exception_flags |= float_flag_divbyzero;
3805         return float32_set_sign(float32_infinity, sign);
3806     }
3807 
3808     /* frec7(sNaN) = canonical NaN */
3809     if (float32_is_signaling_nan(f, s)) {
3810         s->float_exception_flags |= float_flag_invalid;
3811         return float32_default_nan(s);
3812     }
3813 
3814     /* frec7(qNaN) = canonical NaN */
3815     if (float32_is_quiet_nan(f, s)) {
3816         return float32_default_nan(s);
3817     }
3818 
3819     /* +-normal, +-subnormal */
3820     uint64_t val = frec7(f, exp_size, frac_size, s);
3821     return make_float32(val);
3822 }
3823 
3824 static float64 frec7_d(float64 f, float_status *s)
3825 {
3826     int exp_size = 11, frac_size = 52;
3827     bool sign = float64_is_neg(f);
3828 
3829     /* frec7(+-inf) = +-0 */
3830     if (float64_is_infinity(f)) {
3831         return float64_set_sign(float64_zero, sign);
3832     }
3833 
3834     /* frec7(+-0) = +-inf */
3835     if (float64_is_zero(f)) {
3836         s->float_exception_flags |= float_flag_divbyzero;
3837         return float64_set_sign(float64_infinity, sign);
3838     }
3839 
3840     /* frec7(sNaN) = canonical NaN */
3841     if (float64_is_signaling_nan(f, s)) {
3842         s->float_exception_flags |= float_flag_invalid;
3843         return float64_default_nan(s);
3844     }
3845 
3846     /* frec7(qNaN) = canonical NaN */
3847     if (float64_is_quiet_nan(f, s)) {
3848         return float64_default_nan(s);
3849     }
3850 
3851     /* +-normal, +-subnormal */
3852     uint64_t val = frec7(f, exp_size, frac_size, s);
3853     return make_float64(val);
3854 }
3855 
3856 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3857 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3858 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3859 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
3860 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
3861 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
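
/*
 * Illustrative sketch (hypothetical name, debugging aid only): for a
 * positive normal input whose reciprocal is also normal, the 7-bit
 * estimate tracks 1/x closely.  For an input of 2.0f, frec7_s() returns
 * 0.498046875 (exponent 125, fraction 127 << 16).
 */
static inline bool example_frec7_close(float x, float_status *s)
{
    union { float f; uint32_t u; } in, out;

    in.f = x;                            /* reinterpret as float32 bits */
    out.u = frec7_s(in.u, s);
    return fabsf(out.f * x - 1.0f) < 1.0f / 64;
}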
3862 
3863 /* Vector Floating-Point MIN/MAX Instructions */
3864 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3865 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3866 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3867 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
3868 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
3869 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
3870 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3871 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3872 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3873 GEN_VEXT_VF(vfmin_vf_h, 2)
3874 GEN_VEXT_VF(vfmin_vf_w, 4)
3875 GEN_VEXT_VF(vfmin_vf_d, 8)
3876 
3877 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3878 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3879 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3880 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
3881 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
3882 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
3883 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3884 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3885 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3886 GEN_VEXT_VF(vfmax_vf_h, 2)
3887 GEN_VEXT_VF(vfmax_vf_w, 4)
3888 GEN_VEXT_VF(vfmax_vf_d, 8)
3889 
3890 /* Vector Floating-Point Sign-Injection Instructions */
3891 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3892 {
3893     return deposit64(b, 0, 15, a);
3894 }
3895 
3896 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3897 {
3898     return deposit64(b, 0, 31, a);
3899 }
3900 
3901 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3902 {
3903     return deposit64(b, 0, 63, a);
3904 }
3905 
3906 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3907 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3908 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3909 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
3910 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
3911 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
3912 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3913 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3914 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3915 GEN_VEXT_VF(vfsgnj_vf_h, 2)
3916 GEN_VEXT_VF(vfsgnj_vf_w, 4)
3917 GEN_VEXT_VF(vfsgnj_vf_d, 8)
3918 
3919 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3920 {
3921     return deposit64(~b, 0, 15, a);
3922 }
3923 
3924 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3925 {
3926     return deposit64(~b, 0, 31, a);
3927 }
3928 
3929 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3930 {
3931     return deposit64(~b, 0, 63, a);
3932 }
3933 
3934 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
3935 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
3936 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
3937 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
3938 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
3939 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
3940 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
3941 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
3942 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
3943 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
3944 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
3945 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
3946 
3947 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
3948 {
3949     return deposit64(b ^ a, 0, 15, a);
3950 }
3951 
3952 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
3953 {
3954     return deposit64(b ^ a, 0, 31, a);
3955 }
3956 
3957 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
3958 {
3959     return deposit64(b ^ a, 0, 63, a);
3960 }
3961 
3962 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
3963 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
3964 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
3965 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
3966 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
3967 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
3968 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
3969 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
3970 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
3971 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
3972 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
3973 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
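
/*
 * Illustrative note: the sign-injection helpers above are pure bit
 * manipulation and never raise FP exceptions, and the usual
 * pseudo-instructions fall out of them: vfneg.v is vfsgnjn.vv with both
 * sources equal and vfabs.v is vfsgnjx.vv with both sources equal.
 * Hypothetical single-element sketches:
 */
static inline uint32_t example_fneg32(uint32_t x, float_status *s)
{
    return fsgnjn32(x, x, s);            /* == x ^ 0x80000000 */
}

static inline uint32_t example_fabs32(uint32_t x, float_status *s)
{
    return fsgnjx32(x, x, s);            /* == x & 0x7fffffff */
}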
3974 
3975 /* Vector Floating-Point Compare Instructions */
3976 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
3977 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
3978                   CPURISCVState *env, uint32_t desc)          \
3979 {                                                             \
3980     uint32_t vm = vext_vm(desc);                              \
3981     uint32_t vl = env->vl;                                    \
3982     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
3983     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
3984     uint32_t vma = vext_vma(desc);                            \
3985     uint32_t i;                                               \
3986                                                               \
3987     for (i = env->vstart; i < vl; i++) {                      \
3988         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
3989         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
3990         if (!vm && !vext_elem_mask(v0, i)) {                  \
3991             /* set masked-off elements to 1s */               \
3992             if (vma) {                                        \
3993                 vext_set_elem_mask(vd, i, 1);                 \
3994             }                                                 \
3995             continue;                                         \
3996         }                                                     \
3997         vext_set_elem_mask(vd, i,                             \
3998                            DO_OP(s2, s1, &env->fp_status));   \
3999     }                                                         \
4000     env->vstart = 0;                                          \
4001     /*
4002      * mask destination register is always tail-agnostic
4003      * set tail elements to 1s
4004      */                                                       \
4005     if (vta_all_1s) {                                         \
4006         for (; i < total_elems; i++) {                        \
4007             vext_set_elem_mask(vd, i, 1);                     \
4008         }                                                     \
4009     }                                                         \
4010 }
4011 
4012 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4013 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4014 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4015 
4016 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4017 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4018                   CPURISCVState *env, uint32_t desc)                \
4019 {                                                                   \
4020     uint32_t vm = vext_vm(desc);                                    \
4021     uint32_t vl = env->vl;                                          \
4022     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4023     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4024     uint32_t vma = vext_vma(desc);                                  \
4025     uint32_t i;                                                     \
4026                                                                     \
4027     for (i = env->vstart; i < vl; i++) {                            \
4028         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4029         if (!vm && !vext_elem_mask(v0, i)) {                        \
4030             /* set masked-off elements to 1s */                     \
4031             if (vma) {                                              \
4032                 vext_set_elem_mask(vd, i, 1);                       \
4033             }                                                       \
4034             continue;                                               \
4035         }                                                           \
4036         vext_set_elem_mask(vd, i,                                   \
4037                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4038     }                                                               \
4039     env->vstart = 0;                                                \
4040     /*
4041      * mask destination registers are always tail-agnostic
4042      * set tail elements to 1s
4043      */                                                             \
4044     if (vta_all_1s) {                                               \
4045         for (; i < total_elems; i++) {                              \
4046             vext_set_elem_mask(vd, i, 1);                           \
4047         }                                                           \
4048     }                                                               \
4049 }
4050 
4051 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4052 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4053 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4054 
4055 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4056 {
4057     FloatRelation compare = float16_compare_quiet(a, b, s);
4058     return compare != float_relation_equal;
4059 }
4060 
4061 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4062 {
4063     FloatRelation compare = float32_compare_quiet(a, b, s);
4064     return compare != float_relation_equal;
4065 }
4066 
4067 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4068 {
4069     FloatRelation compare = float64_compare_quiet(a, b, s);
4070     return compare != float_relation_equal;
4071 }
4072 
4073 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4074 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4075 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4076 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4077 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4078 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4079 
4080 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4081 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4082 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4083 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4084 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4085 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4086 
4087 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4088 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4089 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4090 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4091 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4092 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4093 
4094 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4095 {
4096     FloatRelation compare = float16_compare(a, b, s);
4097     return compare == float_relation_greater;
4098 }
4099 
4100 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4101 {
4102     FloatRelation compare = float32_compare(a, b, s);
4103     return compare == float_relation_greater;
4104 }
4105 
4106 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4107 {
4108     FloatRelation compare = float64_compare(a, b, s);
4109     return compare == float_relation_greater;
4110 }
4111 
4112 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4113 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4114 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4115 
4116 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4117 {
4118     FloatRelation compare = float16_compare(a, b, s);
4119     return compare == float_relation_greater ||
4120            compare == float_relation_equal;
4121 }
4122 
4123 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4124 {
4125     FloatRelation compare = float32_compare(a, b, s);
4126     return compare == float_relation_greater ||
4127            compare == float_relation_equal;
4128 }
4129 
4130 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4131 {
4132     FloatRelation compare = float64_compare(a, b, s);
4133     return compare == float_relation_greater ||
4134            compare == float_relation_equal;
4135 }
4136 
4137 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4138 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4139 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
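
/*
 * Note: as with the scalar compare instructions, the equality tests above
 * use the quiet compare primitives (no invalid exception for quiet NaNs),
 * while lt/le/gt/ge use the signalling compares.  With a NaN operand every
 * predicate evaluates to 0 except vmfne, which evaluates to 1.
 */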
4140 
4141 /* Vector Floating-Point Classify Instruction */
4142 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4143 static void do_##NAME(void *vd, void *vs2, int i)      \
4144 {                                                      \
4145     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4146     *((TD *)vd + HD(i)) = OP(s2);                      \
4147 }
4148 
4149 #define GEN_VEXT_V(NAME, ESZ)                          \
4150 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4151                   CPURISCVState *env, uint32_t desc)   \
4152 {                                                      \
4153     uint32_t vm = vext_vm(desc);                       \
4154     uint32_t vl = env->vl;                             \
4155     uint32_t total_elems =                             \
4156         vext_get_total_elems(env, desc, ESZ);          \
4157     uint32_t vta = vext_vta(desc);                     \
4158     uint32_t vma = vext_vma(desc);                     \
4159     uint32_t i;                                        \
4160                                                        \
4161     for (i = env->vstart; i < vl; i++) {               \
4162         if (!vm && !vext_elem_mask(v0, i)) {           \
4163             /* set masked-off elements to 1s */        \
4164             vext_set_elems_1s(vd, vma, i * ESZ,        \
4165                               (i + 1) * ESZ);          \
4166             continue;                                  \
4167         }                                              \
4168         do_##NAME(vd, vs2, i);                         \
4169     }                                                  \
4170     env->vstart = 0;                                   \
4171     /* set tail elements to 1s */                      \
4172     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4173                       total_elems * ESZ);              \
4174 }
4175 
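/*
 * Summary of the fclass_* helpers below: the returned value has exactly
 * one bit set, following the standard RISC-V classification:
 *   bit 0: -infinity           bit 5: positive subnormal
 *   bit 1: negative normal     bit 6: positive normal
 *   bit 2: negative subnormal  bit 7: +infinity
 *   bit 3: -0                  bit 8: signalling NaN
 *   bit 4: +0                  bit 9: quiet NaN
 */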
4176 target_ulong fclass_h(uint64_t frs1)
4177 {
4178     float16 f = frs1;
4179     bool sign = float16_is_neg(f);
4180 
4181     if (float16_is_infinity(f)) {
4182         return sign ? 1 << 0 : 1 << 7;
4183     } else if (float16_is_zero(f)) {
4184         return sign ? 1 << 3 : 1 << 4;
4185     } else if (float16_is_zero_or_denormal(f)) {
4186         return sign ? 1 << 2 : 1 << 5;
4187     } else if (float16_is_any_nan(f)) {
4188         float_status s = { }; /* for snan_bit_is_one */
4189         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4190     } else {
4191         return sign ? 1 << 1 : 1 << 6;
4192     }
4193 }
4194 
4195 target_ulong fclass_s(uint64_t frs1)
4196 {
4197     float32 f = frs1;
4198     bool sign = float32_is_neg(f);
4199 
4200     if (float32_is_infinity(f)) {
4201         return sign ? 1 << 0 : 1 << 7;
4202     } else if (float32_is_zero(f)) {
4203         return sign ? 1 << 3 : 1 << 4;
4204     } else if (float32_is_zero_or_denormal(f)) {
4205         return sign ? 1 << 2 : 1 << 5;
4206     } else if (float32_is_any_nan(f)) {
4207         float_status s = { }; /* for snan_bit_is_one */
4208         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4209     } else {
4210         return sign ? 1 << 1 : 1 << 6;
4211     }
4212 }
4213 
4214 target_ulong fclass_d(uint64_t frs1)
4215 {
4216     float64 f = frs1;
4217     bool sign = float64_is_neg(f);
4218 
4219     if (float64_is_infinity(f)) {
4220         return sign ? 1 << 0 : 1 << 7;
4221     } else if (float64_is_zero(f)) {
4222         return sign ? 1 << 3 : 1 << 4;
4223     } else if (float64_is_zero_or_denormal(f)) {
4224         return sign ? 1 << 2 : 1 << 5;
4225     } else if (float64_is_any_nan(f)) {
4226         float_status s = { }; /* for snan_bit_is_one */
4227         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4228     } else {
4229         return sign ? 1 << 1 : 1 << 6;
4230     }
4231 }
4232 
4233 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4234 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4235 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4236 GEN_VEXT_V(vfclass_v_h, 2)
4237 GEN_VEXT_V(vfclass_v_w, 4)
4238 GEN_VEXT_V(vfclass_v_d, 8)
4239 
4240 /* Vector Floating-Point Merge Instruction */
4241 
4242 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4243 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4244                   CPURISCVState *env, uint32_t desc)          \
4245 {                                                             \
4246     uint32_t vm = vext_vm(desc);                              \
4247     uint32_t vl = env->vl;                                    \
4248     uint32_t esz = sizeof(ETYPE);                             \
4249     uint32_t total_elems =                                    \
4250         vext_get_total_elems(env, desc, esz);                 \
4251     uint32_t vta = vext_vta(desc);                            \
4252     uint32_t i;                                               \
4253                                                               \
4254     for (i = env->vstart; i < vl; i++) {                      \
4255         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4256         *((ETYPE *)vd + H(i)) =                               \
4257             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4258     }                                                         \
4259     env->vstart = 0;                                          \
4260     /* set tail elements to 1s */                             \
4261     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4262 }
4263 
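/*
 * vfmerge.vfm vd, vs2, rs1, v0 # vd[i] = v0.mask[i] ? f[rs1] : vs2[i]
 * (summary of the helper above)
 */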
4264 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4265 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4266 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4267 
4268 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4269 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4270 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4271 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4272 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4273 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4274 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4275 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4276 
4277 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4278 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4279 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4280 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4281 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4282 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4283 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4284 
4285 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4286 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4287 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4288 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4289 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4290 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4291 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4292 
4293 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4294 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4295 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4296 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4297 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4298 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4299 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4300 
4301 /* Widening Floating-Point/Integer Type-Convert Instructions */
4302 /* (TD, T2, TX2) */
4303 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4304 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4305 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4306 /*
4307  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4308  */
4309 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4310 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4311 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4312 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4313 
4314 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4315 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4316 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4317 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4318 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4319 
4320 /*
4321  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4322  */
4323 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4324 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4325 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4326 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4327 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4328 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4329 
4330 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4331 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4332 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4333 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4334 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4335 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4336 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4337 
4338 /*
4339  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4340  */
4341 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4342 {
4343     return float16_to_float32(a, true, s);
4344 }
4345 
4346 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4347 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4348 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4349 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4350 
4351 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4352 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4353 
4354 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4355 /* (TD, T2, TX2) */
4356 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4357 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4358 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4359 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4360 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4361 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4362 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4363 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4364 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4365 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4366 
4367 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4368 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4369 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4370 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4371 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4372 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4373 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4374 
4375 /*
4376  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4377  */
4378 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4379 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4380 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4381 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4382 
4383 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4384 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4385 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4386 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4387 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4388 
4389 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4390 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4391 {
4392     return float32_to_float16(a, true, s);
4393 }
4394 
4395 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4396 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4397 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4398 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4399 
4400 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4401 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4402 
4403 /*
4404  * Vector Reduction Operations
4405  */
4406 /* Vector Single-Width Integer Reduction Instructions */
4407 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4408 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4409                   void *vs2, CPURISCVState *env,          \
4410                   uint32_t desc)                          \
4411 {                                                         \
4412     uint32_t vm = vext_vm(desc);                          \
4413     uint32_t vl = env->vl;                                \
4414     uint32_t esz = sizeof(TD);                            \
4415     uint32_t vlenb = simd_maxsz(desc);                    \
4416     uint32_t vta = vext_vta(desc);                        \
4417     uint32_t i;                                           \
4418     TD s1 =  *((TD *)vs1 + HD(0));                        \
4419                                                           \
4420     for (i = env->vstart; i < vl; i++) {                  \
4421         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4422         if (!vm && !vext_elem_mask(v0, i)) {              \
4423             continue;                                     \
4424         }                                                 \
4425         s1 = OP(s1, (TD)s2);                              \
4426     }                                                     \
4427     *((TD *)vd + HD(0)) = s1;                             \
4428     env->vstart = 0;                                      \
4429     /* set tail elements to 1s */                         \
4430     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4431 }
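
/*
 * Illustrative sketch (pseudo-code, assuming all elements are active):
 * GEN_VEXT_RED(vredsum_vs_w, ...) behaves like
 *
 *     int32_t s1 = vs1[0];
 *     for (i = 0; i < vl; i++) {
 *         s1 = DO_ADD(s1, vs2[i]);
 *     }
 *     vd[0] = s1;
 *
 * Masked-off elements are skipped and only element 0 of vd is written;
 * the remaining elements of vd are treated as tail.
 */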
4432 
4433 /* vd[0] = sum(vs1[0], vs2[*]) */
4434 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4435 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4436 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4437 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4438 
4439 /* vd[0] = maxu(vs1[0], vs2[*]) */
4440 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4441 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4442 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4443 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4444 
4445 /* vd[0] = max(vs1[0], vs2[*]) */
4446 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4447 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4448 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4449 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4450 
4451 /* vd[0] = minu(vs1[0], vs2[*]) */
4452 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4453 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4454 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4455 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4456 
4457 /* vd[0] = min(vs1[0], vs2[*]) */
4458 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4459 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4460 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4461 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4462 
4463 /* vd[0] = and(vs1[0], vs2[*]) */
4464 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4465 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4466 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4467 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4468 
4469 /* vd[0] = or(vs1[0], vs2[*]) */
4470 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4471 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4472 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4473 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4474 
4475 /* vd[0] = xor(vs1[0], vs2[*]) */
4476 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4477 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4478 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4479 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4480 
4481 /* Vector Widening Integer Reduction Instructions */
4482 /* Signed sum reduction into double-width accumulator */
4483 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4484 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4485 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4486 
4487 /* Unsigned sum reduction into double-width accumulator */
4488 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4489 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4490 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4491 
4492 /* Vector Single-Width Floating-Point Reduction Instructions */
4493 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4494 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4495                   void *vs2, CPURISCVState *env,           \
4496                   uint32_t desc)                           \
4497 {                                                          \
4498     uint32_t vm = vext_vm(desc);                           \
4499     uint32_t vl = env->vl;                                 \
4500     uint32_t esz = sizeof(TD);                             \
4501     uint32_t vlenb = simd_maxsz(desc);                     \
4502     uint32_t vta = vext_vta(desc);                         \
4503     uint32_t i;                                            \
4504     TD s1 =  *((TD *)vs1 + HD(0));                         \
4505                                                            \
4506     for (i = env->vstart; i < vl; i++) {                   \
4507         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4508         if (!vm && !vext_elem_mask(v0, i)) {               \
4509             continue;                                      \
4510         }                                                  \
4511         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4512     }                                                      \
4513     *((TD *)vd + HD(0)) = s1;                              \
4514     env->vstart = 0;                                       \
4515     /* set tail elements to 1s */                          \
4516     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4517 }
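
/*
 * Note: the "unordered" and "ordered" sum reductions below are
 * instantiated from the same macro and both accumulate sequentially in
 * element order; the unordered form merely permits, and does not require,
 * a different association order.
 */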
4518 
4519 /* Unordered sum */
4520 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4521 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4522 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4523 
4524 /* Ordered sum */
4525 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4526 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4527 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4528 
4529 /* Maximum value */
4530 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4531               float16_maximum_number)
4532 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4533               float32_maximum_number)
4534 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4535               float64_maximum_number)
4536 
4537 /* Minimum value */
4538 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4539               float16_minimum_number)
4540 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4541               float32_minimum_number)
4542 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4543               float64_minimum_number)
4544 
4545 /* Vector Widening Floating-Point Add Instructions */
4546 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4547 {
4548     return float32_add(a, float16_to_float32(b, true, s), s);
4549 }
4550 
4551 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4552 {
4553     return float64_add(a, float32_to_float64(b, s), s);
4554 }
4555 
4556 /* Vector Widening Floating-Point Reduction Instructions */
4557 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4558 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4559 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4560 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4561 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4562 
4563 /*
4564  * Vector Mask Operations
4565  */
4566 /* Vector Mask-Register Logical Instructions */
4567 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4568 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4569                   void *vs2, CPURISCVState *env,          \
4570                   uint32_t desc)                          \
4571 {                                                         \
4572     uint32_t vl = env->vl;                                \
4573     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4574     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4575     uint32_t i;                                           \
4576     int a, b;                                             \
4577                                                           \
4578     for (i = env->vstart; i < vl; i++) {                  \
4579         a = vext_elem_mask(vs1, i);                       \
4580         b = vext_elem_mask(vs2, i);                       \
4581         vext_set_elem_mask(vd, i, OP(b, a));              \
4582     }                                                     \
4583     env->vstart = 0;                                      \
4584     /*
4585      * mask destination registers are always tail-agnostic
4586      * set tail elements to 1s
4587      */                                                   \
4588     if (vta_all_1s) {                                     \
4589         for (; i < total_elems; i++) {                    \
4590             vext_set_elem_mask(vd, i, 1);                 \
4591         }                                                 \
4592     }                                                     \
4593 }
4594 
4595 #define DO_NAND(N, M)  (!(N & M))
4596 #define DO_ANDNOT(N, M)  (N & !M)
4597 #define DO_NOR(N, M)  (!(N | M))
4598 #define DO_ORNOT(N, M)  (N | !M)
4599 #define DO_XNOR(N, M)  (!(N ^ M))
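
/*
 * Note: the operands of these macros are single mask bits returned by
 * vext_elem_mask(), i.e. always 0 or 1, so the logical-NOT in
 * DO_ANDNOT/DO_ORNOT acts as a one-bit complement, e.g.
 * DO_ANDNOT(1, 0) == (1 & !0) == 1.
 */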
4600 
4601 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4602 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4603 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4604 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4605 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4606 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4607 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4608 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4609 
4610 /* Vector count population in mask vcpop */
4611 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4612                              uint32_t desc)
4613 {
4614     target_ulong cnt = 0;
4615     uint32_t vm = vext_vm(desc);
4616     uint32_t vl = env->vl;
4617     int i;
4618 
4619     for (i = env->vstart; i < vl; i++) {
4620         if (vm || vext_elem_mask(v0, i)) {
4621             if (vext_elem_mask(vs2, i)) {
4622                 cnt++;
4623             }
4624         }
4625     }
4626     env->vstart = 0;
4627     return cnt;
4628 }
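
/*
 * Note: this is a bit-serial population count of the first vl mask bits
 * of vs2, restricted to active elements, i.e. equivalent to
 * popcount(vs2 & v0) when vm == 0 and popcount(vs2) when vm == 1.
 */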
4629 
4630 /* vfirst find-first-set mask bit */
4631 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4632                               uint32_t desc)
4633 {
4634     uint32_t vm = vext_vm(desc);
4635     uint32_t vl = env->vl;
4636     int i;
4637 
4638     for (i = env->vstart; i < vl; i++) {
4639         if (vm || vext_elem_mask(v0, i)) {
4640             if (vext_elem_mask(vs2, i)) {
4641                 return i;
4642             }
4643         }
4644     }
4645     env->vstart = 0;
4646     return -1LL;
4647 }
4648 
4649 enum set_mask_type {
4650     ONLY_FIRST = 1,
4651     INCLUDE_FIRST,
4652     BEFORE_FIRST,
4653 };
4654 
4655 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4656                    uint32_t desc, enum set_mask_type type)
4657 {
4658     uint32_t vm = vext_vm(desc);
4659     uint32_t vl = env->vl;
4660     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4661     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4662     uint32_t vma = vext_vma(desc);
4663     int i;
4664     bool first_mask_bit = false;
4665 
4666     for (i = env->vstart; i < vl; i++) {
4667         if (!vm && !vext_elem_mask(v0, i)) {
4668             /* set masked-off elements to 1s */
4669             if (vma) {
4670                 vext_set_elem_mask(vd, i, 1);
4671             }
4672             continue;
4673         }
4674         /* write a zero to all following active elements */
4675         if (first_mask_bit) {
4676             vext_set_elem_mask(vd, i, 0);
4677             continue;
4678         }
4679         if (vext_elem_mask(vs2, i)) {
4680             first_mask_bit = true;
4681             if (type == BEFORE_FIRST) {
4682                 vext_set_elem_mask(vd, i, 0);
4683             } else {
4684                 vext_set_elem_mask(vd, i, 1);
4685             }
4686         } else {
4687             if (type == ONLY_FIRST) {
4688                 vext_set_elem_mask(vd, i, 0);
4689             } else {
4690                 vext_set_elem_mask(vd, i, 1);
4691             }
4692         }
4693     }
4694     env->vstart = 0;
4695     /*
4696      * mask destination registers are always tail-agnostic
4697      * set tail elements to 1s
4698      */
4699     if (vta_all_1s) {
4700         for (; i < total_elems; i++) {
4701             vext_set_elem_mask(vd, i, 1);
4702         }
4703     }
4704 }
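
/*
 * Illustrative example (assuming all elements active, vl = 5) with source
 * mask vs2 = {0, 1, 0, 1, 0} (element 0 first):
 *   vmsbf.m (BEFORE_FIRST)  -> vd = {1, 0, 0, 0, 0}
 *   vmsif.m (INCLUDE_FIRST) -> vd = {1, 1, 0, 0, 0}
 *   vmsof.m (ONLY_FIRST)    -> vd = {0, 1, 0, 0, 0}
 */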
4705 
4706 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4707                      uint32_t desc)
4708 {
4709     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4710 }
4711 
4712 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4713                      uint32_t desc)
4714 {
4715     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4716 }
4717 
4718 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4719                      uint32_t desc)
4720 {
4721     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4722 }
4723 
4724 /* Vector Iota Instruction */
4725 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4726 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4727                   uint32_t desc)                                          \
4728 {                                                                         \
4729     uint32_t vm = vext_vm(desc);                                          \
4730     uint32_t vl = env->vl;                                                \
4731     uint32_t esz = sizeof(ETYPE);                                         \
4732     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4733     uint32_t vta = vext_vta(desc);                                        \
4734     uint32_t vma = vext_vma(desc);                                        \
4735     uint32_t sum = 0;                                                     \
4736     int i;                                                                \
4737                                                                           \
4738     for (i = env->vstart; i < vl; i++) {                                  \
4739         if (!vm && !vext_elem_mask(v0, i)) {                              \
4740             /* set masked-off elements to 1s */                           \
4741             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4742             continue;                                                     \
4743         }                                                                 \
4744         *((ETYPE *)vd + H(i)) = sum;                                      \
4745         if (vext_elem_mask(vs2, i)) {                                     \
4746             sum++;                                                        \
4747         }                                                                 \
4748     }                                                                     \
4749     env->vstart = 0;                                                      \
4750     /* set tail elements to 1s */                                         \
4751     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4752 }
4753 
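/*
 * Illustrative example (assuming all elements active): with source mask
 * vs2 = {1, 0, 1, 1} the helper writes vd = {0, 1, 1, 2}, i.e. vd[i] is
 * the number of mask bits set in vs2[0 .. i-1].
 */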
4754 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4755 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4756 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4757 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4758 
4759 /* Vector Element Index Instruction */
4760 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4761 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4762 {                                                                         \
4763     uint32_t vm = vext_vm(desc);                                          \
4764     uint32_t vl = env->vl;                                                \
4765     uint32_t esz = sizeof(ETYPE);                                         \
4766     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4767     uint32_t vta = vext_vta(desc);                                        \
4768     uint32_t vma = vext_vma(desc);                                        \
4769     int i;                                                                \
4770                                                                           \
4771     for (i = env->vstart; i < vl; i++) {                                  \
4772         if (!vm && !vext_elem_mask(v0, i)) {                              \
4773             /* set masked-off elements to 1s */                           \
4774             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4775             continue;                                                     \
4776         }                                                                 \
4777         *((ETYPE *)vd + H(i)) = i;                                        \
4778     }                                                                     \
4779     env->vstart = 0;                                                      \
4780     /* set tail elements to 1s */                                         \
4781     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4782 }
4783 
4784 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4785 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4786 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4787 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4788 
4789 /*
4790  * Vector Permutation Instructions
4791  */
4792 
4793 /* Vector Slide Instructions */
4794 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4795 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4796                   CPURISCVState *env, uint32_t desc)                      \
4797 {                                                                         \
4798     uint32_t vm = vext_vm(desc);                                          \
4799     uint32_t vl = env->vl;                                                \
4800     uint32_t esz = sizeof(ETYPE);                                         \
4801     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4802     uint32_t vta = vext_vta(desc);                                        \
4803     uint32_t vma = vext_vma(desc);                                        \
4804     target_ulong offset = s1, i_min, i;                                   \
4805                                                                           \
4806     i_min = MAX(env->vstart, offset);                                     \
4807     for (i = i_min; i < vl; i++) {                                        \
4808         if (!vm && !vext_elem_mask(v0, i)) {                              \
4809             /* set masked-off elements to 1s */                           \
4810             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4811             continue;                                                     \
4812         }                                                                 \
4813         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4814     }                                                                     \
         env->vstart = 0;                                                      \
4815     /* set tail elements to 1s */                                         \
4816     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4817 }
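
/*
 * Note: the copy starts at MAX(vstart, offset) because vslideup never
 * writes destination elements below the slide offset (they keep their
 * previous value), while vstart marks where an interrupted execution
 * resumes; like the other slide helpers, vstart is cleared once the
 * instruction completes.
 */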
4818 
4819 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4820 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4821 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4822 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4823 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4824 
4825 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4826 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4827                   CPURISCVState *env, uint32_t desc)                      \
4828 {                                                                         \
4829     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4830     uint32_t vm = vext_vm(desc);                                          \
4831     uint32_t vl = env->vl;                                                \
4832     uint32_t esz = sizeof(ETYPE);                                         \
4833     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4834     uint32_t vta = vext_vta(desc);                                        \
4835     uint32_t vma = vext_vma(desc);                                        \
4836     target_ulong i_max, i;                                                \
4837                                                                           \
4838     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4839     for (i = env->vstart; i < i_max; ++i) {                               \
4840         if (!vm && !vext_elem_mask(v0, i)) {                              \
4841             /* set masked-off elements to 1s */                           \
4842             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4843             continue;                                                     \
4844         }                                                                 \
4845         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
4846     }                                                                     \
4847                                                                           \
4848     for (i = i_max; i < vl; ++i) {                                        \
4849         if (vm || vext_elem_mask(v0, i)) {                                \
4850             *((ETYPE *)vd + H(i)) = 0;                                    \
4851         }                                                                 \
4852     }                                                                     \
4853                                                                           \
4854     env->vstart = 0;                                                      \
4855     /* set tail elements to 1s */                                         \
4856     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4857 }
4858 
4859 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4860 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4861 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4862 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4863 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4864 
4865 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
4866 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4867                                  void *vs2, CPURISCVState *env,             \
4868                                  uint32_t desc)                             \
4869 {                                                                           \
4870     typedef uint##BITWIDTH##_t ETYPE;                                       \
4871     uint32_t vm = vext_vm(desc);                                            \
4872     uint32_t vl = env->vl;                                                  \
4873     uint32_t esz = sizeof(ETYPE);                                           \
4874     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
4875     uint32_t vta = vext_vta(desc);                                          \
4876     uint32_t vma = vext_vma(desc);                                          \
4877     uint32_t i;                                                             \
4878                                                                             \
4879     for (i = env->vstart; i < vl; i++) {                                    \
4880         if (!vm && !vext_elem_mask(v0, i)) {                                \
4881             /* set masked-off elements to 1s */                             \
4882             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
4883             continue;                                                       \
4884         }                                                                   \
4885         if (i == 0) {                                                       \
4886             *((ETYPE *)vd + H(i)) = s1;                                     \
4887         } else {                                                            \
4888             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4889         }                                                                   \
4890     }                                                                       \
4891     env->vstart = 0;                                                        \
4892     /* set tail elements to 1s */                                           \
4893     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
4894 }
4895 
4896 GEN_VEXT_VSLIDE1UP(8,  H1)
4897 GEN_VEXT_VSLIDE1UP(16, H2)
4898 GEN_VEXT_VSLIDE1UP(32, H4)
4899 GEN_VEXT_VSLIDE1UP(64, H8)
4900 
4901 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4902 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4903                   CPURISCVState *env, uint32_t desc)              \
4904 {                                                                 \
4905     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4906 }
4907 
4908 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4909 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4910 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4911 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4912 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4913 
4914 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4915 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4916                                    void *vs2, CPURISCVState *env,             \
4917                                    uint32_t desc)                             \
4918 {                                                                             \
4919     typedef uint##BITWIDTH##_t ETYPE;                                         \
4920     uint32_t vm = vext_vm(desc);                                              \
4921     uint32_t vl = env->vl;                                                    \
4922     uint32_t esz = sizeof(ETYPE);                                             \
4923     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
4924     uint32_t vta = vext_vta(desc);                                            \
4925     uint32_t vma = vext_vma(desc);                                            \
4926     uint32_t i;                                                               \
4927                                                                               \
4928     for (i = env->vstart; i < vl; i++) {                                      \
4929         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4930             /* set masked-off elements to 1s */                               \
4931             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
4932             continue;                                                         \
4933         }                                                                     \
4934         if (i == vl - 1) {                                                    \
4935             *((ETYPE *)vd + H(i)) = s1;                                       \
4936         } else {                                                              \
4937             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4938         }                                                                     \
4939     }                                                                         \
4940     env->vstart = 0;                                                          \
4941     /* set tail elements to 1s */                                             \
4942     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
4943 }
4944 
4945 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4946 GEN_VEXT_VSLIDE1DOWN(16, H2)
4947 GEN_VEXT_VSLIDE1DOWN(32, H4)
4948 GEN_VEXT_VSLIDE1DOWN(64, H8)
4949 
4950 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4951 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4952                   CPURISCVState *env, uint32_t desc)              \
4953 {                                                                 \
4954     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4955 }
4956 
4957 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4958 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4959 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4960 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4961 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4962 
4963 /* Vector Floating-Point Slide Instructions */
4964 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4965 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4966                   CPURISCVState *env, uint32_t desc)          \
4967 {                                                             \
4968     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4969 }
4970 
4971 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4972 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4973 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4974 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4975 
4976 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4977 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4978                   CPURISCVState *env, uint32_t desc)          \
4979 {                                                             \
4980     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4981 }
4982 
4983 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4984 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4985 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4986 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4987 
4988 /* Vector Register Gather Instruction */
4989 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4990 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4991                   CPURISCVState *env, uint32_t desc)                      \
4992 {                                                                         \
4993     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4994     uint32_t vm = vext_vm(desc);                                          \
4995     uint32_t vl = env->vl;                                                \
4996     uint32_t esz = sizeof(TS2);                                           \
4997     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4998     uint32_t vta = vext_vta(desc);                                        \
4999     uint32_t vma = vext_vma(desc);                                        \
5000     uint64_t index;                                                       \
5001     uint32_t i;                                                           \
5002                                                                           \
5003     for (i = env->vstart; i < vl; i++) {                                  \
5004         if (!vm && !vext_elem_mask(v0, i)) {                              \
5005             /* set masked-off elements to 1s */                           \
5006             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5007             continue;                                                     \
5008         }                                                                 \
5009         index = *((TS1 *)vs1 + HS1(i));                                   \
5010         if (index >= vlmax) {                                             \
5011             *((TS2 *)vd + HS2(i)) = 0;                                    \
5012         } else {                                                          \
5013             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5014         }                                                                 \
5015     }                                                                     \
5016     env->vstart = 0;                                                      \
5017     /* set tail elements to 1s */                                         \
5018     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5019 }
5020 
5021 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5022 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5023 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5024 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5025 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5026 
5027 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5028 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5029 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5030 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5031 
5032 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5033 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5034                   CPURISCVState *env, uint32_t desc)                      \
5035 {                                                                         \
5036     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5037     uint32_t vm = vext_vm(desc);                                          \
5038     uint32_t vl = env->vl;                                                \
5039     uint32_t esz = sizeof(ETYPE);                                         \
5040     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5041     uint32_t vta = vext_vta(desc);                                        \
5042     uint32_t vma = vext_vma(desc);                                        \
5043     uint64_t index = s1;                                                  \
5044     uint32_t i;                                                           \
5045                                                                           \
5046     for (i = env->vstart; i < vl; i++) {                                  \
5047         if (!vm && !vext_elem_mask(v0, i)) {                              \
5048             /* set masked-off elements to 1s */                           \
5049             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5050             continue;                                                     \
5051         }                                                                 \
5052         if (index >= vlmax) {                                             \
5053             *((ETYPE *)vd + H(i)) = 0;                                    \
5054         } else {                                                          \
5055             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5056         }                                                                 \
5057     }                                                                     \
5058     env->vstart = 0;                                                      \
5059     /* set tail elements to 1s */                                         \
5060     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5061 }
5062 
5063 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5064 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5065 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5066 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5067 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5068 
5069 /* Vector Compress Instruction */
5070 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5071 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5072                   CPURISCVState *env, uint32_t desc)                      \
5073 {                                                                         \
5074     uint32_t vl = env->vl;                                                \
5075     uint32_t esz = sizeof(ETYPE);                                         \
5076     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5077     uint32_t vta = vext_vta(desc);                                        \
5078     uint32_t num = 0, i;                                                  \
5079                                                                           \
5080     for (i = env->vstart; i < vl; i++) {                                  \
5081         if (!vext_elem_mask(vs1, i)) {                                    \
5082             continue;                                                     \
5083         }                                                                 \
5084         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5085         num++;                                                            \
5086     }                                                                     \
5087     env->vstart = 0;                                                      \
5088     /* set tail elements to 1s */                                         \
5089     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5090 }
5091 
5092 /* Compress into vd elements of vs2 where vs1 is enabled */
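/*
 * Illustrative example: with vs2 = {a, b, c, d} and source mask
 * vs1 = {1, 0, 1, 0}, the loop writes vd[0] = a and vd[1] = c.
 */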
5093 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5094 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5095 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5096 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5097 
5098 /* Vector Whole Register Move */
5099 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5100 {
5101     /* EEW = SEW */
5102     uint32_t maxsz = simd_maxsz(desc);
5103     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5104     uint32_t startb = env->vstart * sewb;
5105     uint32_t i = startb;
5106 
5107     memcpy((uint8_t *)vd + H1(i),
5108            (uint8_t *)vs2 + H1(i),
5109            maxsz - startb);
5110 
5111     env->vstart = 0;
5112 }
5113 
5114 /* Vector Integer Extension */
5115 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5116 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5117                   CPURISCVState *env, uint32_t desc)             \
5118 {                                                                \
5119     uint32_t vl = env->vl;                                       \
5120     uint32_t vm = vext_vm(desc);                                 \
5121     uint32_t esz = sizeof(ETYPE);                                \
5122     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5123     uint32_t vta = vext_vta(desc);                               \
5124     uint32_t vma = vext_vma(desc);                               \
5125     uint32_t i;                                                  \
5126                                                                  \
5127     for (i = env->vstart; i < vl; i++) {                         \
5128         if (!vm && !vext_elem_mask(v0, i)) {                     \
5129             /* set masked-off elements to 1s */                  \
5130             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5131             continue;                                            \
5132         }                                                        \
5133         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5134     }                                                            \
5135     env->vstart = 0;                                             \
5136     /* set tail elements to 1s */                                \
5137     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5138 }
5139 
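/*
 * Note: vzext_vf2_h zero-extends each uint8_t source element to uint16_t
 * (vd[i] = (uint16_t)vs2[i]); vsext_vf2_h sign-extends int8_t to int16_t;
 * the vf4/vf8 variants widen by a factor of 4 and 8.
 */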
5140 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5141 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5142 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5143 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5144 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5145 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5146 
5147 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5148 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5149 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5150 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5151 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5152 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5153