xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 6400be01)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg-gvec-desc.h"
29 #include "internals.h"
30 #include "vector_internals.h"
31 #include <math.h>
32 
33 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
34                             target_ulong s2)
35 {
36     int vlmax, vl;
37     RISCVCPU *cpu = env_archcpu(env);
38     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
39     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
40     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
41     int xlen = riscv_cpu_xlen(env);
42     bool vill = (s2 >> (xlen - 1)) & 0x1;
43     target_ulong reserved = s2 &
44                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
45                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
46 
47     if (lmul & 4) {
48         /* Fractional LMUL - check LMUL * VLEN >= SEW */
49         if (lmul == 4 ||
50             cpu->cfg.vlen >> (8 - lmul) < sew) {
51             vill = true;
52         }
53     }
54 
55     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
56         /* only set vill bit. */
57         env->vill = 1;
58         env->vtype = 0;
59         env->vl = 0;
60         env->vstart = 0;
61         return 0;
62     }
63 
64     vlmax = vext_get_vlmax(cpu, s2);
65     if (s1 <= vlmax) {
66         vl = s1;
67     } else {
68         vl = vlmax;
69     }
70     env->vl = vl;
71     env->vtype = s2;
72     env->vstart = 0;
73     env->vill = 0;
74     return vl;
75 }
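
/*
 * Illustrative worked example (numbers assumed, not taken from the code
 * above): with VLEN = 128 and a vtype requesting SEW = 32 (vsew = 2) and
 * LMUL = 1/2 (vlmul = 7), the fractional-LMUL check computes
 * VLEN >> (8 - 7) = 64 >= SEW, so the request is accepted.  VLMAX is then
 * LMUL * VLEN / SEW = 2, and a requested AVL of s1 = 5 is clamped to
 * vl = 2.  An unsupported request (e.g. SEW = 64 with ELEN = 32) takes the
 * vill path instead and only zeroes vtype/vl.
 */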
76 
77 /*
78  * Get the maximum number of elements that can be operated on.
79  *
80  * log2_esz: log2 of element size in bytes.
81  */
82 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
83 {
84     /*
85      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
86      * so vlen in bytes (vlenb) is encoded as maxsz.
87      */
88     uint32_t vlenb = simd_maxsz(desc);
89 
90     /* Return VLMAX */
91     int scale = vext_lmul(desc) - log2_esz;
92     return scale < 0 ? vlenb >> -scale : vlenb << scale;
93 }
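
/*
 * Illustrative example (assumed VLEN = 128, so vlenb = 16): for SEW = 32
 * (log2_esz = 2) and LMUL = 2 (vext_lmul(desc) = 1), scale = 1 - 2 = -1
 * and VLMAX = 16 >> 1 = 8, matching LMUL * VLEN / SEW = 2 * 128 / 32.
 */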
94 
95 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
96 {
97     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
98 }
99 
100 /*
101  * This function checks watchpoints before the real load operation.
102  *
103  * In system mode, the TLB API probe_access is enough for the watchpoint check.
104  * In user mode, there is no watchpoint support for now.
105  *
106  * It will trigger an exception if there is no mapping in the TLB and the
107  * page table walk can't fill the TLB entry.  The guest software can then
108  * return here after processing the exception, or it may never return.
109  */
110 static void probe_pages(CPURISCVState *env, target_ulong addr,
111                         target_ulong len, uintptr_t ra,
112                         MMUAccessType access_type)
113 {
114     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
115     target_ulong curlen = MIN(pagelen, len);
116     int mmu_index = riscv_env_mmu_index(env, false);
117 
118     probe_access(env, adjust_addr(env, addr), curlen, access_type,
119                  mmu_index, ra);
120     if (len > curlen) {
121         addr += curlen;
122         curlen = len - curlen;
123         probe_access(env, adjust_addr(env, addr), curlen, access_type,
124                      mmu_index, ra);
125     }
126 }
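
/*
 * Illustrative example (assuming 4 KiB target pages): for addr = 0x1ff8
 * and len = 16, pagelen = -(addr | TARGET_PAGE_MASK) = 8, so the first
 * probe_access() covers the 8 bytes up to the page boundary and the second
 * covers the remaining 8 bytes starting at 0x2000.  A probe that cannot be
 * satisfied raises the exception with return address ra.
 */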
127 
128 static inline void vext_set_elem_mask(void *v0, int index,
129                                       uint8_t value)
130 {
131     int idx = index / 64;
132     int pos = index % 64;
133     uint64_t old = ((uint64_t *)v0)[idx];
134     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
135 }
136 
137 /* element operations for loads and stores */
138 typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
139                                uint32_t idx, void *vd, uintptr_t retaddr);
140 
141 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
142 static void NAME(CPURISCVState *env, abi_ptr addr,         \
143                  uint32_t idx, void *vd, uintptr_t retaddr)\
144 {                                                          \
145     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
146     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
147 }
148 
149 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
150 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
151 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
152 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
153 
154 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
155 static void NAME(CPURISCVState *env, abi_ptr addr,         \
156                  uint32_t idx, void *vd, uintptr_t retaddr)\
157 {                                                          \
158     ETYPE data = *((ETYPE *)vd + H(idx));                  \
159     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
160 }
161 
162 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
163 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
164 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
165 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
166 
167 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
168                                    uint32_t desc, uint32_t nf,
169                                    uint32_t esz, uint32_t max_elems)
170 {
171     uint32_t vta = vext_vta(desc);
172     int k;
173 
174     if (vta == 0) {
175         return;
176     }
177 
178     for (k = 0; k < nf; ++k) {
179         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
180                           (k * max_elems + max_elems) * esz);
181     }
182 }
183 
184 /*
185  * stride: access vector elements from strided memory
186  */
187 static void
188 vext_ldst_stride(void *vd, void *v0, target_ulong base,
189                  target_ulong stride, CPURISCVState *env,
190                  uint32_t desc, uint32_t vm,
191                  vext_ldst_elem_fn *ldst_elem,
192                  uint32_t log2_esz, uintptr_t ra)
193 {
194     uint32_t i, k;
195     uint32_t nf = vext_nf(desc);
196     uint32_t max_elems = vext_max_elems(desc, log2_esz);
197     uint32_t esz = 1 << log2_esz;
198     uint32_t vma = vext_vma(desc);
199 
200     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
201         k = 0;
202         while (k < nf) {
203             if (!vm && !vext_elem_mask(v0, i)) {
204                 /* set masked-off elements to 1s */
205                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
206                                   (i + k * max_elems + 1) * esz);
207                 k++;
208                 continue;
209             }
210             target_ulong addr = base + stride * i + (k << log2_esz);
211             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
212             k++;
213         }
214     }
215     env->vstart = 0;
216 
217     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
218 }
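
/*
 * Illustrative example (parameters assumed): a vlsseg2e32.v with
 * stride = 12 accesses, for element i and field k, address
 * base + 12 * i + (k << 2).  Segment i = 1 therefore reads its two 32-bit
 * fields from base + 12 and base + 16, and they are written to element
 * i = 1 of fields k = 0 and k = 1 (vd index i + k * max_elems).
 */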
219 
220 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
221 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
222                   target_ulong stride, CPURISCVState *env,              \
223                   uint32_t desc)                                        \
224 {                                                                       \
225     uint32_t vm = vext_vm(desc);                                        \
226     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
227                      ctzl(sizeof(ETYPE)), GETPC());                     \
228 }
229 
230 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
231 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
232 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
233 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
234 
235 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
236 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
237                   target_ulong stride, CPURISCVState *env,              \
238                   uint32_t desc)                                        \
239 {                                                                       \
240     uint32_t vm = vext_vm(desc);                                        \
241     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
242                      ctzl(sizeof(ETYPE)), GETPC());                     \
243 }
244 
245 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
246 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
247 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
248 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
249 
250 /*
251  * unit-stride: access elements stored contiguously in memory
252  */
253 
254 /* unmasked unit-stride load and store operation */
255 static void
256 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
257              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
258              uintptr_t ra)
259 {
260     uint32_t i, k;
261     uint32_t nf = vext_nf(desc);
262     uint32_t max_elems = vext_max_elems(desc, log2_esz);
263     uint32_t esz = 1 << log2_esz;
264 
265     /* load/store elements from/to guest memory */
266     for (i = env->vstart; i < evl; i++, env->vstart++) {
267         k = 0;
268         while (k < nf) {
269             target_ulong addr = base + ((i * nf + k) << log2_esz);
270             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
271             k++;
272         }
273     }
274     env->vstart = 0;
275 
276     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
277 }
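
/*
 * Illustrative example (parameters assumed): for a vlseg3e16.v (nf = 3,
 * esz = 2) the fields of a segment are contiguous in memory, so field k of
 * segment i is accessed at base + (i * 3 + k) * 2; segment 2 therefore
 * occupies bytes [base + 12, base + 18).
 */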
278 
279 /*
280  * A masked unit-stride load or store operation is handled as a special
281  * case of the strided operation, with stride = NF * sizeof(ETYPE)
282  */
283 
284 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
285 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
286                          CPURISCVState *env, uint32_t desc)             \
287 {                                                                       \
288     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
289     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
290                      ctzl(sizeof(ETYPE)), GETPC());                     \
291 }                                                                       \
292                                                                         \
293 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
294                   CPURISCVState *env, uint32_t desc)                    \
295 {                                                                       \
296     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
297                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
298 }
299 
300 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
301 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
302 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
303 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
304 
305 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
306 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
307                          CPURISCVState *env, uint32_t desc)              \
308 {                                                                        \
309     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
310     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
311                      ctzl(sizeof(ETYPE)), GETPC());                      \
312 }                                                                        \
313                                                                          \
314 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
315                   CPURISCVState *env, uint32_t desc)                     \
316 {                                                                        \
317     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
318                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
319 }
320 
321 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
322 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
323 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
324 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
325 
326 /*
327  * unit stride mask load and store, EEW = 1
328  */
329 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
330                     CPURISCVState *env, uint32_t desc)
331 {
332     /* evl = ceil(vl/8) */
333     uint8_t evl = (env->vl + 7) >> 3;
334     vext_ldst_us(vd, base, env, desc, lde_b,
335                  0, evl, GETPC());
336 }
337 
338 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
339                     CPURISCVState *env, uint32_t desc)
340 {
341     /* evl = ceil(vl/8) */
342     uint8_t evl = (env->vl + 7) >> 3;
343     vext_ldst_us(vd, base, env, desc, ste_b,
344                  0, evl, GETPC());
345 }
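
/*
 * Illustrative example: with vl = 17 the mask occupies ceil(17 / 8) = 3
 * bytes, so evl = 3 and vlm.v/vsm.v transfer exactly 3 bytes, independent
 * of the current SEW/LMUL settings.
 */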
346 
347 /*
348  * index: access vector elements from indexed memory
349  */
350 typedef target_ulong vext_get_index_addr(target_ulong base,
351         uint32_t idx, void *vs2);
352 
353 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
354 static target_ulong NAME(target_ulong base,            \
355                          uint32_t idx, void *vs2)      \
356 {                                                      \
357     return (base + *((ETYPE *)vs2 + H(idx)));          \
358 }
359 
360 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
361 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
362 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
363 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
364 
365 static inline void
366 vext_ldst_index(void *vd, void *v0, target_ulong base,
367                 void *vs2, CPURISCVState *env, uint32_t desc,
368                 vext_get_index_addr get_index_addr,
369                 vext_ldst_elem_fn *ldst_elem,
370                 uint32_t log2_esz, uintptr_t ra)
371 {
372     uint32_t i, k;
373     uint32_t nf = vext_nf(desc);
374     uint32_t vm = vext_vm(desc);
375     uint32_t max_elems = vext_max_elems(desc, log2_esz);
376     uint32_t esz = 1 << log2_esz;
377     uint32_t vma = vext_vma(desc);
378 
379     /* load/store elements from/to guest memory */
380     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
381         k = 0;
382         while (k < nf) {
383             if (!vm && !vext_elem_mask(v0, i)) {
384                 /* set masked-off elements to 1s */
385                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
386                                   (i + k * max_elems + 1) * esz);
387                 k++;
388                 continue;
389             }
390             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
391             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
392             k++;
393         }
394     }
395     env->vstart = 0;
396 
397     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
398 }
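
/*
 * Illustrative example (parameters assumed): for vlxei16_32_v (16-bit
 * indices, 32-bit data) the address of data element i is base + vs2[i],
 * where vs2[i] is read as an unsigned 16-bit offset via idx_h(); segment
 * field k adds a further k << 2 bytes.
 */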
399 
400 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
401 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
402                   void *vs2, CPURISCVState *env, uint32_t desc)            \
403 {                                                                          \
404     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
405                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
406 }
407 
408 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
409 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
410 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
411 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
412 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
413 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
414 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
415 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
416 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
417 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
418 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
419 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
420 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
421 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
422 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
423 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
424 
425 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
426 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
427                   void *vs2, CPURISCVState *env, uint32_t desc)  \
428 {                                                                \
429     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
430                     STORE_FN, ctzl(sizeof(ETYPE)),               \
431                     GETPC());                                    \
432 }
433 
434 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
435 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
436 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
437 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
438 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
439 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
440 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
441 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
442 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
443 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
444 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
445 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
446 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
447 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
448 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
449 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
450 
451 /*
452  * unit-stride fault-only-first load instructions
453  */
454 static inline void
455 vext_ldff(void *vd, void *v0, target_ulong base,
456           CPURISCVState *env, uint32_t desc,
457           vext_ldst_elem_fn *ldst_elem,
458           uint32_t log2_esz, uintptr_t ra)
459 {
460     void *host;
461     uint32_t i, k, vl = 0;
462     uint32_t nf = vext_nf(desc);
463     uint32_t vm = vext_vm(desc);
464     uint32_t max_elems = vext_max_elems(desc, log2_esz);
465     uint32_t esz = 1 << log2_esz;
466     uint32_t vma = vext_vma(desc);
467     target_ulong addr, offset, remain;
468     int mmu_index = riscv_env_mmu_index(env, false);
469 
470     /* probe every access */
471     for (i = env->vstart; i < env->vl; i++) {
472         if (!vm && !vext_elem_mask(v0, i)) {
473             continue;
474         }
475         addr = adjust_addr(env, base + i * (nf << log2_esz));
476         if (i == 0) {
477             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
478         } else {
479             /* if it triggers an exception, no need to check watchpoint */
480             remain = nf << log2_esz;
481             while (remain > 0) {
482                 offset = -(addr | TARGET_PAGE_MASK);
483                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_index);
484                 if (host) {
485 #ifdef CONFIG_USER_ONLY
486                     if (!page_check_range(addr, offset, PAGE_READ)) {
487                         vl = i;
488                         goto ProbeSuccess;
489                     }
490 #else
491                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
492 #endif
493                 } else {
494                     vl = i;
495                     goto ProbeSuccess;
496                 }
497                 if (remain <= offset) {
498                     break;
499                 }
500                 remain -= offset;
501                 addr = adjust_addr(env, addr + offset);
502             }
503         }
504     }
505 ProbeSuccess:
506     /* load bytes from guest memory */
507     if (vl != 0) {
508         env->vl = vl;
509     }
510     for (i = env->vstart; i < env->vl; i++) {
511         k = 0;
512         while (k < nf) {
513             if (!vm && !vext_elem_mask(v0, i)) {
514                 /* set masked-off elements to 1s */
515                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
516                                   (i + k * max_elems + 1) * esz);
517                 k++;
518                 continue;
519             }
520             addr = base + ((i * nf + k) << log2_esz);
521             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
522             k++;
523         }
524     }
525     env->vstart = 0;
526 
527     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
528 }
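
/*
 * Illustrative note: only element 0 is allowed to fault.  If, say, element
 * 3 of a vle8ff.v would touch an unmapped page, the probe loop in
 * vext_ldff() stops with vl trimmed to 3; elements 0..2 are loaded
 * normally and no exception is raised, so software can resume with the
 * reduced vl.
 */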
529 
530 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
531 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
532                   CPURISCVState *env, uint32_t desc)      \
533 {                                                         \
534     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
535               ctzl(sizeof(ETYPE)), GETPC());              \
536 }
537 
538 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
539 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
540 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
541 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
542 
543 #define DO_SWAP(N, M) (M)
544 #define DO_AND(N, M)  (N & M)
545 #define DO_XOR(N, M)  (N ^ M)
546 #define DO_OR(N, M)   (N | M)
547 #define DO_ADD(N, M)  (N + M)
548 
549 /* Signed min/max */
550 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
551 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
552 
553 /*
554  * load and store whole register instructions
555  */
556 static void
557 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
558                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
559 {
560     uint32_t i, k, off, pos;
561     uint32_t nf = vext_nf(desc);
562     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
563     uint32_t max_elems = vlenb >> log2_esz;
564 
565     k = env->vstart / max_elems;
566     off = env->vstart % max_elems;
567 
568     if (off) {
569         /* load/store the remaining elements of the segment pointed to by vstart */
570         for (pos = off; pos < max_elems; pos++, env->vstart++) {
571             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
572             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
573                       ra);
574         }
575         k++;
576     }
577 
578     /* load/store elements for the remaining segments */
579     for (; k < nf; k++) {
580         for (i = 0; i < max_elems; i++, env->vstart++) {
581             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
582             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
583         }
584     }
585 
586     env->vstart = 0;
587 }
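
/*
 * Illustrative example (assumed VLEN = 128, vlenb = 16): vl2re32.v has
 * nf = 2 and max_elems = 16 >> 2 = 4, so it always transfers 2 * 16 bytes
 * starting at base, independent of vl and vtype.  A non-zero vstart
 * resumes in the middle of the register it points into before continuing
 * with the remaining registers.
 */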
588 
589 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
590 void HELPER(NAME)(void *vd, target_ulong base,       \
591                   CPURISCVState *env, uint32_t desc) \
592 {                                                    \
593     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
594                     ctzl(sizeof(ETYPE)), GETPC());   \
595 }
596 
597 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
598 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
599 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
600 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
601 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
602 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
603 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
604 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
605 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
606 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
607 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
608 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
609 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
610 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
611 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
612 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
613 
614 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
615 void HELPER(NAME)(void *vd, target_ulong base,       \
616                   CPURISCVState *env, uint32_t desc) \
617 {                                                    \
618     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
619                     ctzl(sizeof(ETYPE)), GETPC());   \
620 }
621 
622 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
623 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
624 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
625 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
626 
627 /*
628  * Vector Integer Arithmetic Instructions
629  */
630 
631 /* (TD, T1, T2, TX1, TX2) */
632 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
633 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
634 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
635 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
636 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
637 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
638 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
639 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
640 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
641 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
642 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
643 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
644 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
645 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
646 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
647 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
648 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
649 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
650 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
651 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
652 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
653 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
654 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
655 
656 #define DO_SUB(N, M) (N - M)
657 #define DO_RSUB(N, M) (M - N)
658 
659 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
660 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
661 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
662 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
663 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
664 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
665 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
666 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
667 
668 GEN_VEXT_VV(vadd_vv_b, 1)
669 GEN_VEXT_VV(vadd_vv_h, 2)
670 GEN_VEXT_VV(vadd_vv_w, 4)
671 GEN_VEXT_VV(vadd_vv_d, 8)
672 GEN_VEXT_VV(vsub_vv_b, 1)
673 GEN_VEXT_VV(vsub_vv_h, 2)
674 GEN_VEXT_VV(vsub_vv_w, 4)
675 GEN_VEXT_VV(vsub_vv_d, 8)
676 
677 
678 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
679 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
680 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
681 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
682 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
683 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
684 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
685 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
686 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
687 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
688 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
689 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
690 
691 GEN_VEXT_VX(vadd_vx_b, 1)
692 GEN_VEXT_VX(vadd_vx_h, 2)
693 GEN_VEXT_VX(vadd_vx_w, 4)
694 GEN_VEXT_VX(vadd_vx_d, 8)
695 GEN_VEXT_VX(vsub_vx_b, 1)
696 GEN_VEXT_VX(vsub_vx_h, 2)
697 GEN_VEXT_VX(vsub_vx_w, 4)
698 GEN_VEXT_VX(vsub_vx_d, 8)
699 GEN_VEXT_VX(vrsub_vx_b, 1)
700 GEN_VEXT_VX(vrsub_vx_h, 2)
701 GEN_VEXT_VX(vrsub_vx_w, 4)
702 GEN_VEXT_VX(vrsub_vx_d, 8)
703 
704 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
705 {
706     intptr_t oprsz = simd_oprsz(desc);
707     intptr_t i;
708 
709     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
710         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
711     }
712 }
713 
714 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
715 {
716     intptr_t oprsz = simd_oprsz(desc);
717     intptr_t i;
718 
719     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
720         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
721     }
722 }
723 
724 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
725 {
726     intptr_t oprsz = simd_oprsz(desc);
727     intptr_t i;
728 
729     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
730         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
731     }
732 }
733 
734 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
735 {
736     intptr_t oprsz = simd_oprsz(desc);
737     intptr_t i;
738 
739     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
740         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
741     }
742 }
743 
744 /* Vector Widening Integer Add/Subtract */
745 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
746 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
747 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
748 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
749 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
750 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
751 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
752 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
753 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
754 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
755 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
756 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
757 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
758 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
759 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
760 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
761 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
762 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
763 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
764 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
765 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
766 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
767 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
768 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
769 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
770 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
771 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
772 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
773 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
774 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
775 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
776 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
777 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
778 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
779 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
780 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
781 GEN_VEXT_VV(vwaddu_vv_b, 2)
782 GEN_VEXT_VV(vwaddu_vv_h, 4)
783 GEN_VEXT_VV(vwaddu_vv_w, 8)
784 GEN_VEXT_VV(vwsubu_vv_b, 2)
785 GEN_VEXT_VV(vwsubu_vv_h, 4)
786 GEN_VEXT_VV(vwsubu_vv_w, 8)
787 GEN_VEXT_VV(vwadd_vv_b, 2)
788 GEN_VEXT_VV(vwadd_vv_h, 4)
789 GEN_VEXT_VV(vwadd_vv_w, 8)
790 GEN_VEXT_VV(vwsub_vv_b, 2)
791 GEN_VEXT_VV(vwsub_vv_h, 4)
792 GEN_VEXT_VV(vwsub_vv_w, 8)
793 GEN_VEXT_VV(vwaddu_wv_b, 2)
794 GEN_VEXT_VV(vwaddu_wv_h, 4)
795 GEN_VEXT_VV(vwaddu_wv_w, 8)
796 GEN_VEXT_VV(vwsubu_wv_b, 2)
797 GEN_VEXT_VV(vwsubu_wv_h, 4)
798 GEN_VEXT_VV(vwsubu_wv_w, 8)
799 GEN_VEXT_VV(vwadd_wv_b, 2)
800 GEN_VEXT_VV(vwadd_wv_h, 4)
801 GEN_VEXT_VV(vwadd_wv_w, 8)
802 GEN_VEXT_VV(vwsub_wv_b, 2)
803 GEN_VEXT_VV(vwsub_wv_h, 4)
804 GEN_VEXT_VV(vwsub_wv_w, 8)
805 
806 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
807 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
808 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
809 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
810 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
811 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
812 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
813 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
814 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
815 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
816 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
817 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
818 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
819 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
820 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
821 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
822 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
823 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
824 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
825 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
826 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
827 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
828 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
829 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
830 GEN_VEXT_VX(vwaddu_vx_b, 2)
831 GEN_VEXT_VX(vwaddu_vx_h, 4)
832 GEN_VEXT_VX(vwaddu_vx_w, 8)
833 GEN_VEXT_VX(vwsubu_vx_b, 2)
834 GEN_VEXT_VX(vwsubu_vx_h, 4)
835 GEN_VEXT_VX(vwsubu_vx_w, 8)
836 GEN_VEXT_VX(vwadd_vx_b, 2)
837 GEN_VEXT_VX(vwadd_vx_h, 4)
838 GEN_VEXT_VX(vwadd_vx_w, 8)
839 GEN_VEXT_VX(vwsub_vx_b, 2)
840 GEN_VEXT_VX(vwsub_vx_h, 4)
841 GEN_VEXT_VX(vwsub_vx_w, 8)
842 GEN_VEXT_VX(vwaddu_wx_b, 2)
843 GEN_VEXT_VX(vwaddu_wx_h, 4)
844 GEN_VEXT_VX(vwaddu_wx_w, 8)
845 GEN_VEXT_VX(vwsubu_wx_b, 2)
846 GEN_VEXT_VX(vwsubu_wx_h, 4)
847 GEN_VEXT_VX(vwsubu_wx_w, 8)
848 GEN_VEXT_VX(vwadd_wx_b, 2)
849 GEN_VEXT_VX(vwadd_wx_h, 4)
850 GEN_VEXT_VX(vwadd_wx_w, 8)
851 GEN_VEXT_VX(vwsub_wx_b, 2)
852 GEN_VEXT_VX(vwsub_wx_h, 4)
853 GEN_VEXT_VX(vwsub_wx_w, 8)
854 
855 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
856 #define DO_VADC(N, M, C) (N + M + C)
857 #define DO_VSBC(N, M, C) (N - M - C)
858 
859 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
860 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
861                   CPURISCVState *env, uint32_t desc)          \
862 {                                                             \
863     uint32_t vl = env->vl;                                    \
864     uint32_t esz = sizeof(ETYPE);                             \
865     uint32_t total_elems =                                    \
866         vext_get_total_elems(env, desc, esz);                 \
867     uint32_t vta = vext_vta(desc);                            \
868     uint32_t i;                                               \
869                                                               \
870     for (i = env->vstart; i < vl; i++) {                      \
871         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
872         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
873         ETYPE carry = vext_elem_mask(v0, i);                  \
874                                                               \
875         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
876     }                                                         \
877     env->vstart = 0;                                          \
878     /* set tail elements to 1s */                             \
879     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
880 }
881 
882 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
883 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
884 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
885 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
886 
887 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
888 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
889 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
890 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
891 
892 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
893 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
894                   CPURISCVState *env, uint32_t desc)                     \
895 {                                                                        \
896     uint32_t vl = env->vl;                                               \
897     uint32_t esz = sizeof(ETYPE);                                        \
898     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
899     uint32_t vta = vext_vta(desc);                                       \
900     uint32_t i;                                                          \
901                                                                          \
902     for (i = env->vstart; i < vl; i++) {                                 \
903         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
904         ETYPE carry = vext_elem_mask(v0, i);                             \
905                                                                          \
906         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
907     }                                                                    \
908     env->vstart = 0;                                                     \
909     /* set tail elements to 1s */                                        \
910     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
911 }
912 
913 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
914 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
915 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
916 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
917 
918 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
919 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
920 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
921 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
922 
923 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
924                           (__typeof(N))(N + M) < N)
925 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
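
/*
 * Carry/borrow-out detection relies on unsigned wrap-around.  Illustrative
 * uint8_t example: DO_MADC(200, 100, 0) computes (uint8_t)(200 + 100) = 44,
 * which is < 200, so the carry-out is 1; DO_MADC(200, 55, 0) yields 255 and
 * no carry, while DO_MADC(200, 55, 1) wraps to 0 <= 200 and reports a
 * carry.  DO_MSBC reports the borrow directly as N < M (or N <= M when
 * there is a borrow-in).
 */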
926 
927 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
928 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
929                   CPURISCVState *env, uint32_t desc)          \
930 {                                                             \
931     uint32_t vl = env->vl;                                    \
932     uint32_t vm = vext_vm(desc);                              \
933     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
934     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
935     uint32_t i;                                               \
936                                                               \
937     for (i = env->vstart; i < vl; i++) {                      \
938         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
939         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
940         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
941         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
942     }                                                         \
943     env->vstart = 0;                                          \
944     /*
945      * The mask destination register is always tail-agnostic,
946      * so set the tail elements to 1s.
947      */                                                       \
948     if (vta_all_1s) {                                         \
949         for (; i < total_elems; i++) {                        \
950             vext_set_elem_mask(vd, i, 1);                     \
951         }                                                     \
952     }                                                         \
953 }
954 
955 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
956 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
957 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
958 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
959 
960 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
961 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
962 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
963 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
964 
965 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
966 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
967                   void *vs2, CPURISCVState *env, uint32_t desc) \
968 {                                                               \
969     uint32_t vl = env->vl;                                      \
970     uint32_t vm = vext_vm(desc);                                \
971     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
972     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
973     uint32_t i;                                                 \
974                                                                 \
975     for (i = env->vstart; i < vl; i++) {                        \
976         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
977         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
978         vext_set_elem_mask(vd, i,                               \
979                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
980     }                                                           \
981     env->vstart = 0;                                            \
982     /*
983      * The mask destination register is always tail-agnostic,
984      * so set the tail elements to 1s.
985      */                                                         \
986     if (vta_all_1s) {                                           \
987         for (; i < total_elems; i++) {                          \
988             vext_set_elem_mask(vd, i, 1);                       \
989         }                                                       \
990     }                                                           \
991 }
992 
993 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
994 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
995 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
996 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
997 
998 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
999 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1000 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1001 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1002 
1003 /* Vector Bitwise Logical Instructions */
1004 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1005 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1006 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1007 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1008 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1009 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1010 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1011 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1012 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1013 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1014 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1015 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1016 GEN_VEXT_VV(vand_vv_b, 1)
1017 GEN_VEXT_VV(vand_vv_h, 2)
1018 GEN_VEXT_VV(vand_vv_w, 4)
1019 GEN_VEXT_VV(vand_vv_d, 8)
1020 GEN_VEXT_VV(vor_vv_b, 1)
1021 GEN_VEXT_VV(vor_vv_h, 2)
1022 GEN_VEXT_VV(vor_vv_w, 4)
1023 GEN_VEXT_VV(vor_vv_d, 8)
1024 GEN_VEXT_VV(vxor_vv_b, 1)
1025 GEN_VEXT_VV(vxor_vv_h, 2)
1026 GEN_VEXT_VV(vxor_vv_w, 4)
1027 GEN_VEXT_VV(vxor_vv_d, 8)
1028 
1029 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1030 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1031 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1032 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1033 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1034 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1035 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1036 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1037 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1038 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1039 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1040 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1041 GEN_VEXT_VX(vand_vx_b, 1)
1042 GEN_VEXT_VX(vand_vx_h, 2)
1043 GEN_VEXT_VX(vand_vx_w, 4)
1044 GEN_VEXT_VX(vand_vx_d, 8)
1045 GEN_VEXT_VX(vor_vx_b, 1)
1046 GEN_VEXT_VX(vor_vx_h, 2)
1047 GEN_VEXT_VX(vor_vx_w, 4)
1048 GEN_VEXT_VX(vor_vx_d, 8)
1049 GEN_VEXT_VX(vxor_vx_b, 1)
1050 GEN_VEXT_VX(vxor_vx_h, 2)
1051 GEN_VEXT_VX(vxor_vx_w, 4)
1052 GEN_VEXT_VX(vxor_vx_d, 8)
1053 
1054 /* Vector Single-Width Bit Shift Instructions */
1055 #define DO_SLL(N, M)  (N << (M))
1056 #define DO_SRL(N, M)  (N >> (M))
1057 
1058 /* generate the helpers for shift instructions with two vector operands */
1059 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1060 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1061                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1062 {                                                                         \
1063     uint32_t vm = vext_vm(desc);                                          \
1064     uint32_t vl = env->vl;                                                \
1065     uint32_t esz = sizeof(TS1);                                           \
1066     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1067     uint32_t vta = vext_vta(desc);                                        \
1068     uint32_t vma = vext_vma(desc);                                        \
1069     uint32_t i;                                                           \
1070                                                                           \
1071     for (i = env->vstart; i < vl; i++) {                                  \
1072         if (!vm && !vext_elem_mask(v0, i)) {                              \
1073             /* set masked-off elements to 1s */                           \
1074             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1075             continue;                                                     \
1076         }                                                                 \
1077         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1078         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1079         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1080     }                                                                     \
1081     env->vstart = 0;                                                      \
1082     /* set tail elements to 1s */                                         \
1083     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1084 }
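
/*
 * Note: MASK keeps only the low log2(SEW) bits of the shift amount
 * (log2(2 * SEW) bits for the narrowing shifts further below), and the
 * signedness of TS2 selects the shift flavour: vsra reuses DO_SRL with a
 * signed TS2, relying on ">>" of a signed value being an arithmetic shift
 * on the compilers QEMU supports.  For example, vsra.vv on int8_t elements
 * with s1 = 9 shifts by 9 & 0x7 = 1.
 */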
1085 
1086 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1087 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1088 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1089 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1090 
1091 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1092 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1093 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1094 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1095 
1096 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1097 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1098 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1099 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1100 
1101 /*
1102  * generate the helpers for shift instructions with one vector and one scalar operand
1103  */
1104 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1105 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1106                   void *vs2, CPURISCVState *env,            \
1107                   uint32_t desc)                            \
1108 {                                                           \
1109     uint32_t vm = vext_vm(desc);                            \
1110     uint32_t vl = env->vl;                                  \
1111     uint32_t esz = sizeof(TD);                              \
1112     uint32_t total_elems =                                  \
1113         vext_get_total_elems(env, desc, esz);               \
1114     uint32_t vta = vext_vta(desc);                          \
1115     uint32_t vma = vext_vma(desc);                          \
1116     uint32_t i;                                             \
1117                                                             \
1118     for (i = env->vstart; i < vl; i++) {                    \
1119         if (!vm && !vext_elem_mask(v0, i)) {                \
1120             /* set masked-off elements to 1s */             \
1121             vext_set_elems_1s(vd, vma, i * esz,             \
1122                               (i + 1) * esz);               \
1123             continue;                                       \
1124         }                                                   \
1125         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1126         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1127     }                                                       \
1128     env->vstart = 0;                                        \
1129     /* set tail elements to 1s */                           \
1130     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1131 }
1132 
1133 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1134 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1135 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1136 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1137 
1138 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1139 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1140 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1141 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1142 
1143 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1144 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1145 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1146 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1147 
1148 /* Vector Narrowing Integer Right Shift Instructions */
1149 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1150 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1151 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1152 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1153 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1154 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1155 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1156 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1157 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1158 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1159 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1160 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1161 
1162 /* Vector Integer Comparison Instructions */
1163 #define DO_MSEQ(N, M) (N == M)
1164 #define DO_MSNE(N, M) (N != M)
1165 #define DO_MSLT(N, M) (N < M)
1166 #define DO_MSLE(N, M) (N <= M)
1167 #define DO_MSGT(N, M) (N > M)
1168 
1169 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1170 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1171                   CPURISCVState *env, uint32_t desc)          \
1172 {                                                             \
1173     uint32_t vm = vext_vm(desc);                              \
1174     uint32_t vl = env->vl;                                    \
1175     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1176     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1177     uint32_t vma = vext_vma(desc);                            \
1178     uint32_t i;                                               \
1179                                                               \
1180     for (i = env->vstart; i < vl; i++) {                      \
1181         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1182         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1183         if (!vm && !vext_elem_mask(v0, i)) {                  \
1184             /* set masked-off elements to 1s */               \
1185             if (vma) {                                        \
1186                 vext_set_elem_mask(vd, i, 1);                 \
1187             }                                                 \
1188             continue;                                         \
1189         }                                                     \
1190         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1191     }                                                         \
1192     env->vstart = 0;                                          \
1193     /*
1194      * The mask destination register is always tail-agnostic,
1195      * so set the tail elements to 1s.
1196      */                                                       \
1197     if (vta_all_1s) {                                         \
1198         for (; i < total_elems; i++) {                        \
1199             vext_set_elem_mask(vd, i, 1);                     \
1200         }                                                     \
1201     }                                                         \
1202 }
1203 
1204 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1205 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1206 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1207 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1208 
1209 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1210 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1211 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1212 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1213 
1214 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1215 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1216 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1217 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1218 
1219 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1220 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1221 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1222 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1223 
1224 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1225 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1226 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1227 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1228 
1229 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1230 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1231 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1232 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1233 
1234 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1235 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1236                   CPURISCVState *env, uint32_t desc)                \
1237 {                                                                   \
1238     uint32_t vm = vext_vm(desc);                                    \
1239     uint32_t vl = env->vl;                                          \
1240     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1241     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1242     uint32_t vma = vext_vma(desc);                                  \
1243     uint32_t i;                                                     \
1244                                                                     \
1245     for (i = env->vstart; i < vl; i++) {                            \
1246         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1247         if (!vm && !vext_elem_mask(v0, i)) {                        \
1248             /* set masked-off elements to 1s */                     \
1249             if (vma) {                                              \
1250                 vext_set_elem_mask(vd, i, 1);                       \
1251             }                                                       \
1252             continue;                                               \
1253         }                                                           \
1254         vext_set_elem_mask(vd, i,                                   \
1255                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1256     }                                                               \
1257     env->vstart = 0;                                                \
1258     /*
1259      * mask destination register is always tail-agnostic
1260      * set tail elements to 1s
1261      */                                                             \
1262     if (vta_all_1s) {                                               \
1263         for (; i < total_elems; i++) {                              \
1264             vext_set_elem_mask(vd, i, 1);                           \
1265         }                                                           \
1266     }                                                               \
1267 }
1268 
1269 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1270 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1271 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1272 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1273 
1274 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1275 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1276 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1277 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1278 
1279 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1280 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1281 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1282 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1283 
1284 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1285 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1286 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1287 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1288 
1289 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1290 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1291 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1292 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1293 
1294 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1295 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1296 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1297 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1298 
1299 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1300 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1301 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1302 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1303 
1304 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1305 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1306 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1307 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1308 
1309 /* Vector Integer Min/Max Instructions */
1310 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1311 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1312 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1313 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1314 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1315 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1316 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1317 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1318 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1319 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1320 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1321 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1322 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1323 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1324 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1325 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1326 GEN_VEXT_VV(vminu_vv_b, 1)
1327 GEN_VEXT_VV(vminu_vv_h, 2)
1328 GEN_VEXT_VV(vminu_vv_w, 4)
1329 GEN_VEXT_VV(vminu_vv_d, 8)
1330 GEN_VEXT_VV(vmin_vv_b, 1)
1331 GEN_VEXT_VV(vmin_vv_h, 2)
1332 GEN_VEXT_VV(vmin_vv_w, 4)
1333 GEN_VEXT_VV(vmin_vv_d, 8)
1334 GEN_VEXT_VV(vmaxu_vv_b, 1)
1335 GEN_VEXT_VV(vmaxu_vv_h, 2)
1336 GEN_VEXT_VV(vmaxu_vv_w, 4)
1337 GEN_VEXT_VV(vmaxu_vv_d, 8)
1338 GEN_VEXT_VV(vmax_vv_b, 1)
1339 GEN_VEXT_VV(vmax_vv_h, 2)
1340 GEN_VEXT_VV(vmax_vv_w, 4)
1341 GEN_VEXT_VV(vmax_vv_d, 8)
1342 
1343 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1344 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1345 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1346 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1347 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1348 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1349 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1350 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1351 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1352 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1353 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1354 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1355 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1356 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1357 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1358 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1359 GEN_VEXT_VX(vminu_vx_b, 1)
1360 GEN_VEXT_VX(vminu_vx_h, 2)
1361 GEN_VEXT_VX(vminu_vx_w, 4)
1362 GEN_VEXT_VX(vminu_vx_d, 8)
1363 GEN_VEXT_VX(vmin_vx_b, 1)
1364 GEN_VEXT_VX(vmin_vx_h, 2)
1365 GEN_VEXT_VX(vmin_vx_w, 4)
1366 GEN_VEXT_VX(vmin_vx_d, 8)
1367 GEN_VEXT_VX(vmaxu_vx_b, 1)
1368 GEN_VEXT_VX(vmaxu_vx_h, 2)
1369 GEN_VEXT_VX(vmaxu_vx_w, 4)
1370 GEN_VEXT_VX(vmaxu_vx_d, 8)
1371 GEN_VEXT_VX(vmax_vx_b, 1)
1372 GEN_VEXT_VX(vmax_vx_h, 2)
1373 GEN_VEXT_VX(vmax_vx_w, 4)
1374 GEN_VEXT_VX(vmax_vx_d, 8)
1375 
1376 /* Vector Single-Width Integer Multiply Instructions */
1377 #define DO_MUL(N, M) (N * M)
1378 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1379 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1380 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1381 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1382 GEN_VEXT_VV(vmul_vv_b, 1)
1383 GEN_VEXT_VV(vmul_vv_h, 2)
1384 GEN_VEXT_VV(vmul_vv_w, 4)
1385 GEN_VEXT_VV(vmul_vv_d, 8)
1386 
1387 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1388 {
1389     return (int16_t)s2 * (int16_t)s1 >> 8;
1390 }
1391 
1392 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1393 {
1394     return (int32_t)s2 * (int32_t)s1 >> 16;
1395 }
1396 
1397 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1398 {
1399     return (int64_t)s2 * (int64_t)s1 >> 32;
1400 }
1401 
1402 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1403 {
1404     uint64_t hi_64, lo_64;
1405 
1406     muls64(&lo_64, &hi_64, s1, s2);
1407     return hi_64;
1408 }
1409 
1410 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1411 {
1412     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1413 }
1414 
1415 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1416 {
1417     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1418 }
1419 
1420 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1421 {
1422     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1423 }
1424 
1425 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1426 {
1427     uint64_t hi_64, lo_64;
1428 
1429     mulu64(&lo_64, &hi_64, s2, s1);
1430     return hi_64;
1431 }
1432 
1433 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1434 {
1435     return (int16_t)s2 * (uint16_t)s1 >> 8;
1436 }
1437 
1438 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1439 {
1440     return (int32_t)s2 * (uint32_t)s1 >> 16;
1441 }
1442 
1443 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1444 {
1445     return (int64_t)s2 * (uint64_t)s1 >> 32;
1446 }
1447 
1448 /*
1449  * Let  A = signed operand (s2),
1450  *      B = unsigned operand (s1),
1451  *      P = mulu64(A, B), unsigned product of the raw bit patterns,
1452  *      SP = signed product.
1453  *
1454  * If A < 0, the bit pattern of A read as unsigned is 2 ** 64 + A, so
1455  *      P  = (2 ** 64 + A) * B
1456  *         = A * B + 2 ** 64 * B
1457  * and therefore
1458  *      SP = A * B
1459  *         = P - 2 ** 64 * B
1460  * If A >= 0,
1461  *      SP = P
1462  *
1463  * Subtracting 2 ** 64 * B only changes the high 64 bits, hence
1464  *      HI_P -= (A < 0 ? B : 0)
1465  */
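
/*
 * Purely illustrative 8-bit analogue of the identity above (there is no
 * 8-bit helper here; the numbers are only for intuition):
 *     A = -2 (bit pattern 0xFE), B = 3
 *     P  = 0xFE * 3 = 0x02FA        -> HI_P = 0x02
 *     SP = -2 * 3  = -6 = 0xFFFA    -> high byte 0xFF = 0x02 - B (mod 2 ** 8)
 */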
1466 
1467 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1468 {
1469     uint64_t hi_64, lo_64;
1470 
1471     mulu64(&lo_64, &hi_64, s2, s1);
1472 
1473     hi_64 -= s2 < 0 ? s1 : 0;
1474     return hi_64;
1475 }
1476 
1477 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1478 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1479 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1480 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1481 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1482 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1483 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1484 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1485 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1486 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1487 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1488 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1489 GEN_VEXT_VV(vmulh_vv_b, 1)
1490 GEN_VEXT_VV(vmulh_vv_h, 2)
1491 GEN_VEXT_VV(vmulh_vv_w, 4)
1492 GEN_VEXT_VV(vmulh_vv_d, 8)
1493 GEN_VEXT_VV(vmulhu_vv_b, 1)
1494 GEN_VEXT_VV(vmulhu_vv_h, 2)
1495 GEN_VEXT_VV(vmulhu_vv_w, 4)
1496 GEN_VEXT_VV(vmulhu_vv_d, 8)
1497 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1498 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1499 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1500 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1501 
1502 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1503 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1504 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1505 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1506 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1507 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1508 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1509 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1510 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1511 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1512 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1513 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1514 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1515 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1516 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1517 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1518 GEN_VEXT_VX(vmul_vx_b, 1)
1519 GEN_VEXT_VX(vmul_vx_h, 2)
1520 GEN_VEXT_VX(vmul_vx_w, 4)
1521 GEN_VEXT_VX(vmul_vx_d, 8)
1522 GEN_VEXT_VX(vmulh_vx_b, 1)
1523 GEN_VEXT_VX(vmulh_vx_h, 2)
1524 GEN_VEXT_VX(vmulh_vx_w, 4)
1525 GEN_VEXT_VX(vmulh_vx_d, 8)
1526 GEN_VEXT_VX(vmulhu_vx_b, 1)
1527 GEN_VEXT_VX(vmulhu_vx_h, 2)
1528 GEN_VEXT_VX(vmulhu_vx_w, 4)
1529 GEN_VEXT_VX(vmulhu_vx_d, 8)
1530 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1531 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1532 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1533 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1534 
1535 /* Vector Integer Divide Instructions */
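/*
 * The macros below encode the RISC-V corner cases: division by zero
 * returns all ones for (v)div(u) and the dividend for (v)rem(u); signed
 * overflow (most-negative dividend with divisor -1, detected as N == -N)
 * returns the dividend for vdiv and 0 for vrem.
 */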
1536 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1537 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1538 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1539         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1540 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1541         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1542 
1543 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1544 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1545 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1546 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1547 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1548 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1549 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1550 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1551 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1552 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1553 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1554 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1555 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1556 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1557 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1558 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1559 GEN_VEXT_VV(vdivu_vv_b, 1)
1560 GEN_VEXT_VV(vdivu_vv_h, 2)
1561 GEN_VEXT_VV(vdivu_vv_w, 4)
1562 GEN_VEXT_VV(vdivu_vv_d, 8)
1563 GEN_VEXT_VV(vdiv_vv_b, 1)
1564 GEN_VEXT_VV(vdiv_vv_h, 2)
1565 GEN_VEXT_VV(vdiv_vv_w, 4)
1566 GEN_VEXT_VV(vdiv_vv_d, 8)
1567 GEN_VEXT_VV(vremu_vv_b, 1)
1568 GEN_VEXT_VV(vremu_vv_h, 2)
1569 GEN_VEXT_VV(vremu_vv_w, 4)
1570 GEN_VEXT_VV(vremu_vv_d, 8)
1571 GEN_VEXT_VV(vrem_vv_b, 1)
1572 GEN_VEXT_VV(vrem_vv_h, 2)
1573 GEN_VEXT_VV(vrem_vv_w, 4)
1574 GEN_VEXT_VV(vrem_vv_d, 8)
1575 
1576 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1577 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1578 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1579 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1580 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1581 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1582 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1583 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1584 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1585 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1586 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1587 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1588 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1589 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1590 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1591 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1592 GEN_VEXT_VX(vdivu_vx_b, 1)
1593 GEN_VEXT_VX(vdivu_vx_h, 2)
1594 GEN_VEXT_VX(vdivu_vx_w, 4)
1595 GEN_VEXT_VX(vdivu_vx_d, 8)
1596 GEN_VEXT_VX(vdiv_vx_b, 1)
1597 GEN_VEXT_VX(vdiv_vx_h, 2)
1598 GEN_VEXT_VX(vdiv_vx_w, 4)
1599 GEN_VEXT_VX(vdiv_vx_d, 8)
1600 GEN_VEXT_VX(vremu_vx_b, 1)
1601 GEN_VEXT_VX(vremu_vx_h, 2)
1602 GEN_VEXT_VX(vremu_vx_w, 4)
1603 GEN_VEXT_VX(vremu_vx_d, 8)
1604 GEN_VEXT_VX(vrem_vx_b, 1)
1605 GEN_VEXT_VX(vrem_vx_h, 2)
1606 GEN_VEXT_VX(vrem_vx_w, 4)
1607 GEN_VEXT_VX(vrem_vx_d, 8)
1608 
1609 /* Vector Widening Integer Multiply Instructions */
1610 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1611 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1612 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1613 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1614 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1615 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1616 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1617 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1618 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1619 GEN_VEXT_VV(vwmul_vv_b, 2)
1620 GEN_VEXT_VV(vwmul_vv_h, 4)
1621 GEN_VEXT_VV(vwmul_vv_w, 8)
1622 GEN_VEXT_VV(vwmulu_vv_b, 2)
1623 GEN_VEXT_VV(vwmulu_vv_h, 4)
1624 GEN_VEXT_VV(vwmulu_vv_w, 8)
1625 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1626 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1627 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1628 
1629 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1630 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1631 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1632 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1633 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1634 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1635 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1636 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1637 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1638 GEN_VEXT_VX(vwmul_vx_b, 2)
1639 GEN_VEXT_VX(vwmul_vx_h, 4)
1640 GEN_VEXT_VX(vwmul_vx_w, 8)
1641 GEN_VEXT_VX(vwmulu_vx_b, 2)
1642 GEN_VEXT_VX(vwmulu_vx_h, 4)
1643 GEN_VEXT_VX(vwmulu_vx_w, 8)
1644 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1645 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1646 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1647 
1648 /* Vector Single-Width Integer Multiply-Add Instructions */
1649 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1650 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1651 {                                                                  \
1652     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1653     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1654     TD d = *((TD *)vd + HD(i));                                    \
1655     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1656 }
1657 
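/*
 * OPIVV3/OPIVX3 call OP as OP(s2, s1, d), so with N = vs2, M = vs1 (or rs1)
 * and D = vd the macros below match the RVV definitions:
 *     vmacc: vd = vs1 * vs2 + vd      vnmsac: vd = -(vs1 * vs2) + vd
 *     vmadd: vd = vs1 * vd  + vs2     vnmsub: vd = -(vs1 * vd)  + vs2
 */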
1658 #define DO_MACC(N, M, D) (M * N + D)
1659 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1660 #define DO_MADD(N, M, D) (M * D + N)
1661 #define DO_NMSUB(N, M, D) (-(M * D) + N)
1662 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1663 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1664 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1665 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1666 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1667 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1668 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1669 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1670 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1671 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1672 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1673 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1674 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1675 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1676 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1677 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1678 GEN_VEXT_VV(vmacc_vv_b, 1)
1679 GEN_VEXT_VV(vmacc_vv_h, 2)
1680 GEN_VEXT_VV(vmacc_vv_w, 4)
1681 GEN_VEXT_VV(vmacc_vv_d, 8)
1682 GEN_VEXT_VV(vnmsac_vv_b, 1)
1683 GEN_VEXT_VV(vnmsac_vv_h, 2)
1684 GEN_VEXT_VV(vnmsac_vv_w, 4)
1685 GEN_VEXT_VV(vnmsac_vv_d, 8)
1686 GEN_VEXT_VV(vmadd_vv_b, 1)
1687 GEN_VEXT_VV(vmadd_vv_h, 2)
1688 GEN_VEXT_VV(vmadd_vv_w, 4)
1689 GEN_VEXT_VV(vmadd_vv_d, 8)
1690 GEN_VEXT_VV(vnmsub_vv_b, 1)
1691 GEN_VEXT_VV(vnmsub_vv_h, 2)
1692 GEN_VEXT_VV(vnmsub_vv_w, 4)
1693 GEN_VEXT_VV(vnmsub_vv_d, 8)
1694 
1695 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1696 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1697 {                                                                   \
1698     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1699     TD d = *((TD *)vd + HD(i));                                     \
1700     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1701 }
1702 
1703 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1704 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1705 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1706 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1707 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1708 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1709 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1710 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1711 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1712 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1713 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1714 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1715 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1716 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1717 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1718 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1719 GEN_VEXT_VX(vmacc_vx_b, 1)
1720 GEN_VEXT_VX(vmacc_vx_h, 2)
1721 GEN_VEXT_VX(vmacc_vx_w, 4)
1722 GEN_VEXT_VX(vmacc_vx_d, 8)
1723 GEN_VEXT_VX(vnmsac_vx_b, 1)
1724 GEN_VEXT_VX(vnmsac_vx_h, 2)
1725 GEN_VEXT_VX(vnmsac_vx_w, 4)
1726 GEN_VEXT_VX(vnmsac_vx_d, 8)
1727 GEN_VEXT_VX(vmadd_vx_b, 1)
1728 GEN_VEXT_VX(vmadd_vx_h, 2)
1729 GEN_VEXT_VX(vmadd_vx_w, 4)
1730 GEN_VEXT_VX(vmadd_vx_d, 8)
1731 GEN_VEXT_VX(vnmsub_vx_b, 1)
1732 GEN_VEXT_VX(vnmsub_vx_h, 2)
1733 GEN_VEXT_VX(vnmsub_vx_w, 4)
1734 GEN_VEXT_VX(vnmsub_vx_d, 8)
1735 
1736 /* Vector Widening Integer Multiply-Add Instructions */
1737 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1738 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1739 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1740 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1741 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1742 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1743 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1744 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1745 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1746 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1747 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1748 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1749 GEN_VEXT_VV(vwmacc_vv_b, 2)
1750 GEN_VEXT_VV(vwmacc_vv_h, 4)
1751 GEN_VEXT_VV(vwmacc_vv_w, 8)
1752 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1753 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1754 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1755 
1756 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1757 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1758 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1759 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1760 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1761 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1762 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1763 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1764 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1765 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1766 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1767 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1768 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1769 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1770 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1771 GEN_VEXT_VX(vwmacc_vx_b, 2)
1772 GEN_VEXT_VX(vwmacc_vx_h, 4)
1773 GEN_VEXT_VX(vwmacc_vx_w, 8)
1774 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1775 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1776 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1777 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1778 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1779 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1780 
1781 /* Vector Integer Merge and Move Instructions */
1782 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1783 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1784                   uint32_t desc)                                     \
1785 {                                                                    \
1786     uint32_t vl = env->vl;                                           \
1787     uint32_t esz = sizeof(ETYPE);                                    \
1788     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1789     uint32_t vta = vext_vta(desc);                                   \
1790     uint32_t i;                                                      \
1791                                                                      \
1792     for (i = env->vstart; i < vl; i++) {                             \
1793         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1794         *((ETYPE *)vd + H(i)) = s1;                                  \
1795     }                                                                \
1796     env->vstart = 0;                                                 \
1797     /* set tail elements to 1s */                                    \
1798     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1799 }
1800 
1801 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1802 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1803 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1804 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1805 
1806 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1807 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1808                   uint32_t desc)                                     \
1809 {                                                                    \
1810     uint32_t vl = env->vl;                                           \
1811     uint32_t esz = sizeof(ETYPE);                                    \
1812     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1813     uint32_t vta = vext_vta(desc);                                   \
1814     uint32_t i;                                                      \
1815                                                                      \
1816     for (i = env->vstart; i < vl; i++) {                             \
1817         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1818     }                                                                \
1819     env->vstart = 0;                                                 \
1820     /* set tail elements to 1s */                                    \
1821     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1822 }
1823 
1824 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1825 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1826 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1827 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1828 
1829 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1830 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1831                   CPURISCVState *env, uint32_t desc)                 \
1832 {                                                                    \
1833     uint32_t vl = env->vl;                                           \
1834     uint32_t esz = sizeof(ETYPE);                                    \
1835     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1836     uint32_t vta = vext_vta(desc);                                   \
1837     uint32_t i;                                                      \
1838                                                                      \
1839     for (i = env->vstart; i < vl; i++) {                             \
1840         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1841         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1842     }                                                                \
1843     env->vstart = 0;                                                 \
1844     /* set tail elements to 1s */                                    \
1845     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1846 }
1847 
1848 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1849 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1850 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1851 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1852 
1853 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1854 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1855                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1856 {                                                                    \
1857     uint32_t vl = env->vl;                                           \
1858     uint32_t esz = sizeof(ETYPE);                                    \
1859     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1860     uint32_t vta = vext_vta(desc);                                   \
1861     uint32_t i;                                                      \
1862                                                                      \
1863     for (i = env->vstart; i < vl; i++) {                             \
1864         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
1865         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
1866                    (ETYPE)(target_long)s1);                          \
1867         *((ETYPE *)vd + H(i)) = d;                                   \
1868     }                                                                \
1869     env->vstart = 0;                                                 \
1870     /* set tail elements to 1s */                                    \
1871     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1872 }
1873 
1874 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
1875 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
1876 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
1877 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1878 
1879 /*
1880  * Vector Fixed-Point Arithmetic Instructions
1881  */
1882 
1883 /* Vector Single-Width Saturating Add and Subtract */
1884 
1885 /*
1886  * Fixed-point instructions share a rounding mode (vxrm) and saturation
1887  * flag (vxsat), so define the common helper macros for them here.
1888  */
1889 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
1890                           CPURISCVState *env, int vxrm);
1891 
1892 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
1893 static inline void                                                  \
1894 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
1895           CPURISCVState *env, int vxrm)                             \
1896 {                                                                   \
1897     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
1898     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1899     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
1900 }
1901 
1902 static inline void
1903 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1904              CPURISCVState *env,
1905              uint32_t vl, uint32_t vm, int vxrm,
1906              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
1907 {
1908     for (uint32_t i = env->vstart; i < vl; i++) {
1909         if (!vm && !vext_elem_mask(v0, i)) {
1910             /* set masked-off elements to 1s */
1911             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
1912             continue;
1913         }
1914         fn(vd, vs1, vs2, i, env, vxrm);
1915     }
1916     env->vstart = 0;
1917 }
1918 
1919 static inline void
1920 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
1921              CPURISCVState *env,
1922              uint32_t desc,
1923              opivv2_rm_fn *fn, uint32_t esz)
1924 {
1925     uint32_t vm = vext_vm(desc);
1926     uint32_t vl = env->vl;
1927     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
1928     uint32_t vta = vext_vta(desc);
1929     uint32_t vma = vext_vma(desc);
1930 
1931     switch (env->vxrm) {
1932     case 0: /* rnu */
1933         vext_vv_rm_1(vd, v0, vs1, vs2,
1934                      env, vl, vm, 0, fn, vma, esz);
1935         break;
1936     case 1: /* rne */
1937         vext_vv_rm_1(vd, v0, vs1, vs2,
1938                      env, vl, vm, 1, fn, vma, esz);
1939         break;
1940     case 2: /* rdn */
1941         vext_vv_rm_1(vd, v0, vs1, vs2,
1942                      env, vl, vm, 2, fn, vma, esz);
1943         break;
1944     default: /* rod */
1945         vext_vv_rm_1(vd, v0, vs1, vs2,
1946                      env, vl, vm, 3, fn, vma, esz);
1947         break;
1948     }
1949     /* set tail elements to 1s */
1950     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
1951 }
1952 
1953 /* generate helpers for fixed point instructions with OPIVV format */
1954 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
1955 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
1956                   CPURISCVState *env, uint32_t desc)            \
1957 {                                                               \
1958     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
1959                  do_##NAME, ESZ);                               \
1960 }
1961 
1962 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
1963                              uint8_t b)
1964 {
1965     uint8_t res = a + b;
1966     if (res < a) {
1967         res = UINT8_MAX;
1968         env->vxsat = 0x1;
1969     }
1970     return res;
1971 }
1972 
1973 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
1974                                uint16_t b)
1975 {
1976     uint16_t res = a + b;
1977     if (res < a) {
1978         res = UINT16_MAX;
1979         env->vxsat = 0x1;
1980     }
1981     return res;
1982 }
1983 
1984 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
1985                                uint32_t b)
1986 {
1987     uint32_t res = a + b;
1988     if (res < a) {
1989         res = UINT32_MAX;
1990         env->vxsat = 0x1;
1991     }
1992     return res;
1993 }
1994 
1995 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
1996                                uint64_t b)
1997 {
1998     uint64_t res = a + b;
1999     if (res < a) {
2000         res = UINT64_MAX;
2001         env->vxsat = 0x1;
2002     }
2003     return res;
2004 }
2005 
2006 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2007 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2008 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2009 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2010 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2011 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2012 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2013 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2014 
2015 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2016                           CPURISCVState *env, int vxrm);
2017 
2018 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2019 static inline void                                                  \
2020 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2021           CPURISCVState *env, int vxrm)                             \
2022 {                                                                   \
2023     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2024     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2025 }
2026 
2027 static inline void
2028 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2029              CPURISCVState *env,
2030              uint32_t vl, uint32_t vm, int vxrm,
2031              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2032 {
2033     for (uint32_t i = env->vstart; i < vl; i++) {
2034         if (!vm && !vext_elem_mask(v0, i)) {
2035             /* set masked-off elements to 1s */
2036             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2037             continue;
2038         }
2039         fn(vd, s1, vs2, i, env, vxrm);
2040     }
2041     env->vstart = 0;
2042 }
2043 
2044 static inline void
2045 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2046              CPURISCVState *env,
2047              uint32_t desc,
2048              opivx2_rm_fn *fn, uint32_t esz)
2049 {
2050     uint32_t vm = vext_vm(desc);
2051     uint32_t vl = env->vl;
2052     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2053     uint32_t vta = vext_vta(desc);
2054     uint32_t vma = vext_vma(desc);
2055 
2056     switch (env->vxrm) {
2057     case 0: /* rnu */
2058         vext_vx_rm_1(vd, v0, s1, vs2,
2059                      env, vl, vm, 0, fn, vma, esz);
2060         break;
2061     case 1: /* rne */
2062         vext_vx_rm_1(vd, v0, s1, vs2,
2063                      env, vl, vm, 1, fn, vma, esz);
2064         break;
2065     case 2: /* rdn */
2066         vext_vx_rm_1(vd, v0, s1, vs2,
2067                      env, vl, vm, 2, fn, vma, esz);
2068         break;
2069     default: /* rod */
2070         vext_vx_rm_1(vd, v0, s1, vs2,
2071                      env, vl, vm, 3, fn, vma, esz);
2072         break;
2073     }
2074     /* set tail elements to 1s */
2075     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2076 }
2077 
2078 /* generate helpers for fixed point instructions with OPIVX format */
2079 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2080 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2081                   void *vs2, CPURISCVState *env,          \
2082                   uint32_t desc)                          \
2083 {                                                         \
2084     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2085                  do_##NAME, ESZ);                         \
2086 }
2087 
2088 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2089 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2090 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2091 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2092 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2093 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2094 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2095 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2096 
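/*
 * For the signed saturating helpers below, (res ^ a) & (res ^ b) has its
 * sign bit set exactly when a and b share a sign that res does not, i.e.
 * the addition overflowed; the ssub* helpers use the mirrored test
 * (res ^ a) & (a ^ b) for subtraction.
 */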
2097 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2098 {
2099     int8_t res = a + b;
2100     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2101         res = a > 0 ? INT8_MAX : INT8_MIN;
2102         env->vxsat = 0x1;
2103     }
2104     return res;
2105 }
2106 
2107 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2108                              int16_t b)
2109 {
2110     int16_t res = a + b;
2111     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2112         res = a > 0 ? INT16_MAX : INT16_MIN;
2113         env->vxsat = 0x1;
2114     }
2115     return res;
2116 }
2117 
2118 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2119                              int32_t b)
2120 {
2121     int32_t res = a + b;
2122     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2123         res = a > 0 ? INT32_MAX : INT32_MIN;
2124         env->vxsat = 0x1;
2125     }
2126     return res;
2127 }
2128 
2129 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2130                              int64_t b)
2131 {
2132     int64_t res = a + b;
2133     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2134         res = a > 0 ? INT64_MAX : INT64_MIN;
2135         env->vxsat = 0x1;
2136     }
2137     return res;
2138 }
2139 
2140 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2141 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2142 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2143 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2144 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2145 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2146 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2147 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2148 
2149 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2150 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2151 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2152 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2153 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2154 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2155 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2156 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2157 
2158 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2159                              uint8_t b)
2160 {
2161     uint8_t res = a - b;
2162     if (res > a) {
2163         res = 0;
2164         env->vxsat = 0x1;
2165     }
2166     return res;
2167 }
2168 
2169 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2170                                uint16_t b)
2171 {
2172     uint16_t res = a - b;
2173     if (res > a) {
2174         res = 0;
2175         env->vxsat = 0x1;
2176     }
2177     return res;
2178 }
2179 
2180 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2181                                uint32_t b)
2182 {
2183     uint32_t res = a - b;
2184     if (res > a) {
2185         res = 0;
2186         env->vxsat = 0x1;
2187     }
2188     return res;
2189 }
2190 
2191 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2192                                uint64_t b)
2193 {
2194     uint64_t res = a - b;
2195     if (res > a) {
2196         res = 0;
2197         env->vxsat = 0x1;
2198     }
2199     return res;
2200 }
2201 
2202 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2203 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2204 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2205 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2206 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2207 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2208 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2209 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2210 
2211 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2212 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2213 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2214 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2215 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2216 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2217 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2218 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2219 
2220 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2221 {
2222     int8_t res = a - b;
2223     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2224         res = a >= 0 ? INT8_MAX : INT8_MIN;
2225         env->vxsat = 0x1;
2226     }
2227     return res;
2228 }
2229 
2230 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2231                              int16_t b)
2232 {
2233     int16_t res = a - b;
2234     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2235         res = a >= 0 ? INT16_MAX : INT16_MIN;
2236         env->vxsat = 0x1;
2237     }
2238     return res;
2239 }
2240 
2241 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2242                              int32_t b)
2243 {
2244     int32_t res = a - b;
2245     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2246         res = a >= 0 ? INT32_MAX : INT32_MIN;
2247         env->vxsat = 0x1;
2248     }
2249     return res;
2250 }
2251 
2252 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2253                              int64_t b)
2254 {
2255     int64_t res = a - b;
2256     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2257         res = a >= 0 ? INT64_MAX : INT64_MIN;
2258         env->vxsat = 0x1;
2259     }
2260     return res;
2261 }
2262 
2263 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2264 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2265 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2266 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2267 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2268 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2269 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2270 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2271 
2272 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2273 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2274 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2275 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2276 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2277 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2278 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2279 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2280 
2281 /* Vector Single-Width Averaging Add and Subtract */
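/*
 * get_round() returns the 0/1 increment to add after shifting v right by
 * `shift` bits, according to the fixed-point rounding mode vxrm.
 * For example, v = 0b1011 with shift = 2 under rnu: the bit below the cut
 * is 1, so (v >> 2) + 1 = 3, i.e. 2.75 rounded to nearest (up).
 */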
2282 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2283 {
2284     uint8_t d = extract64(v, shift, 1);
2285     uint8_t d1;
2286     uint64_t D1, D2;
2287 
2288     if (shift == 0 || shift > 64) {
2289         return 0;
2290     }
2291 
2292     d1 = extract64(v, shift - 1, 1);
2293     D1 = extract64(v, 0, shift);
2294     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2295         return d1;
2296     } else if (vxrm == 1) { /* round-to-nearest-even */
2297         if (shift > 1) {
2298             D2 = extract64(v, 0, shift - 1);
2299             return d1 & ((D2 != 0) | d);
2300         } else {
2301             return d1 & d;
2302         }
2303     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2304         return !d & (D1 != 0);
2305     }
2306     return 0; /* round-down (truncate) */
2307 }
2308 
2309 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2310                              int32_t b)
2311 {
2312     int64_t res = (int64_t)a + b;
2313     uint8_t round = get_round(vxrm, res, 1);
2314 
2315     return (res >> 1) + round;
2316 }
2317 
2318 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2319                              int64_t b)
2320 {
2321     int64_t res = a + b;
2322     uint8_t round = get_round(vxrm, res, 1);
2323     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2324 
2325     /* With signed overflow, bit 64 is inverse of bit 63. */
2326     return ((res >> 1) ^ over) + round;
2327 }
2328 
2329 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2330 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2331 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2332 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2333 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2334 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2335 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2336 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2337 
2338 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2339 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2340 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2341 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2342 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2343 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2344 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2345 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2346 
2347 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2348                                uint32_t a, uint32_t b)
2349 {
2350     uint64_t res = (uint64_t)a + b;
2351     uint8_t round = get_round(vxrm, res, 1);
2352 
2353     return (res >> 1) + round;
2354 }
2355 
2356 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2357                                uint64_t a, uint64_t b)
2358 {
2359     uint64_t res = a + b;
2360     uint8_t round = get_round(vxrm, res, 1);
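    /*
     * (res < a) is the carry out of the 64-bit add; after the >> 1 below
     * it becomes bit 63 of the averaged result.
     */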
2361     uint64_t over = (uint64_t)(res < a) << 63;
2362 
2363     return ((res >> 1) | over) + round;
2364 }
2365 
2366 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2367 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2368 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2369 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2370 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2371 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2372 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2373 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2374 
2375 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2376 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2377 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2378 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2379 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2380 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2381 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2382 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2383 
2384 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2385                              int32_t b)
2386 {
2387     int64_t res = (int64_t)a - b;
2388     uint8_t round = get_round(vxrm, res, 1);
2389 
2390     return (res >> 1) + round;
2391 }
2392 
2393 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2394                              int64_t b)
2395 {
2396     int64_t res = (int64_t)a - b;
2397     uint8_t round = get_round(vxrm, res, 1);
2398     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2399 
2400     /* With signed overflow, bit 64 is inverse of bit 63. */
2401     return ((res >> 1) ^ over) + round;
2402 }
2403 
2404 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2405 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2406 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2407 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2408 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2409 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2410 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2411 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2412 
2413 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2414 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2415 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2416 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2417 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2418 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2419 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2420 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2421 
2422 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2423                                uint32_t a, uint32_t b)
2424 {
2425     int64_t res = (int64_t)a - b;
2426     uint8_t round = get_round(vxrm, res, 1);
2427 
2428     return (res >> 1) + round;
2429 }
2430 
2431 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2432                                uint64_t a, uint64_t b)
2433 {
2434     uint64_t res = (uint64_t)a - b;
2435     uint8_t round = get_round(vxrm, res, 1);
2436     uint64_t over = (uint64_t)(res > a) << 63;
2437 
2438     return ((res >> 1) | over) + round;
2439 }
2440 
2441 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2442 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2443 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2444 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2445 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2446 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2447 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2448 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2449 
2450 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2451 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2452 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2453 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2454 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2455 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2456 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2457 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2458 
2459 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
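/*
 * vsmul: multiply the SEW-wide operands, shift the 2*SEW-wide product
 * right by SEW - 1 with rounding, and saturate to SEW bits, setting vxsat
 * when clipping occurs.  The only product that can still overflow after
 * the shift is INT_MIN * INT_MIN, which the 64-bit helper checks for
 * explicitly before calling muls64().
 */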
2460 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2461 {
2462     uint8_t round;
2463     int16_t res;
2464 
2465     res = (int16_t)a * (int16_t)b;
2466     round = get_round(vxrm, res, 7);
2467     res = (res >> 7) + round;
2468 
2469     if (res > INT8_MAX) {
2470         env->vxsat = 0x1;
2471         return INT8_MAX;
2472     } else if (res < INT8_MIN) {
2473         env->vxsat = 0x1;
2474         return INT8_MIN;
2475     } else {
2476         return res;
2477     }
2478 }
2479 
2480 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2481 {
2482     uint8_t round;
2483     int32_t res;
2484 
2485     res = (int32_t)a * (int32_t)b;
2486     round = get_round(vxrm, res, 15);
2487     res = (res >> 15) + round;
2488 
2489     if (res > INT16_MAX) {
2490         env->vxsat = 0x1;
2491         return INT16_MAX;
2492     } else if (res < INT16_MIN) {
2493         env->vxsat = 0x1;
2494         return INT16_MIN;
2495     } else {
2496         return res;
2497     }
2498 }
2499 
2500 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2501 {
2502     uint8_t round;
2503     int64_t res;
2504 
2505     res = (int64_t)a * (int64_t)b;
2506     round = get_round(vxrm, res, 31);
2507     res = (res >> 31) + round;
2508 
2509     if (res > INT32_MAX) {
2510         env->vxsat = 0x1;
2511         return INT32_MAX;
2512     } else if (res < INT32_MIN) {
2513         env->vxsat = 0x1;
2514         return INT32_MIN;
2515     } else {
2516         return res;
2517     }
2518 }
2519 
2520 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2521 {
2522     uint8_t round;
2523     uint64_t hi_64, lo_64;
2524     int64_t res;
2525 
2526     if (a == INT64_MIN && b == INT64_MIN) {
2527         env->vxsat = 1;
2528         return INT64_MAX;
2529     }
2530 
2531     muls64(&lo_64, &hi_64, a, b);
2532     round = get_round(vxrm, lo_64, 63);
2533     /*
2534      * Cannot overflow, as there are always
2535      * 2 sign bits after multiply.
2536      */
2537     res = (hi_64 << 1) | (lo_64 >> 63);
2538     if (round) {
2539         if (res == INT64_MAX) {
2540             env->vxsat = 1;
2541         } else {
2542             res += 1;
2543         }
2544     }
2545     return res;
2546 }
2547 
2548 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2549 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2550 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2551 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2552 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2553 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2554 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2555 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2556 
2557 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2558 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2559 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2560 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2561 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2562 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2563 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2564 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2565 
2566 /* Vector Single-Width Scaling Shift Instructions */
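/*
 * Scaling shifts: shift right by the second operand taken modulo SEW
 * (the 0x7/0xf/0x1f/0x3f masks below) and add the rounding increment
 * from get_round(); unlike the narrowing clips, no saturation occurs.
 */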
2567 static inline uint8_t
2568 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2569 {
2570     uint8_t round, shift = b & 0x7;
2571     uint8_t res;
2572 
2573     round = get_round(vxrm, a, shift);
2574     res = (a >> shift) + round;
2575     return res;
2576 }
2577 static inline uint16_t
2578 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2579 {
2580     uint8_t round, shift = b & 0xf;
2581 
2582     round = get_round(vxrm, a, shift);
2583     return (a >> shift) + round;
2584 }
2585 static inline uint32_t
2586 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2587 {
2588     uint8_t round, shift = b & 0x1f;
2589 
2590     round = get_round(vxrm, a, shift);
2591     return (a >> shift) + round;
2592 }
2593 static inline uint64_t
2594 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2595 {
2596     uint8_t round, shift = b & 0x3f;
2597 
2598     round = get_round(vxrm, a, shift);
2599     return (a >> shift) + round;
2600 }
2601 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2602 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2603 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2604 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2605 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2606 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2607 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2608 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2609 
2610 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2611 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2612 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2613 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2614 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2615 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2616 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2617 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2618 
2619 static inline int8_t
2620 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2621 {
2622     uint8_t round, shift = b & 0x7;
2623 
2624     round = get_round(vxrm, a, shift);
2625     return (a >> shift) + round;
2626 }
2627 static inline int16_t
2628 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2629 {
2630     uint8_t round, shift = b & 0xf;
2631 
2632     round = get_round(vxrm, a, shift);
2633     return (a >> shift) + round;
2634 }
2635 static inline int32_t
2636 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2637 {
2638     uint8_t round, shift = b & 0x1f;
2639 
2640     round = get_round(vxrm, a, shift);
2641     return (a >> shift) + round;
2642 }
2643 static inline int64_t
2644 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2645 {
2646     uint8_t round, shift = b & 0x3f;
2647 
2648     round = get_round(vxrm, a, shift);
2649     return (a >> shift) + round;
2650 }
2651 
2652 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2653 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2654 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2655 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2656 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2657 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2658 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2659 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2660 
2661 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2662 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2663 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2664 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2665 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2666 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2667 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2668 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2669 
2670 /* Vector Narrowing Fixed-Point Clip Instructions */
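/*
 * Each narrowing clip takes a 2*SEW-wide source element, shifts it right
 * by the amount in the low log2(2*SEW) bits of b with vxrm rounding, and
 * then saturates the result into a SEW-wide destination element: signed
 * for vnclip, unsigned for vnclipu.  vxsat is set whenever the value is
 * clipped.
 */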
2671 static inline int8_t
2672 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2673 {
2674     uint8_t round, shift = b & 0xf;
2675     int16_t res;
2676 
2677     round = get_round(vxrm, a, shift);
2678     res = (a >> shift) + round;
2679     if (res > INT8_MAX) {
2680         env->vxsat = 0x1;
2681         return INT8_MAX;
2682     } else if (res < INT8_MIN) {
2683         env->vxsat = 0x1;
2684         return INT8_MIN;
2685     } else {
2686         return res;
2687     }
2688 }
2689 
2690 static inline int16_t
2691 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2692 {
2693     uint8_t round, shift = b & 0x1f;
2694     int32_t res;
2695 
2696     round = get_round(vxrm, a, shift);
2697     res = (a >> shift) + round;
2698     if (res > INT16_MAX) {
2699         env->vxsat = 0x1;
2700         return INT16_MAX;
2701     } else if (res < INT16_MIN) {
2702         env->vxsat = 0x1;
2703         return INT16_MIN;
2704     } else {
2705         return res;
2706     }
2707 }
2708 
2709 static inline int32_t
2710 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2711 {
2712     uint8_t round, shift = b & 0x3f;
2713     int64_t res;
2714 
2715     round = get_round(vxrm, a, shift);
2716     res = (a >> shift) + round;
2717     if (res > INT32_MAX) {
2718         env->vxsat = 0x1;
2719         return INT32_MAX;
2720     } else if (res < INT32_MIN) {
2721         env->vxsat = 0x1;
2722         return INT32_MIN;
2723     } else {
2724         return res;
2725     }
2726 }
2727 
2728 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2729 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2730 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2731 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2732 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2733 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2734 
2735 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2736 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2737 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2738 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2739 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2740 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2741 
2742 static inline uint8_t
2743 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2744 {
2745     uint8_t round, shift = b & 0xf;
2746     uint16_t res;
2747 
2748     round = get_round(vxrm, a, shift);
2749     res = (a >> shift) + round;
2750     if (res > UINT8_MAX) {
2751         env->vxsat = 0x1;
2752         return UINT8_MAX;
2753     } else {
2754         return res;
2755     }
2756 }
2757 
2758 static inline uint16_t
2759 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2760 {
2761     uint8_t round, shift = b & 0x1f;
2762     uint32_t res;
2763 
2764     round = get_round(vxrm, a, shift);
2765     res = (a >> shift) + round;
2766     if (res > UINT16_MAX) {
2767         env->vxsat = 0x1;
2768         return UINT16_MAX;
2769     } else {
2770         return res;
2771     }
2772 }
2773 
2774 static inline uint32_t
2775 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2776 {
2777     uint8_t round, shift = b & 0x3f;
2778     uint64_t res;
2779 
2780     round = get_round(vxrm, a, shift);
2781     res = (a >> shift) + round;
2782     if (res > UINT32_MAX) {
2783         env->vxsat = 0x1;
2784         return UINT32_MAX;
2785     } else {
2786         return res;
2787     }
2788 }
2789 
2790 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2791 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2792 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2793 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2794 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2795 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2796 
2797 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2798 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2799 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2800 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2801 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2802 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2803 
2804 /*
2805  * Vector Floating-Point Arithmetic Instructions
2806  */
2807 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
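/*
 * OPFVV2 generates a per-element helper do_<NAME>() that applies a binary
 * softfloat operation, and GEN_VEXT_VV_ENV wraps it in the common vector
 * loop: elements before vstart are skipped, masked-off body elements are
 * overwritten with 1s only when the mask policy is agnostic (vma), and the
 * tail is overwritten with 1s only when the tail policy is agnostic (vta).
 * vstart is cleared once the loop completes.
 */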
2808 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2809 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2810                       CPURISCVState *env)                      \
2811 {                                                              \
2812     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2813     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2814     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2815 }
2816 
2817 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
2818 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2819                   void *vs2, CPURISCVState *env,          \
2820                   uint32_t desc)                          \
2821 {                                                         \
2822     uint32_t vm = vext_vm(desc);                          \
2823     uint32_t vl = env->vl;                                \
2824     uint32_t total_elems =                                \
2825         vext_get_total_elems(env, desc, ESZ);             \
2826     uint32_t vta = vext_vta(desc);                        \
2827     uint32_t vma = vext_vma(desc);                        \
2828     uint32_t i;                                           \
2829                                                           \
2830     for (i = env->vstart; i < vl; i++) {                  \
2831         if (!vm && !vext_elem_mask(v0, i)) {              \
2832             /* set masked-off elements to 1s */           \
2833             vext_set_elems_1s(vd, vma, i * ESZ,           \
2834                               (i + 1) * ESZ);             \
2835             continue;                                     \
2836         }                                                 \
2837         do_##NAME(vd, vs1, vs2, i, env);                  \
2838     }                                                     \
2839     env->vstart = 0;                                      \
2840     /* set tail elements to 1s */                         \
2841     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2842                       total_elems * ESZ);                 \
2843 }
2844 
2845 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2846 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2847 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2848 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
2849 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
2850 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
2851 
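/*
 * OPFVF2 and GEN_VEXT_VF are the vector-scalar counterparts: the scalar
 * operand is passed in a uint64_t and truncated to the source element type
 * (T1) before being handed to the softfloat operation; masking, vstart and
 * tail handling follow the same policy as the vector-vector form above.
 */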
2852 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2853 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2854                       CPURISCVState *env)                      \
2855 {                                                              \
2856     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2857     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2858 }
2859 
2860 #define GEN_VEXT_VF(NAME, ESZ)                            \
2861 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2862                   void *vs2, CPURISCVState *env,          \
2863                   uint32_t desc)                          \
2864 {                                                         \
2865     uint32_t vm = vext_vm(desc);                          \
2866     uint32_t vl = env->vl;                                \
2867     uint32_t total_elems =                                \
2868         vext_get_total_elems(env, desc, ESZ);             \
2869     uint32_t vta = vext_vta(desc);                        \
2870     uint32_t vma = vext_vma(desc);                        \
2871     uint32_t i;                                           \
2872                                                           \
2873     for (i = env->vstart; i < vl; i++) {                  \
2874         if (!vm && !vext_elem_mask(v0, i)) {              \
2875             /* set masked-off elements to 1s */           \
2876             vext_set_elems_1s(vd, vma, i * ESZ,           \
2877                               (i + 1) * ESZ);             \
2878             continue;                                     \
2879         }                                                 \
2880         do_##NAME(vd, s1, vs2, i, env);                   \
2881     }                                                     \
2882     env->vstart = 0;                                      \
2883     /* set tail elements to 1s */                         \
2884     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2885                       total_elems * ESZ);                 \
2886 }
2887 
2888 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
2889 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
2890 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
2891 GEN_VEXT_VF(vfadd_vf_h, 2)
2892 GEN_VEXT_VF(vfadd_vf_w, 4)
2893 GEN_VEXT_VF(vfadd_vf_d, 8)
2894 
2895 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
2896 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
2897 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
2898 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
2899 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
2900 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
2901 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
2902 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
2903 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
2904 GEN_VEXT_VF(vfsub_vf_h, 2)
2905 GEN_VEXT_VF(vfsub_vf_w, 4)
2906 GEN_VEXT_VF(vfsub_vf_d, 8)
2907 
2908 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
2909 {
2910     return float16_sub(b, a, s);
2911 }
2912 
2913 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
2914 {
2915     return float32_sub(b, a, s);
2916 }
2917 
2918 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
2919 {
2920     return float64_sub(b, a, s);
2921 }
2922 
2923 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
2924 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
2925 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
2926 GEN_VEXT_VF(vfrsub_vf_h, 2)
2927 GEN_VEXT_VF(vfrsub_vf_w, 4)
2928 GEN_VEXT_VF(vfrsub_vf_d, 8)
2929 
2930 /* Vector Widening Floating-Point Add/Subtract Instructions */
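/*
 * The widening forms promote each SEW-wide operand to 2*SEW (float16 to
 * float32, float32 to float64) and do the arithmetic in the wider format.
 * The vfw*.wv and vfw*.wf variants (WOP_WUUU_*) take a first operand that
 * is already 2*SEW wide and only convert the second one.
 */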
2931 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
2932 {
2933     return float32_add(float16_to_float32(a, true, s),
2934                        float16_to_float32(b, true, s), s);
2935 }
2936 
2937 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
2938 {
2939     return float64_add(float32_to_float64(a, s),
2940                        float32_to_float64(b, s), s);
2941 
2942 }
2943 
2944 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
2945 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
2946 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
2947 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
2948 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
2949 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
2950 GEN_VEXT_VF(vfwadd_vf_h, 4)
2951 GEN_VEXT_VF(vfwadd_vf_w, 8)
2952 
2953 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
2954 {
2955     return float32_sub(float16_to_float32(a, true, s),
2956                        float16_to_float32(b, true, s), s);
2957 }
2958 
2959 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
2960 {
2961     return float64_sub(float32_to_float64(a, s),
2962                        float32_to_float64(b, s), s);
2963 
2964 }
2965 
2966 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
2967 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
2968 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
2969 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
2970 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
2971 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
2972 GEN_VEXT_VF(vfwsub_vf_h, 4)
2973 GEN_VEXT_VF(vfwsub_vf_w, 8)
2974 
2975 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
2976 {
2977     return float32_add(a, float16_to_float32(b, true, s), s);
2978 }
2979 
2980 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
2981 {
2982     return float64_add(a, float32_to_float64(b, s), s);
2983 }
2984 
2985 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
2986 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
2987 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
2988 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
2989 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
2990 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
2991 GEN_VEXT_VF(vfwadd_wf_h, 4)
2992 GEN_VEXT_VF(vfwadd_wf_w, 8)
2993 
2994 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
2995 {
2996     return float32_sub(a, float16_to_float32(b, true, s), s);
2997 }
2998 
2999 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3000 {
3001     return float64_sub(a, float32_to_float64(b, s), s);
3002 }
3003 
3004 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3005 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3006 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3007 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3008 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3009 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3010 GEN_VEXT_VF(vfwsub_wf_h, 4)
3011 GEN_VEXT_VF(vfwsub_wf_w, 8)
3012 
3013 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3014 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3015 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3016 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3017 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3018 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3019 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3020 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3021 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3022 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3023 GEN_VEXT_VF(vfmul_vf_h, 2)
3024 GEN_VEXT_VF(vfmul_vf_w, 4)
3025 GEN_VEXT_VF(vfmul_vf_d, 8)
3026 
3027 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3028 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3029 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3030 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3031 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3032 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3033 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3034 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3035 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3036 GEN_VEXT_VF(vfdiv_vf_h, 2)
3037 GEN_VEXT_VF(vfdiv_vf_w, 4)
3038 GEN_VEXT_VF(vfdiv_vf_d, 8)
3039 
3040 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3041 {
3042     return float16_div(b, a, s);
3043 }
3044 
3045 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3046 {
3047     return float32_div(b, a, s);
3048 }
3049 
3050 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3051 {
3052     return float64_div(b, a, s);
3053 }
3054 
3055 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3056 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3057 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3058 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3059 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3060 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3061 
3062 /* Vector Widening Floating-Point Multiply */
3063 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3064 {
3065     return float32_mul(float16_to_float32(a, true, s),
3066                        float16_to_float32(b, true, s), s);
3067 }
3068 
3069 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3070 {
3071     return float64_mul(float32_to_float64(a, s),
3072                        float32_to_float64(b, s), s);
3073 
3074 }
3075 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3076 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3077 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3078 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3079 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3080 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3081 GEN_VEXT_VF(vfwmul_vf_h, 4)
3082 GEN_VEXT_VF(vfwmul_vf_w, 8)
3083 
3084 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
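/*
 * OPFVV3 additionally reads the current destination element, so each fused
 * form maps onto a single float*_muladd() call.  The helpers are invoked
 * as OP(s2, s1, d): the *macc and *msac flavours compute vs1 * vs2 +/- vd
 * (arguments a and b multiplied, d as addend), while the *madd and *msub
 * flavours compute vd * vs1 +/- vs2 (d and b multiplied, a as addend);
 * the negated variants select float_muladd_negate_* flags.
 */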
3085 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3086 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3087                       CPURISCVState *env)                          \
3088 {                                                                  \
3089     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3090     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3091     TD d = *((TD *)vd + HD(i));                                    \
3092     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3093 }
3094 
3095 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3096 {
3097     return float16_muladd(a, b, d, 0, s);
3098 }
3099 
3100 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3101 {
3102     return float32_muladd(a, b, d, 0, s);
3103 }
3104 
3105 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3106 {
3107     return float64_muladd(a, b, d, 0, s);
3108 }
3109 
3110 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3111 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3112 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3113 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3114 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3115 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3116 
3117 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3118 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3119                       CPURISCVState *env)                         \
3120 {                                                                 \
3121     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3122     TD d = *((TD *)vd + HD(i));                                   \
3123     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3124 }
3125 
3126 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3127 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3128 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3129 GEN_VEXT_VF(vfmacc_vf_h, 2)
3130 GEN_VEXT_VF(vfmacc_vf_w, 4)
3131 GEN_VEXT_VF(vfmacc_vf_d, 8)
3132 
3133 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3134 {
3135     return float16_muladd(a, b, d, float_muladd_negate_c |
3136                                    float_muladd_negate_product, s);
3137 }
3138 
3139 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3140 {
3141     return float32_muladd(a, b, d, float_muladd_negate_c |
3142                                    float_muladd_negate_product, s);
3143 }
3144 
3145 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3146 {
3147     return float64_muladd(a, b, d, float_muladd_negate_c |
3148                                    float_muladd_negate_product, s);
3149 }
3150 
3151 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3152 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3153 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3154 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3155 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3156 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3157 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3158 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3159 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3160 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3161 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3162 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3163 
3164 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3165 {
3166     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3167 }
3168 
3169 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3170 {
3171     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3172 }
3173 
3174 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3175 {
3176     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3177 }
3178 
3179 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3180 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3181 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3182 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3183 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3184 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3185 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3186 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3187 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3188 GEN_VEXT_VF(vfmsac_vf_h, 2)
3189 GEN_VEXT_VF(vfmsac_vf_w, 4)
3190 GEN_VEXT_VF(vfmsac_vf_d, 8)
3191 
3192 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3193 {
3194     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3195 }
3196 
3197 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3198 {
3199     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3200 }
3201 
3202 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3203 {
3204     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3205 }
3206 
3207 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3208 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3209 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3210 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3211 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3212 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3213 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3214 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3215 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3216 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3217 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3218 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3219 
3220 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3221 {
3222     return float16_muladd(d, b, a, 0, s);
3223 }
3224 
3225 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3226 {
3227     return float32_muladd(d, b, a, 0, s);
3228 }
3229 
3230 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3231 {
3232     return float64_muladd(d, b, a, 0, s);
3233 }
3234 
3235 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3236 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3237 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3238 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3239 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3240 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3241 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3242 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3243 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3244 GEN_VEXT_VF(vfmadd_vf_h, 2)
3245 GEN_VEXT_VF(vfmadd_vf_w, 4)
3246 GEN_VEXT_VF(vfmadd_vf_d, 8)
3247 
3248 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3249 {
3250     return float16_muladd(d, b, a, float_muladd_negate_c |
3251                                    float_muladd_negate_product, s);
3252 }
3253 
3254 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3255 {
3256     return float32_muladd(d, b, a, float_muladd_negate_c |
3257                                    float_muladd_negate_product, s);
3258 }
3259 
3260 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3261 {
3262     return float64_muladd(d, b, a, float_muladd_negate_c |
3263                                    float_muladd_negate_product, s);
3264 }
3265 
3266 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3267 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3268 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3269 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3270 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3271 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3272 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3273 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3274 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3275 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3276 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3277 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3278 
3279 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3280 {
3281     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3282 }
3283 
3284 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3285 {
3286     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3287 }
3288 
3289 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3290 {
3291     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3292 }
3293 
3294 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3295 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3296 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3297 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3298 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3299 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3300 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3301 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3302 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3303 GEN_VEXT_VF(vfmsub_vf_h, 2)
3304 GEN_VEXT_VF(vfmsub_vf_w, 4)
3305 GEN_VEXT_VF(vfmsub_vf_d, 8)
3306 
3307 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3308 {
3309     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3310 }
3311 
3312 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3313 {
3314     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3315 }
3316 
3317 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3318 {
3319     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3320 }
3321 
3322 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3323 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3324 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3325 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3326 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3327 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3328 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3329 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3330 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3331 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3332 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3333 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3334 
3335 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
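/*
 * The widening FMA helpers convert both SEW-wide multiplicands up to 2*SEW
 * before the fused multiply-add and accumulate into a 2*SEW destination
 * element; vfwmaccbf16 follows the same pattern with a bfloat16 to float32
 * conversion of the sources.
 */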
3336 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3337 {
3338     return float32_muladd(float16_to_float32(a, true, s),
3339                           float16_to_float32(b, true, s), d, 0, s);
3340 }
3341 
3342 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3343 {
3344     return float64_muladd(float32_to_float64(a, s),
3345                           float32_to_float64(b, s), d, 0, s);
3346 }
3347 
3348 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3349 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3350 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3351 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3352 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3353 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3354 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3355 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3356 
3357 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3358 {
3359     return float32_muladd(bfloat16_to_float32(a, s),
3360                           bfloat16_to_float32(b, s), d, 0, s);
3361 }
3362 
3363 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3364 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3365 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3366 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3367 
3368 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3369 {
3370     return float32_muladd(float16_to_float32(a, true, s),
3371                           float16_to_float32(b, true, s), d,
3372                           float_muladd_negate_c | float_muladd_negate_product,
3373                           s);
3374 }
3375 
3376 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3377 {
3378     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3379                           d, float_muladd_negate_c |
3380                              float_muladd_negate_product, s);
3381 }
3382 
3383 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3384 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3385 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3386 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3387 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3388 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3389 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3390 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3391 
3392 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3393 {
3394     return float32_muladd(float16_to_float32(a, true, s),
3395                           float16_to_float32(b, true, s), d,
3396                           float_muladd_negate_c, s);
3397 }
3398 
3399 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3400 {
3401     return float64_muladd(float32_to_float64(a, s),
3402                           float32_to_float64(b, s), d,
3403                           float_muladd_negate_c, s);
3404 }
3405 
3406 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3407 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3408 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3409 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3410 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3411 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3412 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3413 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3414 
3415 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3416 {
3417     return float32_muladd(float16_to_float32(a, true, s),
3418                           float16_to_float32(b, true, s), d,
3419                           float_muladd_negate_product, s);
3420 }
3421 
3422 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3423 {
3424     return float64_muladd(float32_to_float64(a, s),
3425                           float32_to_float64(b, s), d,
3426                           float_muladd_negate_product, s);
3427 }
3428 
3429 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3430 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3431 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3432 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3433 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3434 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3435 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3436 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3437 
3438 /* Vector Floating-Point Square-Root Instruction */
3439 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3440 static void do_##NAME(void *vd, void *vs2, int i,      \
3441                       CPURISCVState *env)              \
3442 {                                                      \
3443     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3444     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3445 }
3446 
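/*
 * GEN_VEXT_V_ENV is the unary (single vector source) variant of the loop
 * above.  The explicit vl == 0 check returns before the tail is written:
 * when vstart >= vl (which vl == 0 implies) no destination elements,
 * including tail elements, should be updated.
 */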
3447 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3448 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3449                   CPURISCVState *env, uint32_t desc)   \
3450 {                                                      \
3451     uint32_t vm = vext_vm(desc);                       \
3452     uint32_t vl = env->vl;                             \
3453     uint32_t total_elems =                             \
3454         vext_get_total_elems(env, desc, ESZ);          \
3455     uint32_t vta = vext_vta(desc);                     \
3456     uint32_t vma = vext_vma(desc);                     \
3457     uint32_t i;                                        \
3458                                                        \
3459     if (vl == 0) {                                     \
3460         return;                                        \
3461     }                                                  \
3462     for (i = env->vstart; i < vl; i++) {               \
3463         if (!vm && !vext_elem_mask(v0, i)) {           \
3464             /* set masked-off elements to 1s */        \
3465             vext_set_elems_1s(vd, vma, i * ESZ,        \
3466                               (i + 1) * ESZ);          \
3467             continue;                                  \
3468         }                                              \
3469         do_##NAME(vd, vs2, i, env);                    \
3470     }                                                  \
3471     env->vstart = 0;                                   \
3472     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3473                       total_elems * ESZ);              \
3474 }
3475 
3476 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3477 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3478 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3479 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3480 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3481 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3482 
3483 /*
3484  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3485  *
3486  * Adapted from riscv-v-spec recip.c:
3487  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3488  */
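/*
 * frsqrt7() operates on the raw IEEE encoding: after normalizing a
 * subnormal input, a 7-bit table index is formed from the low bit of the
 * exponent and the top six fraction bits, the table provides the seven
 * most significant fraction bits of the estimate, and the result exponent
 * is (3 * bias - 1 - exp) / 2.  NaNs, negative non-zero values, zeroes and
 * +inf are handled by the per-format wrappers below.
 */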
3489 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3490 {
3491     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3492     uint64_t exp = extract64(f, frac_size, exp_size);
3493     uint64_t frac = extract64(f, 0, frac_size);
3494 
3495     const uint8_t lookup_table[] = {
3496         52, 51, 50, 48, 47, 46, 44, 43,
3497         42, 41, 40, 39, 38, 36, 35, 34,
3498         33, 32, 31, 30, 30, 29, 28, 27,
3499         26, 25, 24, 23, 23, 22, 21, 20,
3500         19, 19, 18, 17, 16, 16, 15, 14,
3501         14, 13, 12, 12, 11, 10, 10, 9,
3502         9, 8, 7, 7, 6, 6, 5, 4,
3503         4, 3, 3, 2, 2, 1, 1, 0,
3504         127, 125, 123, 121, 119, 118, 116, 114,
3505         113, 111, 109, 108, 106, 105, 103, 102,
3506         100, 99, 97, 96, 95, 93, 92, 91,
3507         90, 88, 87, 86, 85, 84, 83, 82,
3508         80, 79, 78, 77, 76, 75, 74, 73,
3509         72, 71, 70, 70, 69, 68, 67, 66,
3510         65, 64, 63, 63, 62, 61, 60, 59,
3511         59, 58, 57, 56, 56, 55, 54, 53
3512     };
3513     const int precision = 7;
3514 
3515     if (exp == 0 && frac != 0) { /* subnormal */
3516         /* Normalize the subnormal. */
3517         while (extract64(frac, frac_size - 1, 1) == 0) {
3518             exp--;
3519             frac <<= 1;
3520         }
3521 
3522         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3523     }
3524 
3525     int idx = ((exp & 1) << (precision - 1)) |
3526               (frac >> (frac_size - precision + 1));
3527     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3528                         (frac_size - precision);
3529     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3530 
3531     uint64_t val = 0;
3532     val = deposit64(val, 0, frac_size, out_frac);
3533     val = deposit64(val, frac_size, exp_size, out_exp);
3534     val = deposit64(val, frac_size + exp_size, 1, sign);
3535     return val;
3536 }
3537 
3538 static float16 frsqrt7_h(float16 f, float_status *s)
3539 {
3540     int exp_size = 5, frac_size = 10;
3541     bool sign = float16_is_neg(f);
3542 
3543     /*
3544      * frsqrt7(sNaN) = canonical NaN
3545      * frsqrt7(-inf) = canonical NaN
3546      * frsqrt7(-normal) = canonical NaN
3547      * frsqrt7(-subnormal) = canonical NaN
3548      */
3549     if (float16_is_signaling_nan(f, s) ||
3550         (float16_is_infinity(f) && sign) ||
3551         (float16_is_normal(f) && sign) ||
3552         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3553         s->float_exception_flags |= float_flag_invalid;
3554         return float16_default_nan(s);
3555     }
3556 
3557     /* frsqrt7(qNaN) = canonical NaN */
3558     if (float16_is_quiet_nan(f, s)) {
3559         return float16_default_nan(s);
3560     }
3561 
3562     /* frsqrt7(+-0) = +-inf */
3563     if (float16_is_zero(f)) {
3564         s->float_exception_flags |= float_flag_divbyzero;
3565         return float16_set_sign(float16_infinity, sign);
3566     }
3567 
3568     /* frsqrt7(+inf) = +0 */
3569     if (float16_is_infinity(f) && !sign) {
3570         return float16_set_sign(float16_zero, sign);
3571     }
3572 
3573     /* +normal, +subnormal */
3574     uint64_t val = frsqrt7(f, exp_size, frac_size);
3575     return make_float16(val);
3576 }
3577 
3578 static float32 frsqrt7_s(float32 f, float_status *s)
3579 {
3580     int exp_size = 8, frac_size = 23;
3581     bool sign = float32_is_neg(f);
3582 
3583     /*
3584      * frsqrt7(sNaN) = canonical NaN
3585      * frsqrt7(-inf) = canonical NaN
3586      * frsqrt7(-normal) = canonical NaN
3587      * frsqrt7(-subnormal) = canonical NaN
3588      */
3589     if (float32_is_signaling_nan(f, s) ||
3590         (float32_is_infinity(f) && sign) ||
3591         (float32_is_normal(f) && sign) ||
3592         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3593         s->float_exception_flags |= float_flag_invalid;
3594         return float32_default_nan(s);
3595     }
3596 
3597     /* frsqrt7(qNaN) = canonical NaN */
3598     if (float32_is_quiet_nan(f, s)) {
3599         return float32_default_nan(s);
3600     }
3601 
3602     /* frsqrt7(+-0) = +-inf */
3603     if (float32_is_zero(f)) {
3604         s->float_exception_flags |= float_flag_divbyzero;
3605         return float32_set_sign(float32_infinity, sign);
3606     }
3607 
3608     /* frsqrt7(+inf) = +0 */
3609     if (float32_is_infinity(f) && !sign) {
3610         return float32_set_sign(float32_zero, sign);
3611     }
3612 
3613     /* +normal, +subnormal */
3614     uint64_t val = frsqrt7(f, exp_size, frac_size);
3615     return make_float32(val);
3616 }
3617 
3618 static float64 frsqrt7_d(float64 f, float_status *s)
3619 {
3620     int exp_size = 11, frac_size = 52;
3621     bool sign = float64_is_neg(f);
3622 
3623     /*
3624      * frsqrt7(sNaN) = canonical NaN
3625      * frsqrt7(-inf) = canonical NaN
3626      * frsqrt7(-normal) = canonical NaN
3627      * frsqrt7(-subnormal) = canonical NaN
3628      */
3629     if (float64_is_signaling_nan(f, s) ||
3630         (float64_is_infinity(f) && sign) ||
3631         (float64_is_normal(f) && sign) ||
3632         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3633         s->float_exception_flags |= float_flag_invalid;
3634         return float64_default_nan(s);
3635     }
3636 
3637     /* frsqrt7(qNaN) = canonical NaN */
3638     if (float64_is_quiet_nan(f, s)) {
3639         return float64_default_nan(s);
3640     }
3641 
3642     /* frsqrt7(+-0) = +-inf */
3643     if (float64_is_zero(f)) {
3644         s->float_exception_flags |= float_flag_divbyzero;
3645         return float64_set_sign(float64_infinity, sign);
3646     }
3647 
3648     /* frsqrt7(+inf) = +0 */
3649     if (float64_is_infinity(f) && !sign) {
3650         return float64_set_sign(float64_zero, sign);
3651     }
3652 
3653     /* +normal, +subnormal */
3654     uint64_t val = frsqrt7(f, exp_size, frac_size);
3655     return make_float64(val);
3656 }
3657 
3658 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3659 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3660 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3661 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3662 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3663 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3664 
3665 /*
3666  * Vector Floating-Point Reciprocal Estimate Instruction
3667  *
3668  * Adapted from riscv-v-spec recip.c:
3669  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3670  */
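/*
 * frec7() also operates on the raw encoding: the top seven fraction bits
 * index the table and the result exponent is 2 * bias - 1 - exp.  Inputs
 * so small that the reciprocal overflows are turned into +/-inf or the
 * largest finite value of that sign depending on the rounding mode, and
 * results whose biased exponent would be 0 or -1 are returned as
 * subnormals by shifting the fraction (with its implicit leading one)
 * right.  NaNs, zeroes and infinities are handled by the wrappers below.
 */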
3671 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3672                       float_status *s)
3673 {
3674     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3675     uint64_t exp = extract64(f, frac_size, exp_size);
3676     uint64_t frac = extract64(f, 0, frac_size);
3677 
3678     const uint8_t lookup_table[] = {
3679         127, 125, 123, 121, 119, 117, 116, 114,
3680         112, 110, 109, 107, 105, 104, 102, 100,
3681         99, 97, 96, 94, 93, 91, 90, 88,
3682         87, 85, 84, 83, 81, 80, 79, 77,
3683         76, 75, 74, 72, 71, 70, 69, 68,
3684         66, 65, 64, 63, 62, 61, 60, 59,
3685         58, 57, 56, 55, 54, 53, 52, 51,
3686         50, 49, 48, 47, 46, 45, 44, 43,
3687         42, 41, 40, 40, 39, 38, 37, 36,
3688         35, 35, 34, 33, 32, 31, 31, 30,
3689         29, 28, 28, 27, 26, 25, 25, 24,
3690         23, 23, 22, 21, 21, 20, 19, 19,
3691         18, 17, 17, 16, 15, 15, 14, 14,
3692         13, 12, 12, 11, 11, 10, 9, 9,
3693         8, 8, 7, 7, 6, 5, 5, 4,
3694         4, 3, 3, 2, 2, 1, 1, 0
3695     };
3696     const int precision = 7;
3697 
3698     if (exp == 0 && frac != 0) { /* subnormal */
3699         /* Normalize the subnormal. */
3700         while (extract64(frac, frac_size - 1, 1) == 0) {
3701             exp--;
3702             frac <<= 1;
3703         }
3704 
3705         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3706 
3707         if (exp != 0 && exp != UINT64_MAX) {
3708             /*
3709              * Overflow to inf or max value of same sign,
3710              * depending on sign and rounding mode.
3711              */
3712             s->float_exception_flags |= (float_flag_inexact |
3713                                          float_flag_overflow);
3714 
3715             if ((s->float_rounding_mode == float_round_to_zero) ||
3716                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3717                 ((s->float_rounding_mode == float_round_up) && sign)) {
3718                 /* Return the largest-magnitude finite value of the same sign. */
3719                 return (sign << (exp_size + frac_size)) |
3720                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3721             } else {
3722                 /* Return +-inf. */
3723                 return (sign << (exp_size + frac_size)) |
3724                        MAKE_64BIT_MASK(frac_size, exp_size);
3725             }
3726         }
3727     }
3728 
3729     int idx = frac >> (frac_size - precision);
3730     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3731                         (frac_size - precision);
3732     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3733 
3734     if (out_exp == 0 || out_exp == UINT64_MAX) {
3735         /*
3736          * The result is subnormal, but don't raise the underflow exception,
3737          * because there's no additional loss of precision.
3738          */
3739         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3740         if (out_exp == UINT64_MAX) {
3741             out_frac >>= 1;
3742             out_exp = 0;
3743         }
3744     }
3745 
3746     uint64_t val = 0;
3747     val = deposit64(val, 0, frac_size, out_frac);
3748     val = deposit64(val, frac_size, exp_size, out_exp);
3749     val = deposit64(val, frac_size + exp_size, 1, sign);
3750     return val;
3751 }
3752 
3753 static float16 frec7_h(float16 f, float_status *s)
3754 {
3755     int exp_size = 5, frac_size = 10;
3756     bool sign = float16_is_neg(f);
3757 
3758     /* frec7(+-inf) = +-0 */
3759     if (float16_is_infinity(f)) {
3760         return float16_set_sign(float16_zero, sign);
3761     }
3762 
3763     /* frec7(+-0) = +-inf */
3764     if (float16_is_zero(f)) {
3765         s->float_exception_flags |= float_flag_divbyzero;
3766         return float16_set_sign(float16_infinity, sign);
3767     }
3768 
3769     /* frec7(sNaN) = canonical NaN */
3770     if (float16_is_signaling_nan(f, s)) {
3771         s->float_exception_flags |= float_flag_invalid;
3772         return float16_default_nan(s);
3773     }
3774 
3775     /* frec7(qNaN) = canonical NaN */
3776     if (float16_is_quiet_nan(f, s)) {
3777         return float16_default_nan(s);
3778     }
3779 
3780     /* +-normal, +-subnormal */
3781     uint64_t val = frec7(f, exp_size, frac_size, s);
3782     return make_float16(val);
3783 }
3784 
3785 static float32 frec7_s(float32 f, float_status *s)
3786 {
3787     int exp_size = 8, frac_size = 23;
3788     bool sign = float32_is_neg(f);
3789 
3790     /* frec7(+-inf) = +-0 */
3791     if (float32_is_infinity(f)) {
3792         return float32_set_sign(float32_zero, sign);
3793     }
3794 
3795     /* frec7(+-0) = +-inf */
3796     if (float32_is_zero(f)) {
3797         s->float_exception_flags |= float_flag_divbyzero;
3798         return float32_set_sign(float32_infinity, sign);
3799     }
3800 
3801     /* frec7(sNaN) = canonical NaN */
3802     if (float32_is_signaling_nan(f, s)) {
3803         s->float_exception_flags |= float_flag_invalid;
3804         return float32_default_nan(s);
3805     }
3806 
3807     /* frec7(qNaN) = canonical NaN */
3808     if (float32_is_quiet_nan(f, s)) {
3809         return float32_default_nan(s);
3810     }
3811 
3812     /* +-normal, +-subnormal */
3813     uint64_t val = frec7(f, exp_size, frac_size, s);
3814     return make_float32(val);
3815 }
3816 
3817 static float64 frec7_d(float64 f, float_status *s)
3818 {
3819     int exp_size = 11, frac_size = 52;
3820     bool sign = float64_is_neg(f);
3821 
3822     /* frec7(+-inf) = +-0 */
3823     if (float64_is_infinity(f)) {
3824         return float64_set_sign(float64_zero, sign);
3825     }
3826 
3827     /* frec7(+-0) = +-inf */
3828     if (float64_is_zero(f)) {
3829         s->float_exception_flags |= float_flag_divbyzero;
3830         return float64_set_sign(float64_infinity, sign);
3831     }
3832 
3833     /* frec7(sNaN) = canonical NaN */
3834     if (float64_is_signaling_nan(f, s)) {
3835         s->float_exception_flags |= float_flag_invalid;
3836         return float64_default_nan(s);
3837     }
3838 
3839     /* frec7(qNaN) = canonical NaN */
3840     if (float64_is_quiet_nan(f, s)) {
3841         return float64_default_nan(s);
3842     }
3843 
3844     /* +-normal, +-subnormal */
3845     uint64_t val = frec7(f, exp_size, frac_size, s);
3846     return make_float64(val);
3847 }
3848 
3849 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3850 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3851 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3852 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
3853 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
3854 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
3855 
3856 /* Vector Floating-Point MIN/MAX Instructions */
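/*
 * vfmin/vfmax use the float*_minimum_number and float*_maximum_number
 * operations, i.e. IEEE 754-2019 minimumNumber/maximumNumber semantics:
 * if exactly one operand is a NaN the other operand is returned, and
 * signaling NaNs raise the invalid flag.
 */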
3857 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3858 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3859 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3860 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
3861 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
3862 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
3863 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3864 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3865 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3866 GEN_VEXT_VF(vfmin_vf_h, 2)
3867 GEN_VEXT_VF(vfmin_vf_w, 4)
3868 GEN_VEXT_VF(vfmin_vf_d, 8)
3869 
3870 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3871 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3872 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3873 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
3874 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
3875 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
3876 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3877 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3878 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3879 GEN_VEXT_VF(vfmax_vf_h, 2)
3880 GEN_VEXT_VF(vfmax_vf_w, 4)
3881 GEN_VEXT_VF(vfmax_vf_d, 8)
3882 
3883 /* Vector Floating-Point Sign-Injection Instructions */
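/*
 * Sign injection is pure bit manipulation and raises no floating-point
 * flags: deposit64() keeps the sign bit of its first argument and replaces
 * the exponent and fraction with those of a, so the result takes its
 * magnitude from vs2 and its sign from vs1 (vfsgnj), the inverted sign of
 * vs1 (vfsgnjn), or the XOR of the two signs (vfsgnjx).
 */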
3884 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3885 {
3886     return deposit64(b, 0, 15, a);
3887 }
3888 
3889 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3890 {
3891     return deposit64(b, 0, 31, a);
3892 }
3893 
3894 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3895 {
3896     return deposit64(b, 0, 63, a);
3897 }
3898 
3899 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3900 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3901 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3902 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
3903 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
3904 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
3905 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3906 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3907 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3908 GEN_VEXT_VF(vfsgnj_vf_h, 2)
3909 GEN_VEXT_VF(vfsgnj_vf_w, 4)
3910 GEN_VEXT_VF(vfsgnj_vf_d, 8)
3911 
3912 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3913 {
3914     return deposit64(~b, 0, 15, a);
3915 }
3916 
3917 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3918 {
3919     return deposit64(~b, 0, 31, a);
3920 }
3921 
3922 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3923 {
3924     return deposit64(~b, 0, 63, a);
3925 }
3926 
3927 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
3928 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
3929 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
3930 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
3931 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
3932 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
3933 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
3934 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
3935 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
3936 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
3937 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
3938 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
3939 
3940 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
3941 {
3942     return deposit64(b ^ a, 0, 15, a);
3943 }
3944 
3945 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
3946 {
3947     return deposit64(b ^ a, 0, 31, a);
3948 }
3949 
3950 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
3951 {
3952     return deposit64(b ^ a, 0, 63, a);
3953 }
3954 
3955 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
3956 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
3957 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
3958 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
3959 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
3960 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
3961 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
3962 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
3963 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
3964 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
3965 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
3966 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
3967 
3968 /* Vector Floating-Point Compare Instructions */
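/*
 * The compare helpers produce a mask register, one result bit per element.
 * Masked-off bits are set to 1 only when the mask policy is agnostic (vma),
 * and since mask destinations are always tail-agnostic, the tail is filled
 * with 1s up to VLEN whenever vta_all_1s is set.  vmfeq/vmfne use quiet
 * comparisons while vmflt/vmfle use signaling ones, as with the scalar
 * feq/flt/fle instructions.
 */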
3969 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
3970 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
3971                   CPURISCVState *env, uint32_t desc)          \
3972 {                                                             \
3973     uint32_t vm = vext_vm(desc);                              \
3974     uint32_t vl = env->vl;                                    \
3975     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
3976     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
3977     uint32_t vma = vext_vma(desc);                            \
3978     uint32_t i;                                               \
3979                                                               \
3980     for (i = env->vstart; i < vl; i++) {                      \
3981         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
3982         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
3983         if (!vm && !vext_elem_mask(v0, i)) {                  \
3984             /* set masked-off elements to 1s */               \
3985             if (vma) {                                        \
3986                 vext_set_elem_mask(vd, i, 1);                 \
3987             }                                                 \
3988             continue;                                         \
3989         }                                                     \
3990         vext_set_elem_mask(vd, i,                             \
3991                            DO_OP(s2, s1, &env->fp_status));   \
3992     }                                                         \
3993     env->vstart = 0;                                          \
3994     /*
3995      * The mask destination register is always tail-agnostic;
3996      * set tail elements to 1s.
3997      */                                                       \
3998     if (vta_all_1s) {                                         \
3999         for (; i < total_elems; i++) {                        \
4000             vext_set_elem_mask(vd, i, 1);                     \
4001         }                                                     \
4002     }                                                         \
4003 }
4004 
4005 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4006 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4007 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4008 
4009 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4010 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4011                   CPURISCVState *env, uint32_t desc)                \
4012 {                                                                   \
4013     uint32_t vm = vext_vm(desc);                                    \
4014     uint32_t vl = env->vl;                                          \
4015     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4016     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4017     uint32_t vma = vext_vma(desc);                                  \
4018     uint32_t i;                                                     \
4019                                                                     \
4020     for (i = env->vstart; i < vl; i++) {                            \
4021         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4022         if (!vm && !vext_elem_mask(v0, i)) {                        \
4023             /* set masked-off elements to 1s */                     \
4024             if (vma) {                                              \
4025                 vext_set_elem_mask(vd, i, 1);                       \
4026             }                                                       \
4027             continue;                                               \
4028         }                                                           \
4029         vext_set_elem_mask(vd, i,                                   \
4030                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4031     }                                                               \
4032     env->vstart = 0;                                                \
4033     /*
4034      * mask destination register is always tail-agnostic
4035      * set tail elements to 1s
4036      */                                                             \
4037     if (vta_all_1s) {                                               \
4038         for (; i < total_elems; i++) {                              \
4039             vext_set_elem_mask(vd, i, 1);                           \
4040         }                                                           \
4041     }                                                               \
4042 }
4043 
4044 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4045 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4046 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4047 
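/*
 * Illustrative, standalone sketch (not part of the original source): a
 * vector FP compare writes one mask *bit* per element.  Masked-off elements
 * follow the mask-agnostic policy (optionally forced to 1), and the tail of
 * the mask destination is always treated as agnostic and written with 1s.
 * This sketch packs the result for up to 64 elements into one uint64_t; the
 * example_* name is hypothetical and only <stdint.h> is assumed.
 */
#include <stdint.h>

static uint64_t example_cmp_eq_to_mask(const float *a, const float *b,
                                       uint64_t v0, int vm, int vma,
                                       uint32_t vl, uint32_t total)
{
    uint64_t vd = 0;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (!vm && !((v0 >> i) & 1)) {
            if (vma) {
                vd |= UINT64_C(1) << i;     /* masked-off element -> 1 */
            }
            continue;
        }
        if (a[i] == b[i]) {                 /* quiet compare: NaN gives 0 */
            vd |= UINT64_C(1) << i;
        }
    }
    for (i = vl; i < total; i++) {
        vd |= UINT64_C(1) << i;             /* tail bits -> 1 */
    }
    return vd;
}
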
4048 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4049 {
4050     FloatRelation compare = float16_compare_quiet(a, b, s);
4051     return compare != float_relation_equal;
4052 }
4053 
4054 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4055 {
4056     FloatRelation compare = float32_compare_quiet(a, b, s);
4057     return compare != float_relation_equal;
4058 }
4059 
4060 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4061 {
4062     FloatRelation compare = float64_compare_quiet(a, b, s);
4063     return compare != float_relation_equal;
4064 }
4065 
4066 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4067 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4068 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4069 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4070 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4071 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4072 
4073 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4074 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4075 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4076 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4077 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4078 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4079 
4080 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4081 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4082 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4083 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4084 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4085 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4086 
4087 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4088 {
4089     FloatRelation compare = float16_compare(a, b, s);
4090     return compare == float_relation_greater;
4091 }
4092 
4093 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4094 {
4095     FloatRelation compare = float32_compare(a, b, s);
4096     return compare == float_relation_greater;
4097 }
4098 
4099 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4100 {
4101     FloatRelation compare = float64_compare(a, b, s);
4102     return compare == float_relation_greater;
4103 }
4104 
4105 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4106 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4107 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4108 
4109 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4110 {
4111     FloatRelation compare = float16_compare(a, b, s);
4112     return compare == float_relation_greater ||
4113            compare == float_relation_equal;
4114 }
4115 
4116 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4117 {
4118     FloatRelation compare = float32_compare(a, b, s);
4119     return compare == float_relation_greater ||
4120            compare == float_relation_equal;
4121 }
4122 
4123 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4124 {
4125     FloatRelation compare = float64_compare(a, b, s);
4126     return compare == float_relation_greater ||
4127            compare == float_relation_equal;
4128 }
4129 
4130 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4131 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4132 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4133 
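/*
 * Illustrative, standalone sketch (not part of the original source):
 * softfloat exposes no direct "greater" predicate, so vmfgt/vmfge above are
 * derived from a full four-way compare.  Any NaN operand makes the relation
 * unordered and the predicate false.  The example_* names are hypothetical
 * and use plain C doubles rather than softfloat.
 */
typedef enum {
    EXAMPLE_REL_LESS,
    EXAMPLE_REL_EQUAL,
    EXAMPLE_REL_GREATER,
    EXAMPLE_REL_UNORDERED,
} ExampleRelation;

static ExampleRelation example_compare(double a, double b)
{
    if (a != a || b != b) {                 /* NaN on either side */
        return EXAMPLE_REL_UNORDERED;
    }
    return a < b ? EXAMPLE_REL_LESS :
           a > b ? EXAMPLE_REL_GREATER : EXAMPLE_REL_EQUAL;
}

static int example_fgt(double a, double b)
{
    return example_compare(a, b) == EXAMPLE_REL_GREATER;
}

static int example_fge(double a, double b)
{
    ExampleRelation r = example_compare(a, b);
    return r == EXAMPLE_REL_GREATER || r == EXAMPLE_REL_EQUAL;
}
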
4134 /* Vector Floating-Point Classify Instruction */
4135 target_ulong fclass_h(uint64_t frs1)
4136 {
4137     float16 f = frs1;
4138     bool sign = float16_is_neg(f);
4139 
4140     if (float16_is_infinity(f)) {
4141         return sign ? 1 << 0 : 1 << 7;
4142     } else if (float16_is_zero(f)) {
4143         return sign ? 1 << 3 : 1 << 4;
4144     } else if (float16_is_zero_or_denormal(f)) {
4145         return sign ? 1 << 2 : 1 << 5;
4146     } else if (float16_is_any_nan(f)) {
4147         float_status s = { }; /* for snan_bit_is_one */
4148         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4149     } else {
4150         return sign ? 1 << 1 : 1 << 6;
4151     }
4152 }
4153 
4154 target_ulong fclass_s(uint64_t frs1)
4155 {
4156     float32 f = frs1;
4157     bool sign = float32_is_neg(f);
4158 
4159     if (float32_is_infinity(f)) {
4160         return sign ? 1 << 0 : 1 << 7;
4161     } else if (float32_is_zero(f)) {
4162         return sign ? 1 << 3 : 1 << 4;
4163     } else if (float32_is_zero_or_denormal(f)) {
4164         return sign ? 1 << 2 : 1 << 5;
4165     } else if (float32_is_any_nan(f)) {
4166         float_status s = { }; /* for snan_bit_is_one */
4167         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4168     } else {
4169         return sign ? 1 << 1 : 1 << 6;
4170     }
4171 }
4172 
4173 target_ulong fclass_d(uint64_t frs1)
4174 {
4175     float64 f = frs1;
4176     bool sign = float64_is_neg(f);
4177 
4178     if (float64_is_infinity(f)) {
4179         return sign ? 1 << 0 : 1 << 7;
4180     } else if (float64_is_zero(f)) {
4181         return sign ? 1 << 3 : 1 << 4;
4182     } else if (float64_is_zero_or_denormal(f)) {
4183         return sign ? 1 << 2 : 1 << 5;
4184     } else if (float64_is_any_nan(f)) {
4185         float_status s = { }; /* for snan_bit_is_one */
4186         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4187     } else {
4188         return sign ? 1 << 1 : 1 << 6;
4189     }
4190 }
4191 
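/*
 * Illustrative, standalone sketch (not part of the original source) of the
 * 10-bit class encoding returned above, rederived directly from the binary32
 * bit pattern: bit 0 -inf, 1 negative normal, 2 negative subnormal, 3 -0,
 * 4 +0, 5 positive subnormal, 6 positive normal, 7 +inf, 8 signaling NaN,
 * 9 quiet NaN.  example_fclass32 is hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static uint32_t example_fclass32(uint32_t f)
{
    uint32_t sign = f >> 31;
    uint32_t exp = (f >> 23) & 0xff;
    uint32_t frac = f & 0x7fffff;

    if (exp == 0xff) {
        if (frac == 0) {
            return sign ? 1 << 0 : 1 << 7;          /* infinity */
        }
        return (frac & 0x400000) ? 1 << 9 : 1 << 8; /* quiet vs signaling NaN */
    }
    if (exp == 0) {
        if (frac == 0) {
            return sign ? 1 << 3 : 1 << 4;          /* zero */
        }
        return sign ? 1 << 2 : 1 << 5;              /* subnormal */
    }
    return sign ? 1 << 1 : 1 << 6;                  /* normal */
}
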
4192 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4193 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4194 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4195 GEN_VEXT_V(vfclass_v_h, 2)
4196 GEN_VEXT_V(vfclass_v_w, 4)
4197 GEN_VEXT_V(vfclass_v_d, 8)
4198 
4199 /* Vector Floating-Point Merge Instruction */
4200 
4201 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4202 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4203                   CPURISCVState *env, uint32_t desc)          \
4204 {                                                             \
4205     uint32_t vm = vext_vm(desc);                              \
4206     uint32_t vl = env->vl;                                    \
4207     uint32_t esz = sizeof(ETYPE);                             \
4208     uint32_t total_elems =                                    \
4209         vext_get_total_elems(env, desc, esz);                 \
4210     uint32_t vta = vext_vta(desc);                            \
4211     uint32_t i;                                               \
4212                                                               \
4213     for (i = env->vstart; i < vl; i++) {                      \
4214         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4215         *((ETYPE *)vd + H(i)) =                               \
4216             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4217     }                                                         \
4218     env->vstart = 0;                                          \
4219     /* set tail elements to 1s */                             \
4220     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4221 }
4222 
4223 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4224 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4225 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4226 
4227 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4228 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4229 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4230 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4231 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4232 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4233 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4234 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4235 
4236 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4237 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4238 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4239 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4240 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4241 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4242 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4243 
4244 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4245 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4246 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4247 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4248 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4249 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4250 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4251 
4252 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4253 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4254 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4255 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4256 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4257 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4258 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4259 
4260 /* Widening Floating-Point/Integer Type-Convert Instructions */
4261 /* (TD, T2, TX2) */
4262 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4263 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4264 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4265 /*
4266  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4267  */
4268 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4269 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4270 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4271 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4272 
4273 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4274 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4275 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4276 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4277 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4278 
4279 /*
4280  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4281  */
4282 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4283 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4284 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4285 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4286 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4287 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4288 
4289 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4290 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4291 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4292 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4293 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4294 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4295 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4296 
4297 /*
4298  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4299  */
4300 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4301 {
4302     return float16_to_float32(a, true, s);
4303 }
4304 
4305 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4306 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4307 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4308 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4309 
4310 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4311 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4312 
4313 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4314 /* (TD, T2, TX2) */
4315 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4316 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4317 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4318 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4319 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4320 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4321 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4322 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4323 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4324 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4325 
4326 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4327 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4328 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4329 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4330 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4331 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4332 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4333 
4334 /*
4335  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4336  */
4337 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4338 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4339 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4340 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4341 
4342 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4343 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4344 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4345 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4346 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4347 
4348 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4349 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4350 {
4351     return float32_to_float16(a, true, s);
4352 }
4353 
4354 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4355 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4356 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4357 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4358 
4359 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4360 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4361 
4362 /*
4363  * Vector Reduction Operations
4364  */
4365 /* Vector Single-Width Integer Reduction Instructions */
4366 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4367 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4368                   void *vs2, CPURISCVState *env,          \
4369                   uint32_t desc)                          \
4370 {                                                         \
4371     uint32_t vm = vext_vm(desc);                          \
4372     uint32_t vl = env->vl;                                \
4373     uint32_t esz = sizeof(TD);                            \
4374     uint32_t vlenb = simd_maxsz(desc);                    \
4375     uint32_t vta = vext_vta(desc);                        \
4376     uint32_t i;                                           \
4377     TD s1 =  *((TD *)vs1 + HD(0));                        \
4378                                                           \
4379     for (i = env->vstart; i < vl; i++) {                  \
4380         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4381         if (!vm && !vext_elem_mask(v0, i)) {              \
4382             continue;                                     \
4383         }                                                 \
4384         s1 = OP(s1, (TD)s2);                              \
4385     }                                                     \
4386     *((TD *)vd + HD(0)) = s1;                             \
4387     env->vstart = 0;                                      \
4388     /* set tail elements to 1s */                         \
4389     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4390 }
4391 
4392 /* vd[0] = sum(vs1[0], vs2[*]) */
4393 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4394 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4395 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4396 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4397 
4398 /* vd[0] = maxu(vs1[0], vs2[*]) */
4399 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4400 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4401 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4402 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4403 
4404 /* vd[0] = max(vs1[0], vs2[*]) */
4405 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4406 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4407 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4408 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4409 
4410 /* vd[0] = minu(vs1[0], vs2[*]) */
4411 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4412 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4413 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4414 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4415 
4416 /* vd[0] = min(vs1[0], vs2[*]) */
4417 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4418 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4419 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4420 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4421 
4422 /* vd[0] = and(vs1[0], vs2[*]) */
4423 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4424 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4425 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4426 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4427 
4428 /* vd[0] = or(vs1[0], vs2[*]) */
4429 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4430 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4431 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4432 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4433 
4434 /* vd[0] = xor(vs1[0], vs2[*]) */
4435 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4436 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4437 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4438 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4439 
4440 /* Vector Widening Integer Reduction Instructions */
4441 /* Signed sum reduction into double-width accumulator */
4442 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4443 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4444 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4445 
4446 /* Unsigned sum reduction into double-width accumulator */
4447 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4448 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4449 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4450 
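/*
 * Illustrative, standalone sketch (not part of the original source): a
 * masked sum reduction.  The scalar accumulator starts from vs1[0], only
 * active elements of vs2 contribute, and a single result is written to
 * vd[0].  For the widening forms the accumulator type is simply twice as
 * wide as the element type (e.g. int16_t elements into an int32_t
 * accumulator).  example_redsum is hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static void example_redsum(int32_t *vd, const int32_t *vs1,
                           const int32_t *vs2, const uint8_t *mask,
                           int vm, uint32_t vl)
{
    int32_t acc = vs1[0];

    for (uint32_t i = 0; i < vl; i++) {
        if (!vm && !mask[i]) {
            continue;                 /* inactive element: no contribution */
        }
        acc += vs2[i];
    }
    vd[0] = acc;                      /* only element 0 of vd is written */
}
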
4451 /* Vector Single-Width Floating-Point Reduction Instructions */
4452 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4453 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4454                   void *vs2, CPURISCVState *env,           \
4455                   uint32_t desc)                           \
4456 {                                                          \
4457     uint32_t vm = vext_vm(desc);                           \
4458     uint32_t vl = env->vl;                                 \
4459     uint32_t esz = sizeof(TD);                             \
4460     uint32_t vlenb = simd_maxsz(desc);                     \
4461     uint32_t vta = vext_vta(desc);                         \
4462     uint32_t i;                                            \
4463     TD s1 =  *((TD *)vs1 + HD(0));                         \
4464                                                            \
4465     for (i = env->vstart; i < vl; i++) {                   \
4466         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4467         if (!vm && !vext_elem_mask(v0, i)) {               \
4468             continue;                                      \
4469         }                                                  \
4470         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4471     }                                                      \
4472     *((TD *)vd + HD(0)) = s1;                              \
4473     env->vstart = 0;                                       \
4474     /* set tail elements to 1s */                          \
4475     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4476 }
4477 
4478 /* Unordered sum */
4479 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4480 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4481 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4482 
4483 /* Ordered sum */
4484 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4485 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4486 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4487 
4488 /* Maximum value */
4489 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4490               float16_maximum_number)
4491 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4492               float32_maximum_number)
4493 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4494               float64_maximum_number)
4495 
4496 /* Minimum value */
4497 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4498               float16_minimum_number)
4499 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4500               float32_minimum_number)
4501 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4502               float64_minimum_number)
4503 
4504 /* Vector Widening Floating-Point Add Instructions */
4505 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4506 {
4507     return float32_add(a, float16_to_float32(b, true, s), s);
4508 }
4509 
4510 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4511 {
4512     return float64_add(a, float32_to_float64(b, s), s);
4513 }
4514 
4515 /* Vector Widening Floating-Point Reduction Instructions */
4516 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4517 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4518 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4519 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4520 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4521 
4522 /*
4523  * Vector Mask Operations
4524  */
4525 /* Vector Mask-Register Logical Instructions */
4526 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4527 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4528                   void *vs2, CPURISCVState *env,          \
4529                   uint32_t desc)                          \
4530 {                                                         \
4531     uint32_t vl = env->vl;                                \
4532     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4533     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4534     uint32_t i;                                           \
4535     int a, b;                                             \
4536                                                           \
4537     for (i = env->vstart; i < vl; i++) {                  \
4538         a = vext_elem_mask(vs1, i);                       \
4539         b = vext_elem_mask(vs2, i);                       \
4540         vext_set_elem_mask(vd, i, OP(b, a));              \
4541     }                                                     \
4542     env->vstart = 0;                                      \
4543     /*
4544      * mask destination register is always tail-agnostic
4545      * set tail elements to 1s
4546      */                                                   \
4547     if (vta_all_1s) {                                     \
4548         for (; i < total_elems; i++) {                    \
4549             vext_set_elem_mask(vd, i, 1);                 \
4550         }                                                 \
4551     }                                                     \
4552 }
4553 
4554 #define DO_NAND(N, M)  (!(N & M))
4555 #define DO_ANDNOT(N, M)  (N & !M)
4556 #define DO_NOR(N, M)  (!(N | M))
4557 #define DO_ORNOT(N, M)  (N | !M)
4558 #define DO_XNOR(N, M)  (!(N ^ M))
4559 
4560 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4561 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4562 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4563 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4564 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4565 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4566 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4567 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4568 
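/*
 * Illustrative, standalone sketch (not part of the original source): the
 * helpers above operate on one mask bit (0 or 1) at a time, which is why the
 * DO_* macros use logical (!) rather than bitwise (~) negation.  On a packed
 * 64-bit chunk of a mask register the same operations are plain bitwise
 * logic.  The example_* names are hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static uint64_t example_mandn(uint64_t s2, uint64_t s1) { return s2 & ~s1; }
static uint64_t example_mnand(uint64_t s2, uint64_t s1) { return ~(s2 & s1); }
static uint64_t example_mnor(uint64_t s2, uint64_t s1)  { return ~(s2 | s1); }
static uint64_t example_morn(uint64_t s2, uint64_t s1)  { return s2 | ~s1; }
static uint64_t example_mxnor(uint64_t s2, uint64_t s1) { return ~(s2 ^ s1); }
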
4569 /* Vector count population in mask (vcpop.m) */
4570 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4571                              uint32_t desc)
4572 {
4573     target_ulong cnt = 0;
4574     uint32_t vm = vext_vm(desc);
4575     uint32_t vl = env->vl;
4576     int i;
4577 
4578     for (i = env->vstart; i < vl; i++) {
4579         if (vm || vext_elem_mask(v0, i)) {
4580             if (vext_elem_mask(vs2, i)) {
4581                 cnt++;
4582             }
4583         }
4584     }
4585     env->vstart = 0;
4586     return cnt;
4587 }
4588 
4589 /* vfirst find-first-set mask bit */
4590 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4591                               uint32_t desc)
4592 {
4593     uint32_t vm = vext_vm(desc);
4594     uint32_t vl = env->vl;
4595     int i;
4596 
4597     for (i = env->vstart; i < vl; i++) {
4598         if (vm || vext_elem_mask(v0, i)) {
4599             if (vext_elem_mask(vs2, i)) {
4600                 return i;
4601             }
4602         }
4603     }
4604     env->vstart = 0;
4605     return -1LL;
4606 }
4607 
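/*
 * Illustrative, standalone sketch (not part of the original source) of the
 * two scalar-producing mask queries above: vcpop.m counts the set bits of
 * vs2 among active elements, vfirst.m returns the index of the lowest such
 * bit or -1 when there is none.  The example_* names are hypothetical and
 * use one byte per mask bit; only <stdint.h> is assumed.
 */
#include <stdint.h>

static int64_t example_vcpop(const uint8_t *vs2, const uint8_t *v0,
                             int vm, uint32_t vl)
{
    int64_t cnt = 0;

    for (uint32_t i = 0; i < vl; i++) {
        if ((vm || v0[i]) && vs2[i]) {
            cnt++;
        }
    }
    return cnt;
}

static int64_t example_vfirst(const uint8_t *vs2, const uint8_t *v0,
                              int vm, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        if ((vm || v0[i]) && vs2[i]) {
            return i;
        }
    }
    return -1;
}
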
4608 enum set_mask_type {
4609     ONLY_FIRST = 1,
4610     INCLUDE_FIRST,
4611     BEFORE_FIRST,
4612 };
4613 
4614 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4615                    uint32_t desc, enum set_mask_type type)
4616 {
4617     uint32_t vm = vext_vm(desc);
4618     uint32_t vl = env->vl;
4619     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4620     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4621     uint32_t vma = vext_vma(desc);
4622     int i;
4623     bool first_mask_bit = false;
4624 
4625     for (i = env->vstart; i < vl; i++) {
4626         if (!vm && !vext_elem_mask(v0, i)) {
4627             /* set masked-off elements to 1s */
4628             if (vma) {
4629                 vext_set_elem_mask(vd, i, 1);
4630             }
4631             continue;
4632         }
4633         /* write a zero to all following active elements */
4634         if (first_mask_bit) {
4635             vext_set_elem_mask(vd, i, 0);
4636             continue;
4637         }
4638         if (vext_elem_mask(vs2, i)) {
4639             first_mask_bit = true;
4640             if (type == BEFORE_FIRST) {
4641                 vext_set_elem_mask(vd, i, 0);
4642             } else {
4643                 vext_set_elem_mask(vd, i, 1);
4644             }
4645         } else {
4646             if (type == ONLY_FIRST) {
4647                 vext_set_elem_mask(vd, i, 0);
4648             } else {
4649                 vext_set_elem_mask(vd, i, 1);
4650             }
4651         }
4652     }
4653     env->vstart = 0;
4654     /*
4655      * mask destination register is always tail-agnostic
4656      * set tail elements to 1s
4657      */
4658     if (vta_all_1s) {
4659         for (; i < total_elems; i++) {
4660             vext_set_elem_mask(vd, i, 1);
4661         }
4662     }
4663 }
4664 
4665 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4666                      uint32_t desc)
4667 {
4668     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4669 }
4670 
4671 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4672                      uint32_t desc)
4673 {
4674     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4675 }
4676 
4677 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4678                      uint32_t desc)
4679 {
4680     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4681 }
4682 
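/*
 * Illustrative, standalone sketch (not part of the original source): on a
 * single 64-bit mask word with every element active (vl = 64), the three
 * set-before/including/only-first operations above reduce to classic bit
 * tricks around the lowest set bit x & -x.  The example_* names are
 * hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static uint64_t example_msbf(uint64_t x)  /* 1s strictly before first set bit */
{
    return x ? (x & -x) - 1 : ~(uint64_t)0;
}

static uint64_t example_msif(uint64_t x)  /* 1s up to and including it */
{
    return x ? ((x & -x) << 1) - 1 : ~(uint64_t)0;
}

static uint64_t example_msof(uint64_t x)  /* only the first set bit itself */
{
    return x & -x;
}
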
4683 /* Vector Iota Instruction */
4684 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4685 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4686                   uint32_t desc)                                          \
4687 {                                                                         \
4688     uint32_t vm = vext_vm(desc);                                          \
4689     uint32_t vl = env->vl;                                                \
4690     uint32_t esz = sizeof(ETYPE);                                         \
4691     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4692     uint32_t vta = vext_vta(desc);                                        \
4693     uint32_t vma = vext_vma(desc);                                        \
4694     uint32_t sum = 0;                                                     \
4695     int i;                                                                \
4696                                                                           \
4697     for (i = env->vstart; i < vl; i++) {                                  \
4698         if (!vm && !vext_elem_mask(v0, i)) {                              \
4699             /* set masked-off elements to 1s */                           \
4700             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4701             continue;                                                     \
4702         }                                                                 \
4703         *((ETYPE *)vd + H(i)) = sum;                                      \
4704         if (vext_elem_mask(vs2, i)) {                                     \
4705             sum++;                                                        \
4706         }                                                                 \
4707     }                                                                     \
4708     env->vstart = 0;                                                      \
4709     /* set tail elements to 1s */                                         \
4710     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4711 }
4712 
4713 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4714 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4715 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4716 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4717 
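/*
 * Illustrative, standalone sketch (not part of the original source):
 * viota.m is an exclusive prefix count of the source mask, i.e. element i
 * receives the number of set mask bits strictly below i.  Masking is
 * omitted here (all elements assumed active); example_viota is hypothetical
 * and only <stdint.h> is assumed.
 */
#include <stdint.h>

static void example_viota(uint32_t *vd, const uint8_t *vs2, uint32_t vl)
{
    uint32_t sum = 0;

    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = sum;              /* count of set bits in vs2[0..i-1] */
        if (vs2[i]) {
            sum++;
        }
    }
}
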
4718 /* Vector Element Index Instruction */
4719 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4720 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4721 {                                                                         \
4722     uint32_t vm = vext_vm(desc);                                          \
4723     uint32_t vl = env->vl;                                                \
4724     uint32_t esz = sizeof(ETYPE);                                         \
4725     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4726     uint32_t vta = vext_vta(desc);                                        \
4727     uint32_t vma = vext_vma(desc);                                        \
4728     int i;                                                                \
4729                                                                           \
4730     for (i = env->vstart; i < vl; i++) {                                  \
4731         if (!vm && !vext_elem_mask(v0, i)) {                              \
4732             /* set masked-off elements to 1s */                           \
4733             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4734             continue;                                                     \
4735         }                                                                 \
4736         *((ETYPE *)vd + H(i)) = i;                                        \
4737     }                                                                     \
4738     env->vstart = 0;                                                      \
4739     /* set tail elements to 1s */                                         \
4740     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4741 }
4742 
4743 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4744 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4745 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4746 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4747 
4748 /*
4749  * Vector Permutation Instructions
4750  */
4751 
4752 /* Vector Slide Instructions */
4753 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4754 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4755                   CPURISCVState *env, uint32_t desc)                      \
4756 {                                                                         \
4757     uint32_t vm = vext_vm(desc);                                          \
4758     uint32_t vl = env->vl;                                                \
4759     uint32_t esz = sizeof(ETYPE);                                         \
4760     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4761     uint32_t vta = vext_vta(desc);                                        \
4762     uint32_t vma = vext_vma(desc);                                        \
4763     target_ulong offset = s1, i_min, i;                                   \
4764                                                                           \
4765     i_min = MAX(env->vstart, offset);                                     \
4766     for (i = i_min; i < vl; i++) {                                        \
4767         if (!vm && !vext_elem_mask(v0, i)) {                              \
4768             /* set masked-off elements to 1s */                           \
4769             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4770             continue;                                                     \
4771         }                                                                 \
4772         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4773     }                                                                     \
4774     /* set tail elements to 1s */                                         \
4775     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4776 }
4777 
4778 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4779 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4780 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4781 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4782 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4783 
4784 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4785 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4786                   CPURISCVState *env, uint32_t desc)                      \
4787 {                                                                         \
4788     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4789     uint32_t vm = vext_vm(desc);                                          \
4790     uint32_t vl = env->vl;                                                \
4791     uint32_t esz = sizeof(ETYPE);                                         \
4792     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4793     uint32_t vta = vext_vta(desc);                                        \
4794     uint32_t vma = vext_vma(desc);                                        \
4795     target_ulong i_max, i_min, i;                                         \
4796                                                                           \
4797     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
4798     i_max = MAX(i_min, env->vstart);                                      \
4799     for (i = env->vstart; i < i_max; ++i) {                               \
4800         if (!vm && !vext_elem_mask(v0, i)) {                              \
4801             /* set masked-off elements to 1s */                           \
4802             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4803             continue;                                                     \
4804         }                                                                 \
4805         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
4806     }                                                                     \
4807                                                                           \
4808     for (i = i_max; i < vl; ++i) {                                        \
4809         if (vm || vext_elem_mask(v0, i)) {                                \
4810             *((ETYPE *)vd + H(i)) = 0;                                    \
4811         }                                                                 \
4812     }                                                                     \
4813                                                                           \
4814     env->vstart = 0;                                                      \
4815     /* set tail elements to 1s */                                         \
4816     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4817 }
4818 
4819 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4820 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4821 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4822 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4823 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4824 
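/*
 * Illustrative, standalone sketch (not part of the original source) of the
 * index mapping of the two slide forms: slideup writes vd[i] = vs2[i - off]
 * for i >= off and leaves vd[0..off-1] untouched, while slidedown writes
 * vd[i] = vs2[i + off] and zeroes elements that would read past the source.
 * The example_* names are hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static void example_slideup(uint32_t *vd, const uint32_t *vs2,
                            uint32_t off, uint32_t vl)
{
    for (uint32_t i = off; i < vl; i++) {
        vd[i] = vs2[i - off];
    }
}

static void example_slidedown(uint32_t *vd, const uint32_t *vs2,
                              uint32_t off, uint32_t vl, uint32_t vlmax)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = ((uint64_t)i + off < vlmax) ? vs2[i + off] : 0;
    }
}
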
4825 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
4826 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4827                                  void *vs2, CPURISCVState *env,             \
4828                                  uint32_t desc)                             \
4829 {                                                                           \
4830     typedef uint##BITWIDTH##_t ETYPE;                                       \
4831     uint32_t vm = vext_vm(desc);                                            \
4832     uint32_t vl = env->vl;                                                  \
4833     uint32_t esz = sizeof(ETYPE);                                           \
4834     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
4835     uint32_t vta = vext_vta(desc);                                          \
4836     uint32_t vma = vext_vma(desc);                                          \
4837     uint32_t i;                                                             \
4838                                                                             \
4839     for (i = env->vstart; i < vl; i++) {                                    \
4840         if (!vm && !vext_elem_mask(v0, i)) {                                \
4841             /* set masked-off elements to 1s */                             \
4842             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
4843             continue;                                                       \
4844         }                                                                   \
4845         if (i == 0) {                                                       \
4846             *((ETYPE *)vd + H(i)) = s1;                                     \
4847         } else {                                                            \
4848             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4849         }                                                                   \
4850     }                                                                       \
4851     env->vstart = 0;                                                        \
4852     /* set tail elements to 1s */                                           \
4853     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
4854 }
4855 
4856 GEN_VEXT_VSLIDE1UP(8,  H1)
4857 GEN_VEXT_VSLIDE1UP(16, H2)
4858 GEN_VEXT_VSLIDE1UP(32, H4)
4859 GEN_VEXT_VSLIDE1UP(64, H8)
4860 
4861 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4862 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4863                   CPURISCVState *env, uint32_t desc)              \
4864 {                                                                 \
4865     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4866 }
4867 
4868 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4869 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4870 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4871 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4872 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4873 
4874 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4875 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4876                                    void *vs2, CPURISCVState *env,             \
4877                                    uint32_t desc)                             \
4878 {                                                                             \
4879     typedef uint##BITWIDTH##_t ETYPE;                                         \
4880     uint32_t vm = vext_vm(desc);                                              \
4881     uint32_t vl = env->vl;                                                    \
4882     uint32_t esz = sizeof(ETYPE);                                             \
4883     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
4884     uint32_t vta = vext_vta(desc);                                            \
4885     uint32_t vma = vext_vma(desc);                                            \
4886     uint32_t i;                                                               \
4887                                                                               \
4888     for (i = env->vstart; i < vl; i++) {                                      \
4889         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4890             /* set masked-off elements to 1s */                               \
4891             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
4892             continue;                                                         \
4893         }                                                                     \
4894         if (i == vl - 1) {                                                    \
4895             *((ETYPE *)vd + H(i)) = s1;                                       \
4896         } else {                                                              \
4897             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4898         }                                                                     \
4899     }                                                                         \
4900     env->vstart = 0;                                                          \
4901     /* set tail elements to 1s */                                             \
4902     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
4903 }
4904 
4905 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4906 GEN_VEXT_VSLIDE1DOWN(16, H2)
4907 GEN_VEXT_VSLIDE1DOWN(32, H4)
4908 GEN_VEXT_VSLIDE1DOWN(64, H8)
4909 
4910 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4911 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4912                   CPURISCVState *env, uint32_t desc)              \
4913 {                                                                 \
4914     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4915 }
4916 
4917 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4918 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4919 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4920 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4921 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4922 
4923 /* Vector Floating-Point Slide Instructions */
4924 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4925 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4926                   CPURISCVState *env, uint32_t desc)          \
4927 {                                                             \
4928     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4929 }
4930 
4931 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4932 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4933 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4934 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4935 
4936 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4937 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4938                   CPURISCVState *env, uint32_t desc)          \
4939 {                                                             \
4940     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4941 }
4942 
4943 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4944 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4945 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4946 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4947 
4948 /* Vector Register Gather Instruction */
4949 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4950 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4951                   CPURISCVState *env, uint32_t desc)                      \
4952 {                                                                         \
4953     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4954     uint32_t vm = vext_vm(desc);                                          \
4955     uint32_t vl = env->vl;                                                \
4956     uint32_t esz = sizeof(TS2);                                           \
4957     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4958     uint32_t vta = vext_vta(desc);                                        \
4959     uint32_t vma = vext_vma(desc);                                        \
4960     uint64_t index;                                                       \
4961     uint32_t i;                                                           \
4962                                                                           \
4963     for (i = env->vstart; i < vl; i++) {                                  \
4964         if (!vm && !vext_elem_mask(v0, i)) {                              \
4965             /* set masked-off elements to 1s */                           \
4966             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4967             continue;                                                     \
4968         }                                                                 \
4969         index = *((TS1 *)vs1 + HS1(i));                                   \
4970         if (index >= vlmax) {                                             \
4971             *((TS2 *)vd + HS2(i)) = 0;                                    \
4972         } else {                                                          \
4973             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
4974         }                                                                 \
4975     }                                                                     \
4976     env->vstart = 0;                                                      \
4977     /* set tail elements to 1s */                                         \
4978     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4979 }
4980 
4981 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
4982 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
4983 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
4984 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
4985 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
4986 
4987 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
4988 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
4989 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
4990 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
4991 
4992 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
4993 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4994                   CPURISCVState *env, uint32_t desc)                      \
4995 {                                                                         \
4996     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4997     uint32_t vm = vext_vm(desc);                                          \
4998     uint32_t vl = env->vl;                                                \
4999     uint32_t esz = sizeof(ETYPE);                                         \
5000     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5001     uint32_t vta = vext_vta(desc);                                        \
5002     uint32_t vma = vext_vma(desc);                                        \
5003     uint64_t index = s1;                                                  \
5004     uint32_t i;                                                           \
5005                                                                           \
5006     for (i = env->vstart; i < vl; i++) {                                  \
5007         if (!vm && !vext_elem_mask(v0, i)) {                              \
5008             /* set masked-off elements to 1s */                           \
5009             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5010             continue;                                                     \
5011         }                                                                 \
5012         if (index >= vlmax) {                                             \
5013             *((ETYPE *)vd + H(i)) = 0;                                    \
5014         } else {                                                          \
5015             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5016         }                                                                 \
5017     }                                                                     \
5018     env->vstart = 0;                                                      \
5019     /* set tail elements to 1s */                                         \
5020     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5021 }
5022 
5023 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5024 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5025 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5026 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5027 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5028 
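/*
 * Illustrative, standalone sketch (not part of the original source):
 * register gather is an arbitrary permutation/broadcast, vd[i] =
 * vs2[index[i]], with any index at or beyond VLMAX reading as zero.
 * example_vrgather is hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static void example_vrgather(uint32_t *vd, const uint32_t *index,
                             const uint32_t *vs2, uint32_t vl, uint32_t vlmax)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = (index[i] < vlmax) ? vs2[index[i]] : 0;
    }
}
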
5029 /* Vector Compress Instruction */
5030 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5031 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5032                   CPURISCVState *env, uint32_t desc)                      \
5033 {                                                                         \
5034     uint32_t vl = env->vl;                                                \
5035     uint32_t esz = sizeof(ETYPE);                                         \
5036     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5037     uint32_t vta = vext_vta(desc);                                        \
5038     uint32_t num = 0, i;                                                  \
5039                                                                           \
5040     for (i = env->vstart; i < vl; i++) {                                  \
5041         if (!vext_elem_mask(vs1, i)) {                                    \
5042             continue;                                                     \
5043         }                                                                 \
5044         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5045         num++;                                                            \
5046     }                                                                     \
5047     env->vstart = 0;                                                      \
5048     /* set tail elements to 1s */                                         \
5049     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5050 }
5051 
5052 /* Compress into vd elements of vs2 where vs1 is enabled */
5053 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5054 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5055 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5056 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5057 
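/*
 * Illustrative, standalone sketch (not part of the original source):
 * vcompress is stream compaction, packing the vs2 elements whose mask bit
 * in vs1 is set into the low-numbered elements of vd and reporting how many
 * were kept.  example_vcompress is hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static uint32_t example_vcompress(uint32_t *vd, const uint8_t *vs1,
                                  const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;

    for (uint32_t i = 0; i < vl; i++) {
        if (vs1[i]) {
            vd[num++] = vs2[i];
        }
    }
    return num;               /* elements vd[num..] keep their old values */
}
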
5058 /* Vector Whole Register Move */
5059 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5060 {
5061     /* EEW = SEW */
5062     uint32_t maxsz = simd_maxsz(desc);
5063     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5064     uint32_t startb = env->vstart * sewb;
5065     uint32_t i = startb;
5066 
5067     memcpy((uint8_t *)vd + H1(i),
5068            (uint8_t *)vs2 + H1(i),
5069            maxsz - startb);
5070 
5071     env->vstart = 0;
5072 }
5073 
5074 /* Vector Integer Extension */
5075 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5076 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5077                   CPURISCVState *env, uint32_t desc)             \
5078 {                                                                \
5079     uint32_t vl = env->vl;                                       \
5080     uint32_t vm = vext_vm(desc);                                 \
5081     uint32_t esz = sizeof(ETYPE);                                \
5082     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5083     uint32_t vta = vext_vta(desc);                               \
5084     uint32_t vma = vext_vma(desc);                               \
5085     uint32_t i;                                                  \
5086                                                                  \
5087     for (i = env->vstart; i < vl; i++) {                         \
5088         if (!vm && !vext_elem_mask(v0, i)) {                     \
5089             /* set masked-off elements to 1s */                  \
5090             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5091             continue;                                            \
5092         }                                                        \
5093         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5094     }                                                            \
5095     env->vstart = 0;                                             \
5096     /* set tail elements to 1s */                                \
5097     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5098 }
5099 
5100 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5101 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5102 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5103 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5104 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5105 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5106 
5107 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5108 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5109 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5110 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5111 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5112 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5113
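/*
 * Illustrative, standalone sketch (not part of the original source): the
 * integer extension helpers read a narrower source element and let the C
 * conversion to the wider destination type perform the zero extension
 * (unsigned types, vzext) or sign extension (signed types, vsext).  The
 * example_* names are hypothetical; only <stdint.h> is assumed.
 */
#include <stdint.h>

static void example_zext_vf2(uint16_t *vd, const uint8_t *vs2, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = vs2[i];           /* 0x80 -> 0x0080 */
    }
}

static void example_sext_vf2(int16_t *vd, const int8_t *vs2, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = vs2[i];           /* 0x80 (-128) -> 0xff80 (-128) */
    }
}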