xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 38581e5c)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that need a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
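
/*
 * A minimal illustration of the fixup (hypothetical helper, not used by
 * the code below): on a big-endian host H1(0) == 7, i.e. guest byte
 * element 0 of a vector register lives in host byte 7 of the first
 * 64-bit chunk, while on a little-endian host it is simply host byte 0.
 */
static inline int8_t vext_example_get_byte(const void *vreg, int idx)
{
    /* Apply the host-endian index fixup before dereferencing. */
    return ((const int8_t *)vreg)[H1(idx)];
}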
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
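
/*
 * Worked example: vlmul = 5 (0b101, LMUL = 1/8) sign-extends to -3,
 * while vlmul = 1 (LMUL = 2) stays 1, matching the table above.
 */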
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vta_all_1s(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
133 }
134 
135 /*
136  * Get the maximum number of elements that can be operated on.
137  *
138  * log2_esz: log2 of element size in bytes.
139  */
140 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
141 {
142     /*
143      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
144      * so vlen in bytes (vlenb) is encoded as maxsz.
145      */
146     uint32_t vlenb = simd_maxsz(desc);
147 
148     /* Return VLMAX */
149     int scale = vext_lmul(desc) - log2_esz;
150     return scale < 0 ? vlenb >> -scale : vlenb << scale;
151 }
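
/*
 * Worked example (assuming VLEN = 128, so vlenb = 16): for SEW = 32
 * (log2_esz = 2) and LMUL = 2 (lmul = 1), scale = 1 - 2 = -1 and
 * VLMAX = 16 >> 1 = 8, i.e. VLEN * LMUL / SEW = 128 * 2 / 32 = 8.
 */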
152 
153 /*
154  * Get the total number of elements, including prestart, body and tail elements.
155  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
156  * are held in the same vector register.
157  */
158 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
159                                             uint32_t esz)
160 {
161     uint32_t vlenb = simd_maxsz(desc);
162     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
163     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
164                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
165     return (vlenb << emul) / esz;
166 }
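
/*
 * Worked example (assuming vlenb = 16): for a load with EEW = 16
 * (esz = 2) while vtype has SEW = 8 and LMUL = 1, emul is
 * ctzl(2) - ctzl(1) + 0 = 1, so the total is (16 << 1) / 2 = 16
 * elements spread over the two registers the access occupies.
 */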
167 
168 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
169 {
170     return (addr & env->cur_pmmask) | env->cur_pmbase;
171 }
172 
173 /*
174  * This function checks watchpoints before the real load operation.
175  *
176  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
177  * In user mode, there is no watchpoint support for now.
178  *
179  * It will trigger an exception if there is no mapping in the TLB
180  * and the page table walk can't fill the TLB entry. The guest software
181  * can then return here after processing the exception, or never return.
182  */
183 static void probe_pages(CPURISCVState *env, target_ulong addr,
184                         target_ulong len, uintptr_t ra,
185                         MMUAccessType access_type)
186 {
187     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
188     target_ulong curlen = MIN(pagelen, len);
189 
190     probe_access(env, adjust_addr(env, addr), curlen, access_type,
191                  cpu_mmu_index(env, false), ra);
192     if (len > curlen) {
193         addr += curlen;
194         curlen = len - curlen;
195         probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                      cpu_mmu_index(env, false), ra);
197     }
198 }
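
/*
 * Worked example (assuming 4 KiB pages, ignoring pointer masking): for
 * addr = 0x1ff8 and len = 16, pagelen = 8, so the first probe covers the
 * last 8 bytes of the page and the second probe covers the remaining
 * 8 bytes on the next page.
 */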
199 
200 /* set agnostic elements to 1s */
201 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
202                               uint32_t tot)
203 {
204     if (is_agnostic == 0) {
205         /* policy undisturbed */
206         return;
207     }
208     if (tot - cnt == 0) {
209         return;
210     }
211     memset(base + cnt, -1, tot - cnt);
212 }
213 
214 static inline void vext_set_elem_mask(void *v0, int index,
215                                       uint8_t value)
216 {
217     int idx = index / 64;
218     int pos = index % 64;
219     uint64_t old = ((uint64_t *)v0)[idx];
220     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
221 }
222 
223 /*
224  * Earlier designs (pre-0.9) had a varying number of bits
225  * per mask value (MLEN). In the 0.9 design, MLEN=1.
226  * (Section 4.5)
227  */
228 static inline int vext_elem_mask(void *v0, int index)
229 {
230     int idx = index / 64;
231     int pos = index  % 64;
232     return (((uint64_t *)v0)[idx] >> pos) & 1;
233 }
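
/*
 * Worked example: the mask bit for element 65 lives in word
 * ((uint64_t *)v0)[1] at bit position 1, since 65 / 64 == 1 and
 * 65 % 64 == 1.
 */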
234 
235 /* elements operations for load and store */
236 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
237                                uint32_t idx, void *vd, uintptr_t retaddr);
238 
239 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
240 static void NAME(CPURISCVState *env, abi_ptr addr,         \
241                  uint32_t idx, void *vd, uintptr_t retaddr)\
242 {                                                          \
243     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
244     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
245 }
246 
247 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
248 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
249 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
250 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
251 
252 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
253 static void NAME(CPURISCVState *env, abi_ptr addr,         \
254                  uint32_t idx, void *vd, uintptr_t retaddr)\
255 {                                                          \
256     ETYPE data = *((ETYPE *)vd + H(idx));                  \
257     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
258 }
259 
260 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
261 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
262 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
263 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
264 
265 /*
266  *** stride: access vector elements from strided memory
267  */
268 static void
269 vext_ldst_stride(void *vd, void *v0, target_ulong base,
270                  target_ulong stride, CPURISCVState *env,
271                  uint32_t desc, uint32_t vm,
272                  vext_ldst_elem_fn *ldst_elem,
273                  uint32_t log2_esz, uintptr_t ra)
274 {
275     uint32_t i, k;
276     uint32_t nf = vext_nf(desc);
277     uint32_t max_elems = vext_max_elems(desc, log2_esz);
278     uint32_t esz = 1 << log2_esz;
279     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
280     uint32_t vta = vext_vta(desc);
281 
282     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
283         if (!vm && !vext_elem_mask(v0, i)) {
284             continue;
285         }
286 
287         k = 0;
288         while (k < nf) {
289             target_ulong addr = base + stride * i + (k << log2_esz);
290             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
291             k++;
292         }
293     }
294     env->vstart = 0;
295     /* set tail elements to 1s */
296     for (k = 0; k < nf; ++k) {
297         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
298                           (k * max_elems + max_elems) * esz);
299     }
300     if (nf * max_elems % total_elems != 0) {
301         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
302         uint32_t registers_used =
303             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
304         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
305                           registers_used * vlenb);
306     }
307 }
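
/*
 * Illustrative sketch (hypothetical helper, not used below): the guest
 * address of field k of element i for a strided segment access, matching
 * the computation inside vext_ldst_stride() above.  For example, vlse32.v
 * with stride = 8 loads element 3 from base + 24.
 */
static inline target_ulong
vext_example_stride_addr(target_ulong base, target_ulong stride,
                         uint32_t i, uint32_t k, uint32_t log2_esz)
{
    /* element i advances by stride; field k advances by the element size */
    return base + stride * i + (k << log2_esz);
}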
308 
309 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
310 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
311                   target_ulong stride, CPURISCVState *env,              \
312                   uint32_t desc)                                        \
313 {                                                                       \
314     uint32_t vm = vext_vm(desc);                                        \
315     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
316                      ctzl(sizeof(ETYPE)), GETPC());                     \
317 }
318 
319 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
320 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
321 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
322 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
323 
324 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
325 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
326                   target_ulong stride, CPURISCVState *env,              \
327                   uint32_t desc)                                        \
328 {                                                                       \
329     uint32_t vm = vext_vm(desc);                                        \
330     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
331                      ctzl(sizeof(ETYPE)), GETPC());                     \
332 }
333 
334 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
335 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
336 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
337 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
338 
339 /*
340  *** unit-stride: access elements stored contiguously in memory
341  */
342 
343 /* unmasked unit-stride load and store operation */
344 static void
345 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
346              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
347              uintptr_t ra)
348 {
349     uint32_t i, k;
350     uint32_t nf = vext_nf(desc);
351     uint32_t max_elems = vext_max_elems(desc, log2_esz);
352     uint32_t esz = 1 << log2_esz;
353     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
354     uint32_t vta = vext_vta(desc);
355 
356     /* load bytes from guest memory */
357     for (i = env->vstart; i < evl; i++, env->vstart++) {
358         k = 0;
359         while (k < nf) {
360             target_ulong addr = base + ((i * nf + k) << log2_esz);
361             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
362             k++;
363         }
364     }
365     env->vstart = 0;
366     /* set tail elements to 1s */
367     for (k = 0; k < nf; ++k) {
368         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
369                           (k * max_elems + max_elems) * esz);
370     }
371     if (nf * max_elems % total_elems != 0) {
372         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
373         uint32_t registers_used =
374             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
375         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
376                           registers_used * vlenb);
377     }
378 }
379 
380 /*
381  * A masked unit-stride load or store operation is a special case of a
382  * strided operation with stride = NF * sizeof(ETYPE).
383  */
384 
385 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
386 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
387                          CPURISCVState *env, uint32_t desc)             \
388 {                                                                       \
389     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
390     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
391                      ctzl(sizeof(ETYPE)), GETPC());                     \
392 }                                                                       \
393                                                                         \
394 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
395                   CPURISCVState *env, uint32_t desc)                    \
396 {                                                                       \
397     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
398                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
399 }
400 
401 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
402 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
403 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
404 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
405 
406 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
407 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
408                          CPURISCVState *env, uint32_t desc)              \
409 {                                                                        \
410     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
411     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
412                      ctzl(sizeof(ETYPE)), GETPC());                      \
413 }                                                                        \
414                                                                          \
415 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
416                   CPURISCVState *env, uint32_t desc)                     \
417 {                                                                        \
418     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
419                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
420 }
421 
422 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
423 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
424 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
425 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
426 
427 /*
428  *** unit stride mask load and store, EEW = 1
429  */
430 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
431                     CPURISCVState *env, uint32_t desc)
432 {
433     /* evl = ceil(vl/8) */
434     uint8_t evl = (env->vl + 7) >> 3;
435     vext_ldst_us(vd, base, env, desc, lde_b,
436                  0, evl, GETPC());
437 }
438 
439 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, ste_b,
445                  0, evl, GETPC());
446 }
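
/*
 * Worked example: with vl = 17, evl = (17 + 7) >> 3 = 3, so vlm.v and
 * vsm.v transfer three bytes of mask data regardless of SEW.
 */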
447 
448 /*
449  *** index: access vector elements from indexed memory
450  */
451 typedef target_ulong vext_get_index_addr(target_ulong base,
452         uint32_t idx, void *vs2);
453 
454 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
455 static target_ulong NAME(target_ulong base,            \
456                          uint32_t idx, void *vs2)      \
457 {                                                      \
458     return (base + *((ETYPE *)vs2 + H(idx)));          \
459 }
460 
461 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
462 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
463 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
464 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
465 
466 static inline void
467 vext_ldst_index(void *vd, void *v0, target_ulong base,
468                 void *vs2, CPURISCVState *env, uint32_t desc,
469                 vext_get_index_addr get_index_addr,
470                 vext_ldst_elem_fn *ldst_elem,
471                 uint32_t log2_esz, uintptr_t ra)
472 {
473     uint32_t i, k;
474     uint32_t nf = vext_nf(desc);
475     uint32_t vm = vext_vm(desc);
476     uint32_t max_elems = vext_max_elems(desc, log2_esz);
477     uint32_t esz = 1 << log2_esz;
478     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
479     uint32_t vta = vext_vta(desc);
480 
481     /* load bytes from guest memory */
482     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
483         if (!vm && !vext_elem_mask(v0, i)) {
484             continue;
485         }
486 
487         k = 0;
488         while (k < nf) {
489             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
490             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
491             k++;
492         }
493     }
494     env->vstart = 0;
495     /* set tail elements to 1s */
496     for (k = 0; k < nf; ++k) {
497         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
498                           (k * max_elems + max_elems) * esz);
499     }
500     if (nf * max_elems % total_elems != 0) {
501         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
502         uint32_t registers_used =
503             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
504         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
505                           registers_used * vlenb);
506     }
507 }
508 
509 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
510 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
511                   void *vs2, CPURISCVState *env, uint32_t desc)            \
512 {                                                                          \
513     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
514                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
515 }
516 
517 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
518 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
519 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
520 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
521 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
522 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
523 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
524 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
525 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
526 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
527 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
528 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
529 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
530 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
531 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
532 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
533 
534 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
535 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
536                   void *vs2, CPURISCVState *env, uint32_t desc)  \
537 {                                                                \
538     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
539                     STORE_FN, ctzl(sizeof(ETYPE)),               \
540                     GETPC());                                    \
541 }
542 
543 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
544 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
545 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
546 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
547 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
548 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
549 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
550 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
551 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
552 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
553 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
554 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
555 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
556 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
557 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
558 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
559 
560 /*
561  *** unit-stride fault-only-first load instructions
562  */
563 static inline void
564 vext_ldff(void *vd, void *v0, target_ulong base,
565           CPURISCVState *env, uint32_t desc,
566           vext_ldst_elem_fn *ldst_elem,
567           uint32_t log2_esz, uintptr_t ra)
568 {
569     void *host;
570     uint32_t i, k, vl = 0;
571     uint32_t nf = vext_nf(desc);
572     uint32_t vm = vext_vm(desc);
573     uint32_t max_elems = vext_max_elems(desc, log2_esz);
574     uint32_t esz = 1 << log2_esz;
575     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
576     uint32_t vta = vext_vta(desc);
577     target_ulong addr, offset, remain;
578 
579     /* probe every access */
580     for (i = env->vstart; i < env->vl; i++) {
581         if (!vm && !vext_elem_mask(v0, i)) {
582             continue;
583         }
584         addr = adjust_addr(env, base + i * (nf << log2_esz));
585         if (i == 0) {
586             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
587         } else {
588             /* if it triggers an exception, no need to check watchpoint */
589             remain = nf << log2_esz;
590             while (remain > 0) {
591                 offset = -(addr | TARGET_PAGE_MASK);
592                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
593                                          cpu_mmu_index(env, false));
594                 if (host) {
595 #ifdef CONFIG_USER_ONLY
596                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
597                         vl = i;
598                         goto ProbeSuccess;
599                     }
600 #else
601                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
602 #endif
603                 } else {
604                     vl = i;
605                     goto ProbeSuccess;
606                 }
607                 if (remain <= offset) {
608                     break;
609                 }
610                 remain -= offset;
611                 addr = adjust_addr(env, addr + offset);
612             }
613         }
614     }
615 ProbeSuccess:
616     /* load bytes from guest memory */
617     if (vl != 0) {
618         env->vl = vl;
619     }
620     for (i = env->vstart; i < env->vl; i++) {
621         k = 0;
622         if (!vm && !vext_elem_mask(v0, i)) {
623             continue;
624         }
625         while (k < nf) {
626             target_ulong addr = base + ((i * nf + k) << log2_esz);
627             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
628             k++;
629         }
630     }
631     env->vstart = 0;
632     /* set tail elements to 1s */
633     for (k = 0; k < nf; ++k) {
634         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
635                           (k * max_elems + max_elems) * esz);
636     }
637     if (nf * max_elems % total_elems != 0) {
638         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
639         uint32_t registers_used =
640             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
641         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
642                           registers_used * vlenb);
643     }
644 }
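
/*
 * Worked example of the fault-only-first behaviour above: if element 0
 * probes successfully but the page for element 5 cannot be accessed, the
 * probe loop shrinks env->vl to 5 and only elements 0..4 are loaded,
 * without raising a trap.
 */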
645 
646 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
647 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
648                   CPURISCVState *env, uint32_t desc)      \
649 {                                                         \
650     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
651               ctzl(sizeof(ETYPE)), GETPC());              \
652 }
653 
654 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
655 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
656 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
657 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
658 
659 #define DO_SWAP(N, M) (M)
660 #define DO_AND(N, M)  (N & M)
661 #define DO_XOR(N, M)  (N ^ M)
662 #define DO_OR(N, M)   (N | M)
663 #define DO_ADD(N, M)  (N + M)
664 
665 /* Signed min/max */
666 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
667 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
668 
669 /* Unsigned min/max */
670 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
671 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
672 
673 /*
674  *** load and store whole register instructions
675  */
676 static void
677 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
678                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
679 {
680     uint32_t i, k, off, pos;
681     uint32_t nf = vext_nf(desc);
682     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
683     uint32_t max_elems = vlenb >> log2_esz;
684 
685     k = env->vstart / max_elems;
686     off = env->vstart % max_elems;
687 
688     if (off) {
689         /* load/store rest of elements of the segment pointed to by vstart */
690         for (pos = off; pos < max_elems; pos++, env->vstart++) {
691             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
692             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
693         }
694         k++;
695     }
696 
697     /* load/store elements for rest of segments */
698     for (; k < nf; k++) {
699         for (i = 0; i < max_elems; i++, env->vstart++) {
700             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
701             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
702         }
703     }
704 
705     env->vstart = 0;
706 }
707 
708 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
709 void HELPER(NAME)(void *vd, target_ulong base,       \
710                   CPURISCVState *env, uint32_t desc) \
711 {                                                    \
712     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
713                     ctzl(sizeof(ETYPE)), GETPC());   \
714 }
715 
716 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
717 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
718 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
719 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
720 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
721 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
722 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
723 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
724 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
725 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
726 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
727 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
728 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
729 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
730 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
731 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
732 
733 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
734 void HELPER(NAME)(void *vd, target_ulong base,       \
735                   CPURISCVState *env, uint32_t desc) \
736 {                                                    \
737     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
738                     ctzl(sizeof(ETYPE)), GETPC());   \
739 }
740 
741 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
742 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
743 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
744 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
745 
746 /*
747  *** Vector Integer Arithmetic Instructions
748  */
749 
750 /* expand macro args before macro */
751 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
752 
753 /* (TD, T1, T2, TX1, TX2) */
754 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
755 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
756 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
757 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
758 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
759 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
760 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
761 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
762 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
763 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
764 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
765 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
766 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
767 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
768 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
769 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
770 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
771 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
772 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
773 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
774 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
775 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
776 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
777 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
778 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
779 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
780 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
781 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
782 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
783 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
784 
785 /* operation of two vector elements */
786 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
787 
788 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
789 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
790 {                                                               \
791     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
792     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
793     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
794 }
795 #define DO_SUB(N, M) (N - M)
796 #define DO_RSUB(N, M) (M - N)
797 
798 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
799 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
800 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
801 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
802 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
803 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
804 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
805 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
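
/*
 * For illustration, the first RVVCALL line above expands roughly to:
 *
 *   static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *   {
 *       int8_t s1 = *((int8_t *)vs1 + H1(i));
 *       int8_t s2 = *((int8_t *)vs2 + H1(i));
 *       *((int8_t *)vd + H1(i)) = s2 + s1;
 *   }
 */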
806 
807 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
808                        CPURISCVState *env, uint32_t desc,
809                        opivv2_fn *fn, uint32_t esz)
810 {
811     uint32_t vm = vext_vm(desc);
812     uint32_t vl = env->vl;
813     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
814     uint32_t vta = vext_vta(desc);
815     uint32_t i;
816 
817     for (i = env->vstart; i < vl; i++) {
818         if (!vm && !vext_elem_mask(v0, i)) {
819             continue;
820         }
821         fn(vd, vs1, vs2, i);
822     }
823     env->vstart = 0;
824     /* set tail elements to 1s */
825     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
826 }
827 
828 /* generate the helpers for OPIVV */
829 #define GEN_VEXT_VV(NAME, ESZ)                            \
830 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
831                   void *vs2, CPURISCVState *env,          \
832                   uint32_t desc)                          \
833 {                                                         \
834     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
835                do_##NAME, ESZ);                           \
836 }
837 
838 GEN_VEXT_VV(vadd_vv_b, 1)
839 GEN_VEXT_VV(vadd_vv_h, 2)
840 GEN_VEXT_VV(vadd_vv_w, 4)
841 GEN_VEXT_VV(vadd_vv_d, 8)
842 GEN_VEXT_VV(vsub_vv_b, 1)
843 GEN_VEXT_VV(vsub_vv_h, 2)
844 GEN_VEXT_VV(vsub_vv_w, 4)
845 GEN_VEXT_VV(vsub_vv_d, 8)
846 
847 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
848 
849 /*
850  * (T1)s1 gives the real operand type.
851  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
852  */
853 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
854 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
855 {                                                                   \
856     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
857     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
858 }
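
/*
 * Worked example of the casts above: for do_vwadd_vx_b (WOP_SSS_B, used
 * later for vwadd_vx_b), T1 is int8_t and TX1 is int16_t, so (TX1)(T1)s1
 * first truncates the scalar to 8 bits and then sign-extends it to the
 * 16-bit result type.
 */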
859 
860 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
861 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
862 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
863 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
864 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
865 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
866 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
867 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
868 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
869 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
870 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
871 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
872 
873 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
874                        CPURISCVState *env, uint32_t desc,
875                        opivx2_fn fn, uint32_t esz)
876 {
877     uint32_t vm = vext_vm(desc);
878     uint32_t vl = env->vl;
879     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
880     uint32_t vta = vext_vta(desc);
881     uint32_t i;
882 
883     for (i = env->vstart; i < vl; i++) {
884         if (!vm && !vext_elem_mask(v0, i)) {
885             continue;
886         }
887         fn(vd, s1, vs2, i);
888     }
889     env->vstart = 0;
890     /* set tail elements to 1s */
891     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
892 }
893 
894 /* generate the helpers for OPIVX */
895 #define GEN_VEXT_VX(NAME, ESZ)                            \
896 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
897                   void *vs2, CPURISCVState *env,          \
898                   uint32_t desc)                          \
899 {                                                         \
900     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
901                do_##NAME, ESZ);                           \
902 }
903 
904 GEN_VEXT_VX(vadd_vx_b, 1)
905 GEN_VEXT_VX(vadd_vx_h, 2)
906 GEN_VEXT_VX(vadd_vx_w, 4)
907 GEN_VEXT_VX(vadd_vx_d, 8)
908 GEN_VEXT_VX(vsub_vx_b, 1)
909 GEN_VEXT_VX(vsub_vx_h, 2)
910 GEN_VEXT_VX(vsub_vx_w, 4)
911 GEN_VEXT_VX(vsub_vx_d, 8)
912 GEN_VEXT_VX(vrsub_vx_b, 1)
913 GEN_VEXT_VX(vrsub_vx_h, 2)
914 GEN_VEXT_VX(vrsub_vx_w, 4)
915 GEN_VEXT_VX(vrsub_vx_d, 8)
916 
917 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
918 {
919     intptr_t oprsz = simd_oprsz(desc);
920     intptr_t i;
921 
922     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
923         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
924     }
925 }
926 
927 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
928 {
929     intptr_t oprsz = simd_oprsz(desc);
930     intptr_t i;
931 
932     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
933         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
934     }
935 }
936 
937 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
938 {
939     intptr_t oprsz = simd_oprsz(desc);
940     intptr_t i;
941 
942     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
943         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
944     }
945 }
946 
947 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
948 {
949     intptr_t oprsz = simd_oprsz(desc);
950     intptr_t i;
951 
952     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
953         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
954     }
955 }
956 
957 /* Vector Widening Integer Add/Subtract */
958 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
959 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
960 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
961 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
962 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
963 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
964 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
965 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
966 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
967 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
968 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
969 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
970 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
971 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
972 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
973 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
974 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
975 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
976 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
977 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
978 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
979 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
980 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
981 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
982 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
983 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
984 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
985 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
986 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
987 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
988 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
989 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
990 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
991 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
992 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
993 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
994 GEN_VEXT_VV(vwaddu_vv_b, 2)
995 GEN_VEXT_VV(vwaddu_vv_h, 4)
996 GEN_VEXT_VV(vwaddu_vv_w, 8)
997 GEN_VEXT_VV(vwsubu_vv_b, 2)
998 GEN_VEXT_VV(vwsubu_vv_h, 4)
999 GEN_VEXT_VV(vwsubu_vv_w, 8)
1000 GEN_VEXT_VV(vwadd_vv_b, 2)
1001 GEN_VEXT_VV(vwadd_vv_h, 4)
1002 GEN_VEXT_VV(vwadd_vv_w, 8)
1003 GEN_VEXT_VV(vwsub_vv_b, 2)
1004 GEN_VEXT_VV(vwsub_vv_h, 4)
1005 GEN_VEXT_VV(vwsub_vv_w, 8)
1006 GEN_VEXT_VV(vwaddu_wv_b, 2)
1007 GEN_VEXT_VV(vwaddu_wv_h, 4)
1008 GEN_VEXT_VV(vwaddu_wv_w, 8)
1009 GEN_VEXT_VV(vwsubu_wv_b, 2)
1010 GEN_VEXT_VV(vwsubu_wv_h, 4)
1011 GEN_VEXT_VV(vwsubu_wv_w, 8)
1012 GEN_VEXT_VV(vwadd_wv_b, 2)
1013 GEN_VEXT_VV(vwadd_wv_h, 4)
1014 GEN_VEXT_VV(vwadd_wv_w, 8)
1015 GEN_VEXT_VV(vwsub_wv_b, 2)
1016 GEN_VEXT_VV(vwsub_wv_h, 4)
1017 GEN_VEXT_VV(vwsub_wv_w, 8)
1018 
1019 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1020 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1021 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1022 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1023 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1024 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1025 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1026 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1027 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1028 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1029 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1030 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1031 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1032 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1033 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1034 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1035 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1036 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1037 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1038 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1039 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1040 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1041 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1042 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1043 GEN_VEXT_VX(vwaddu_vx_b, 2)
1044 GEN_VEXT_VX(vwaddu_vx_h, 4)
1045 GEN_VEXT_VX(vwaddu_vx_w, 8)
1046 GEN_VEXT_VX(vwsubu_vx_b, 2)
1047 GEN_VEXT_VX(vwsubu_vx_h, 4)
1048 GEN_VEXT_VX(vwsubu_vx_w, 8)
1049 GEN_VEXT_VX(vwadd_vx_b, 2)
1050 GEN_VEXT_VX(vwadd_vx_h, 4)
1051 GEN_VEXT_VX(vwadd_vx_w, 8)
1052 GEN_VEXT_VX(vwsub_vx_b, 2)
1053 GEN_VEXT_VX(vwsub_vx_h, 4)
1054 GEN_VEXT_VX(vwsub_vx_w, 8)
1055 GEN_VEXT_VX(vwaddu_wx_b, 2)
1056 GEN_VEXT_VX(vwaddu_wx_h, 4)
1057 GEN_VEXT_VX(vwaddu_wx_w, 8)
1058 GEN_VEXT_VX(vwsubu_wx_b, 2)
1059 GEN_VEXT_VX(vwsubu_wx_h, 4)
1060 GEN_VEXT_VX(vwsubu_wx_w, 8)
1061 GEN_VEXT_VX(vwadd_wx_b, 2)
1062 GEN_VEXT_VX(vwadd_wx_h, 4)
1063 GEN_VEXT_VX(vwadd_wx_w, 8)
1064 GEN_VEXT_VX(vwsub_wx_b, 2)
1065 GEN_VEXT_VX(vwsub_wx_h, 4)
1066 GEN_VEXT_VX(vwsub_wx_w, 8)
1067 
1068 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1069 #define DO_VADC(N, M, C) (N + M + C)
1070 #define DO_VSBC(N, M, C) (N - M - C)
1071 
1072 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1073 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1074                   CPURISCVState *env, uint32_t desc)          \
1075 {                                                             \
1076     uint32_t vl = env->vl;                                    \
1077     uint32_t esz = sizeof(ETYPE);                             \
1078     uint32_t total_elems =                                    \
1079         vext_get_total_elems(env, desc, esz);                 \
1080     uint32_t vta = vext_vta(desc);                            \
1081     uint32_t i;                                               \
1082                                                               \
1083     for (i = env->vstart; i < vl; i++) {                      \
1084         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1085         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1086         ETYPE carry = vext_elem_mask(v0, i);                  \
1087                                                               \
1088         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1089     }                                                         \
1090     env->vstart = 0;                                          \
1091     /* set tail elements to 1s */                             \
1092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1093 }
1094 
1095 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1096 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1097 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1098 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1099 
1100 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1101 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1102 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1103 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1104 
1105 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1106 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1107                   CPURISCVState *env, uint32_t desc)                     \
1108 {                                                                        \
1109     uint32_t vl = env->vl;                                               \
1110     uint32_t esz = sizeof(ETYPE);                                        \
1111     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1112     uint32_t vta = vext_vta(desc);                                       \
1113     uint32_t i;                                                          \
1114                                                                          \
1115     for (i = env->vstart; i < vl; i++) {                                 \
1116         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1117         ETYPE carry = vext_elem_mask(v0, i);                             \
1118                                                                          \
1119         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1120     }                                                                    \
1121     env->vstart = 0;                                          \
1122     /* set tail elements to 1s */                                        \
1123     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1124 }
1125 
1126 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1127 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1128 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1129 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1130 
1131 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1132 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1133 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1134 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1135 
1136 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1137                           (__typeof(N))(N + M) < N)
1138 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
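
/*
 * Worked carry-out example for DO_MADC with uint8_t operands:
 * N = 200, M = 100, C = 0 gives (uint8_t)(N + M) = 44 < 200, so the
 * carry-out bit is 1; DO_MSBC likewise reports the borrow of N - M - C.
 */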
1139 
1140 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1141 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1142                   CPURISCVState *env, uint32_t desc)          \
1143 {                                                             \
1144     uint32_t vl = env->vl;                                    \
1145     uint32_t vm = vext_vm(desc);                              \
1146     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1147     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1148     uint32_t i;                                               \
1149                                                               \
1150     for (i = env->vstart; i < vl; i++) {                      \
1151         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1152         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1153         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1154         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1155     }                                                         \
1156     env->vstart = 0;                                          \
1157     /* mask destination register is always tail-agnostic */   \
1158     /* set tail elements to 1s */                             \
1159     if (vta_all_1s) {                                         \
1160         for (; i < total_elems; i++) {                        \
1161             vext_set_elem_mask(vd, i, 1);                     \
1162         }                                                     \
1163     }                                                         \
1164 }
1165 
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1167 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1168 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1169 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1170 
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1172 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1173 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1174 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1175 
1176 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1177 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1178                   void *vs2, CPURISCVState *env, uint32_t desc) \
1179 {                                                               \
1180     uint32_t vl = env->vl;                                      \
1181     uint32_t vm = vext_vm(desc);                                \
1182     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1183     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1184     uint32_t i;                                                 \
1185                                                                 \
1186     for (i = env->vstart; i < vl; i++) {                        \
1187         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1188         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1189         vext_set_elem_mask(vd, i,                               \
1190                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1191     }                                                           \
1192     env->vstart = 0;                                            \
1193     /* mask destination register is always tail-agnostic */     \
1194     /* set tail elements to 1s */                               \
1195     if (vta_all_1s) {                                           \
1196         for (; i < total_elems; i++) {                          \
1197             vext_set_elem_mask(vd, i, 1);                       \
1198         }                                                       \
1199     }                                                           \
1200 }
1201 
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1204 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1205 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1206 
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1209 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1210 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1211 
1212 /* Vector Bitwise Logical Instructions */
1213 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1215 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1216 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1217 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1219 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1220 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1221 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1223 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1224 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1225 GEN_VEXT_VV(vand_vv_b, 1)
1226 GEN_VEXT_VV(vand_vv_h, 2)
1227 GEN_VEXT_VV(vand_vv_w, 4)
1228 GEN_VEXT_VV(vand_vv_d, 8)
1229 GEN_VEXT_VV(vor_vv_b, 1)
1230 GEN_VEXT_VV(vor_vv_h, 2)
1231 GEN_VEXT_VV(vor_vv_w, 4)
1232 GEN_VEXT_VV(vor_vv_d, 8)
1233 GEN_VEXT_VV(vxor_vv_b, 1)
1234 GEN_VEXT_VV(vxor_vv_h, 2)
1235 GEN_VEXT_VV(vxor_vv_w, 4)
1236 GEN_VEXT_VV(vxor_vv_d, 8)
1237 
1238 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1240 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1241 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1242 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1244 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1245 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1246 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1248 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1249 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1250 GEN_VEXT_VX(vand_vx_b, 1)
1251 GEN_VEXT_VX(vand_vx_h, 2)
1252 GEN_VEXT_VX(vand_vx_w, 4)
1253 GEN_VEXT_VX(vand_vx_d, 8)
1254 GEN_VEXT_VX(vor_vx_b, 1)
1255 GEN_VEXT_VX(vor_vx_h, 2)
1256 GEN_VEXT_VX(vor_vx_w, 4)
1257 GEN_VEXT_VX(vor_vx_d, 8)
1258 GEN_VEXT_VX(vxor_vx_b, 1)
1259 GEN_VEXT_VX(vxor_vx_h, 2)
1260 GEN_VEXT_VX(vxor_vx_w, 4)
1261 GEN_VEXT_VX(vxor_vx_d, 8)
1262 
1263 /* Vector Single-Width Bit Shift Instructions */
1264 #define DO_SLL(N, M)  (N << (M))
1265 #define DO_SRL(N, M)  (N >> (M))
1266 
1267 /* generate the helpers for shift instructions with two vector operands */
1268 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1269 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1270                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1271 {                                                                         \
1272     uint32_t vm = vext_vm(desc);                                          \
1273     uint32_t vl = env->vl;                                                \
1274     uint32_t esz = sizeof(TS1);                                           \
1275     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1276     uint32_t vta = vext_vta(desc);                                        \
1277     uint32_t i;                                                           \
1278                                                                           \
1279     for (i = env->vstart; i < vl; i++) {                                  \
1280         if (!vm && !vext_elem_mask(v0, i)) {                              \
1281             continue;                                                     \
1282         }                                                                 \
1283         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1284         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1285         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1286     }                                                                     \
1287     env->vstart = 0;                                                      \
1288     /* set tail elements to 1s */                                         \
1289     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1290 }
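/*
 * For example, in the vsll_vv_b expansion below MASK is 0x7, so only the
 * low log2(SEW) bits of the shift operand are used: a shift amount of 9 in
 * vs1 behaves as 9 & 0x7 = 1.  Roughly, each active element computes
 *
 *     vd[i] = OP(vs2[i], vs1[i] & MASK);
 *
 * inactive (masked-off) elements are skipped, and tail elements are handled
 * by vext_set_elems_1s().
 */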
1291 
1292 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1293 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1294 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1295 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1296 
1297 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1298 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1299 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1300 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1301 
1302 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1303 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1304 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1305 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1306 
1307 /* generate the helpers for shift instructions with one vector and one scalar */
1308 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1309 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1310         void *vs2, CPURISCVState *env, uint32_t desc)       \
1311 {                                                           \
1312     uint32_t vm = vext_vm(desc);                            \
1313     uint32_t vl = env->vl;                                  \
1314     uint32_t esz = sizeof(TD);                              \
1315     uint32_t total_elems =                                  \
1316         vext_get_total_elems(env, desc, esz);               \
1317     uint32_t vta = vext_vta(desc);                          \
1318     uint32_t i;                                             \
1319                                                             \
1320     for (i = env->vstart; i < vl; i++) {                    \
1321         if (!vm && !vext_elem_mask(v0, i)) {                \
1322             continue;                                       \
1323         }                                                   \
1324         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1325         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1326     }                                                       \
1327     env->vstart = 0;                                        \
1328     /* set tail elements to 1s */                           \
1329     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1330 }
1331 
1332 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1333 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1334 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1335 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1336 
1337 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1338 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1339 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1341 
1342 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1343 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1346 
1347 /* Vector Narrowing Integer Right Shift Instructions */
1348 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1349 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1350 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1351 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1352 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1353 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1354 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1355 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1356 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1357 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1358 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1359 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
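/*
 * In the narrowing forms the source type TS2 is twice as wide as the
 * destination type TS1, and MASK covers log2(2 * SEW) bits of the shift
 * amount.  For example, vnsrl_wv_b reads 16-bit source elements, shifts by
 * (vs1[i] & 0xf), and stores only the low 8 bits: with vs2[i] = 0x1234 and
 * a shift of 4, the intermediate value is 0x0123 and vd[i] becomes 0x23.
 */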
1360 
1361 /* Vector Integer Comparison Instructions */
1362 #define DO_MSEQ(N, M) (N == M)
1363 #define DO_MSNE(N, M) (N != M)
1364 #define DO_MSLT(N, M) (N < M)
1365 #define DO_MSLE(N, M) (N <= M)
1366 #define DO_MSGT(N, M) (N > M)
1367 
1368 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1369 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1370                   CPURISCVState *env, uint32_t desc)          \
1371 {                                                             \
1372     uint32_t vm = vext_vm(desc);                              \
1373     uint32_t vl = env->vl;                                    \
1374     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1375     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1376     uint32_t i;                                               \
1377                                                               \
1378     for (i = env->vstart; i < vl; i++) {                      \
1379         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1380         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1381         if (!vm && !vext_elem_mask(v0, i)) {                  \
1382             continue;                                         \
1383         }                                                     \
1384         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1385     }                                                         \
1386     env->vstart = 0;                                          \
1387     /* mask destination register is always tail-agnostic */   \
1388     /* set tail elements to 1s */                             \
1389     if (vta_all_1s) {                                         \
1390         for (; i < total_elems; i++) {                        \
1391             vext_set_elem_mask(vd, i, 1);                     \
1392         }                                                     \
1393     }                                                         \
1394 }
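/*
 * For example, vmseq_vv_b with vl = 4 compares four byte elements and
 * writes the four results into bits 0..3 of vd.  The remaining mask bits
 * are only forced to 1 when the all-ones tail policy is in effect
 * (vta_all_1s), since mask-producing instructions always treat the tail
 * as agnostic.
 */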
1395 
1396 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1397 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1398 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1399 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1400 
1401 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1402 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1403 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1404 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1405 
1406 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1407 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1408 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1409 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1410 
1411 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1412 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1413 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1414 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1415 
1416 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1417 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1418 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1419 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1420 
1421 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1422 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1423 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1424 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1425 
1426 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1427 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1428                   CPURISCVState *env, uint32_t desc)                \
1429 {                                                                   \
1430     uint32_t vm = vext_vm(desc);                                    \
1431     uint32_t vl = env->vl;                                          \
1432     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1433     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1434     uint32_t i;                                                     \
1435                                                                     \
1436     for (i = env->vstart; i < vl; i++) {                            \
1437         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1438         if (!vm && !vext_elem_mask(v0, i)) {                        \
1439             continue;                                               \
1440         }                                                           \
1441         vext_set_elem_mask(vd, i,                                   \
1442                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1443     }                                                               \
1444     env->vstart = 0;                                                \
1445     /* mask destination register is always tail-agnostic */         \
1446     /* set tail elements to 1s */                                   \
1447     if (vta_all_1s) {                                               \
1448         for (; i < total_elems; i++) {                              \
1449             vext_set_elem_mask(vd, i, 1);                           \
1450         }                                                           \
1451     }                                                               \
1452 }
1453 
1454 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1455 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1456 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1457 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1458 
1459 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1460 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1461 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1462 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1463 
1464 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1465 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1466 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1467 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1468 
1469 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1470 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1471 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1472 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1473 
1474 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1475 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1476 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1477 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1478 
1479 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1480 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1481 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1482 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1483 
1484 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1485 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1486 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1487 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1488 
1489 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1490 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1491 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1492 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1493 
1494 /* Vector Integer Min/Max Instructions */
1495 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1496 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1497 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1498 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1499 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1500 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1501 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1502 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1503 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1504 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1505 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1506 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1507 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1508 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1509 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1510 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1511 GEN_VEXT_VV(vminu_vv_b, 1)
1512 GEN_VEXT_VV(vminu_vv_h, 2)
1513 GEN_VEXT_VV(vminu_vv_w, 4)
1514 GEN_VEXT_VV(vminu_vv_d, 8)
1515 GEN_VEXT_VV(vmin_vv_b, 1)
1516 GEN_VEXT_VV(vmin_vv_h, 2)
1517 GEN_VEXT_VV(vmin_vv_w, 4)
1518 GEN_VEXT_VV(vmin_vv_d, 8)
1519 GEN_VEXT_VV(vmaxu_vv_b, 1)
1520 GEN_VEXT_VV(vmaxu_vv_h, 2)
1521 GEN_VEXT_VV(vmaxu_vv_w, 4)
1522 GEN_VEXT_VV(vmaxu_vv_d, 8)
1523 GEN_VEXT_VV(vmax_vv_b, 1)
1524 GEN_VEXT_VV(vmax_vv_h, 2)
1525 GEN_VEXT_VV(vmax_vv_w, 4)
1526 GEN_VEXT_VV(vmax_vv_d, 8)
1527 
1528 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1529 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1530 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1531 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1532 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1533 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1534 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1535 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1536 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1537 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1538 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1539 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1540 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1541 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1542 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1543 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1544 GEN_VEXT_VX(vminu_vx_b, 1)
1545 GEN_VEXT_VX(vminu_vx_h, 2)
1546 GEN_VEXT_VX(vminu_vx_w, 4)
1547 GEN_VEXT_VX(vminu_vx_d, 8)
1548 GEN_VEXT_VX(vmin_vx_b, 1)
1549 GEN_VEXT_VX(vmin_vx_h, 2)
1550 GEN_VEXT_VX(vmin_vx_w, 4)
1551 GEN_VEXT_VX(vmin_vx_d, 8)
1552 GEN_VEXT_VX(vmaxu_vx_b, 1)
1553 GEN_VEXT_VX(vmaxu_vx_h, 2)
1554 GEN_VEXT_VX(vmaxu_vx_w, 4)
1555 GEN_VEXT_VX(vmaxu_vx_d, 8)
1556 GEN_VEXT_VX(vmax_vx_b, 1)
1557 GEN_VEXT_VX(vmax_vx_h, 2)
1558 GEN_VEXT_VX(vmax_vx_w, 4)
1559 GEN_VEXT_VX(vmax_vx_d, 8)
1560 
1561 /* Vector Single-Width Integer Multiply Instructions */
1562 #define DO_MUL(N, M) (N * M)
1563 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1564 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1565 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1566 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1567 GEN_VEXT_VV(vmul_vv_b, 1)
1568 GEN_VEXT_VV(vmul_vv_h, 2)
1569 GEN_VEXT_VV(vmul_vv_w, 4)
1570 GEN_VEXT_VV(vmul_vv_d, 8)
1571 
1572 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1573 {
1574     return (int16_t)s2 * (int16_t)s1 >> 8;
1575 }
1576 
1577 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1578 {
1579     return (int32_t)s2 * (int32_t)s1 >> 16;
1580 }
1581 
1582 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1583 {
1584     return (int64_t)s2 * (int64_t)s1 >> 32;
1585 }
1586 
1587 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1588 {
1589     uint64_t hi_64, lo_64;
1590 
1591     muls64(&lo_64, &hi_64, s1, s2);
1592     return hi_64;
1593 }
1594 
1595 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1596 {
1597     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1598 }
1599 
1600 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1601 {
1602     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1603 }
1604 
1605 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1606 {
1607     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1608 }
1609 
1610 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1611 {
1612     uint64_t hi_64, lo_64;
1613 
1614     mulu64(&lo_64, &hi_64, s2, s1);
1615     return hi_64;
1616 }
1617 
1618 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1619 {
1620     return (int16_t)s2 * (uint16_t)s1 >> 8;
1621 }
1622 
1623 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1624 {
1625     return (int32_t)s2 * (uint32_t)s1 >> 16;
1626 }
1627 
1628 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1629 {
1630     return (int64_t)s2 * (uint64_t)s1 >> 32;
1631 }
1632 
1633 /*
1634  * Let  A = signed operand,
1635  *      B = unsigned operand,
1636  *      P = mulu64(A, B), the unsigned product.
1637  *
1638  * Let  X = 2 ** 64 - A, the two's complement of A,
1639  *      SP = the signed product.
1640  * Then
1641  *      if A < 0
1642  *          SP = -X * B
1643  *             = -(2 ** 64 - A) * B
1644  *             = A * B - 2 ** 64 * B
1645  *             = P - 2 ** 64 * B
1646  *      else
1647  *          SP = P
1648  * so the high half of P only needs the fixup
1649  *      HI_P -= (A < 0 ? B : 0)
1650  */
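/*
 * An 8-bit analogue of the fixup above: A = -2 (bit pattern 0xFE), B = 3.
 * The unsigned product is 0xFE * 3 = 0x02FA, so HI = 0x02 and LO = 0xFA.
 * The true signed * unsigned product is -6 = 0xFFFA, and after
 * HI -= B (0x02 - 0x03 = 0xFF mod 2**8) the {HI, LO} pair is correct.
 */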
1651 
1652 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1653 {
1654     uint64_t hi_64, lo_64;
1655 
1656     mulu64(&lo_64, &hi_64, s2, s1);
1657 
1658     hi_64 -= s2 < 0 ? s1 : 0;
1659     return hi_64;
1660 }
1661 
1662 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1663 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1664 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1665 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1666 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1667 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1668 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1669 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1670 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1671 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1672 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1673 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1674 GEN_VEXT_VV(vmulh_vv_b, 1)
1675 GEN_VEXT_VV(vmulh_vv_h, 2)
1676 GEN_VEXT_VV(vmulh_vv_w, 4)
1677 GEN_VEXT_VV(vmulh_vv_d, 8)
1678 GEN_VEXT_VV(vmulhu_vv_b, 1)
1679 GEN_VEXT_VV(vmulhu_vv_h, 2)
1680 GEN_VEXT_VV(vmulhu_vv_w, 4)
1681 GEN_VEXT_VV(vmulhu_vv_d, 8)
1682 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1683 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1684 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1685 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1686 
1687 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1688 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1689 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1690 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1691 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1692 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1693 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1694 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1695 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1696 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1697 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1698 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1699 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1700 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1701 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1702 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1703 GEN_VEXT_VX(vmul_vx_b, 1)
1704 GEN_VEXT_VX(vmul_vx_h, 2)
1705 GEN_VEXT_VX(vmul_vx_w, 4)
1706 GEN_VEXT_VX(vmul_vx_d, 8)
1707 GEN_VEXT_VX(vmulh_vx_b, 1)
1708 GEN_VEXT_VX(vmulh_vx_h, 2)
1709 GEN_VEXT_VX(vmulh_vx_w, 4)
1710 GEN_VEXT_VX(vmulh_vx_d, 8)
1711 GEN_VEXT_VX(vmulhu_vx_b, 1)
1712 GEN_VEXT_VX(vmulhu_vx_h, 2)
1713 GEN_VEXT_VX(vmulhu_vx_w, 4)
1714 GEN_VEXT_VX(vmulhu_vx_d, 8)
1715 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1716 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1717 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1718 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1719 
1720 /* Vector Integer Divide Instructions */
1721 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1722 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1723 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1724         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1725 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1726         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
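/*
 * N == -N holds only for 0 and for the most negative value of N's type, so
 * together with M == -1 it flags the INT_MIN / -1 overflow case (N == 0 is
 * harmless there, since 0 / -1 and 0 % -1 need no special handling).  The
 * results match the RISC-V divide semantics: division by zero yields
 * all-ones (div) or the dividend (rem), and signed overflow yields the
 * dividend (div) or 0 (rem).
 */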
1727 
1728 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1729 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1730 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1731 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1732 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1733 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1734 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1735 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1736 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1737 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1738 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1739 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1740 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1741 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1742 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1743 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1744 GEN_VEXT_VV(vdivu_vv_b, 1)
1745 GEN_VEXT_VV(vdivu_vv_h, 2)
1746 GEN_VEXT_VV(vdivu_vv_w, 4)
1747 GEN_VEXT_VV(vdivu_vv_d, 8)
1748 GEN_VEXT_VV(vdiv_vv_b, 1)
1749 GEN_VEXT_VV(vdiv_vv_h, 2)
1750 GEN_VEXT_VV(vdiv_vv_w, 4)
1751 GEN_VEXT_VV(vdiv_vv_d, 8)
1752 GEN_VEXT_VV(vremu_vv_b, 1)
1753 GEN_VEXT_VV(vremu_vv_h, 2)
1754 GEN_VEXT_VV(vremu_vv_w, 4)
1755 GEN_VEXT_VV(vremu_vv_d, 8)
1756 GEN_VEXT_VV(vrem_vv_b, 1)
1757 GEN_VEXT_VV(vrem_vv_h, 2)
1758 GEN_VEXT_VV(vrem_vv_w, 4)
1759 GEN_VEXT_VV(vrem_vv_d, 8)
1760 
1761 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1762 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1763 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1764 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1765 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1766 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1767 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1768 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1769 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1770 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1771 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1772 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1773 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1774 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1775 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1776 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1777 GEN_VEXT_VX(vdivu_vx_b, 1)
1778 GEN_VEXT_VX(vdivu_vx_h, 2)
1779 GEN_VEXT_VX(vdivu_vx_w, 4)
1780 GEN_VEXT_VX(vdivu_vx_d, 8)
1781 GEN_VEXT_VX(vdiv_vx_b, 1)
1782 GEN_VEXT_VX(vdiv_vx_h, 2)
1783 GEN_VEXT_VX(vdiv_vx_w, 4)
1784 GEN_VEXT_VX(vdiv_vx_d, 8)
1785 GEN_VEXT_VX(vremu_vx_b, 1)
1786 GEN_VEXT_VX(vremu_vx_h, 2)
1787 GEN_VEXT_VX(vremu_vx_w, 4)
1788 GEN_VEXT_VX(vremu_vx_d, 8)
1789 GEN_VEXT_VX(vrem_vx_b, 1)
1790 GEN_VEXT_VX(vrem_vx_h, 2)
1791 GEN_VEXT_VX(vrem_vx_w, 4)
1792 GEN_VEXT_VX(vrem_vx_d, 8)
1793 
1794 /* Vector Widening Integer Multiply Instructions */
1795 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1796 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1797 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1798 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1799 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1800 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1801 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1802 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1803 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1804 GEN_VEXT_VV(vwmul_vv_b, 2)
1805 GEN_VEXT_VV(vwmul_vv_h, 4)
1806 GEN_VEXT_VV(vwmul_vv_w, 8)
1807 GEN_VEXT_VV(vwmulu_vv_b, 2)
1808 GEN_VEXT_VV(vwmulu_vv_h, 4)
1809 GEN_VEXT_VV(vwmulu_vv_w, 8)
1810 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1811 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1812 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1813 
1814 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1815 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1816 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1817 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1818 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1819 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1820 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1821 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1822 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1823 GEN_VEXT_VX(vwmul_vx_b, 2)
1824 GEN_VEXT_VX(vwmul_vx_h, 4)
1825 GEN_VEXT_VX(vwmul_vx_w, 8)
1826 GEN_VEXT_VX(vwmulu_vx_b, 2)
1827 GEN_VEXT_VX(vwmulu_vx_h, 4)
1828 GEN_VEXT_VX(vwmulu_vx_w, 8)
1829 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1830 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1831 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1832 
1833 /* Vector Single-Width Integer Multiply-Add Instructions */
1834 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1835 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1836 {                                                                  \
1837     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1838     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1839     TD d = *((TD *)vd + HD(i));                                    \
1840     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1841 }
1842 
1843 #define DO_MACC(N, M, D) (M * N + D)
1844 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1845 #define DO_MADD(N, M, D) (M * D + N)
1846 #define DO_NMSUB(N, M, D) (-(M * D) + N)
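/*
 * With the OP(s2, s1, d) argument order used below, the multiply-add forms
 * work out to the "vd is accumulator" vs. "vd is multiplicand" split:
 *     vmacc:  vd[i] =  (vs1[i] * vs2[i]) + vd[i]
 *     vnmsac: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
 *     vmadd:  vd[i] =  (vs1[i] * vd[i])  + vs2[i]
 *     vnmsub: vd[i] = -(vs1[i] * vd[i])  + vs2[i]
 */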
1847 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1848 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1849 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1850 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1851 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1852 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1853 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1854 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1855 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1856 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1857 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1858 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1859 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1860 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1861 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1862 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1863 GEN_VEXT_VV(vmacc_vv_b, 1)
1864 GEN_VEXT_VV(vmacc_vv_h, 2)
1865 GEN_VEXT_VV(vmacc_vv_w, 4)
1866 GEN_VEXT_VV(vmacc_vv_d, 8)
1867 GEN_VEXT_VV(vnmsac_vv_b, 1)
1868 GEN_VEXT_VV(vnmsac_vv_h, 2)
1869 GEN_VEXT_VV(vnmsac_vv_w, 4)
1870 GEN_VEXT_VV(vnmsac_vv_d, 8)
1871 GEN_VEXT_VV(vmadd_vv_b, 1)
1872 GEN_VEXT_VV(vmadd_vv_h, 2)
1873 GEN_VEXT_VV(vmadd_vv_w, 4)
1874 GEN_VEXT_VV(vmadd_vv_d, 8)
1875 GEN_VEXT_VV(vnmsub_vv_b, 1)
1876 GEN_VEXT_VV(vnmsub_vv_h, 2)
1877 GEN_VEXT_VV(vnmsub_vv_w, 4)
1878 GEN_VEXT_VV(vnmsub_vv_d, 8)
1879 
1880 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1881 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1882 {                                                                   \
1883     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1884     TD d = *((TD *)vd + HD(i));                                     \
1885     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1886 }
1887 
1888 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1889 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1890 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1891 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1892 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1893 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1894 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1895 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1896 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1897 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1898 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1899 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1900 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1901 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1902 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1903 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1904 GEN_VEXT_VX(vmacc_vx_b, 1)
1905 GEN_VEXT_VX(vmacc_vx_h, 2)
1906 GEN_VEXT_VX(vmacc_vx_w, 4)
1907 GEN_VEXT_VX(vmacc_vx_d, 8)
1908 GEN_VEXT_VX(vnmsac_vx_b, 1)
1909 GEN_VEXT_VX(vnmsac_vx_h, 2)
1910 GEN_VEXT_VX(vnmsac_vx_w, 4)
1911 GEN_VEXT_VX(vnmsac_vx_d, 8)
1912 GEN_VEXT_VX(vmadd_vx_b, 1)
1913 GEN_VEXT_VX(vmadd_vx_h, 2)
1914 GEN_VEXT_VX(vmadd_vx_w, 4)
1915 GEN_VEXT_VX(vmadd_vx_d, 8)
1916 GEN_VEXT_VX(vnmsub_vx_b, 1)
1917 GEN_VEXT_VX(vnmsub_vx_h, 2)
1918 GEN_VEXT_VX(vnmsub_vx_w, 4)
1919 GEN_VEXT_VX(vnmsub_vx_d, 8)
1920 
1921 /* Vector Widening Integer Multiply-Add Instructions */
1922 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1923 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1924 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1925 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1926 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1927 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1928 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1929 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1930 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1931 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1932 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1933 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1934 GEN_VEXT_VV(vwmacc_vv_b, 2)
1935 GEN_VEXT_VV(vwmacc_vv_h, 4)
1936 GEN_VEXT_VV(vwmacc_vv_w, 8)
1937 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1938 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1939 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1940 
1941 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1942 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1943 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1944 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1945 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1946 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1947 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1948 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1949 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1950 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1951 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1952 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1953 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1954 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1955 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1956 GEN_VEXT_VX(vwmacc_vx_b, 2)
1957 GEN_VEXT_VX(vwmacc_vx_h, 4)
1958 GEN_VEXT_VX(vwmacc_vx_w, 8)
1959 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1960 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1961 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1962 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1963 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1964 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1965 
1966 /* Vector Integer Merge and Move Instructions */
1967 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1968 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1969                   uint32_t desc)                                     \
1970 {                                                                    \
1971     uint32_t vl = env->vl;                                           \
1972     uint32_t i;                                                      \
1973                                                                      \
1974     for (i = env->vstart; i < vl; i++) {                             \
1975         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1976         *((ETYPE *)vd + H(i)) = s1;                                  \
1977     }                                                                \
1978     env->vstart = 0;                                                 \
1979 }
1980 
1981 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1982 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1983 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1984 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1985 
1986 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1987 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1988                   uint32_t desc)                                     \
1989 {                                                                    \
1990     uint32_t vl = env->vl;                                           \
1991     uint32_t i;                                                      \
1992                                                                      \
1993     for (i = env->vstart; i < vl; i++) {                             \
1994         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1995     }                                                                \
1996     env->vstart = 0;                                                 \
1997 }
1998 
1999 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2000 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2001 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2002 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2003 
2004 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2005 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2006                   CPURISCVState *env, uint32_t desc)                 \
2007 {                                                                    \
2008     uint32_t vl = env->vl;                                           \
2009     uint32_t i;                                                      \
2010                                                                      \
2011     for (i = env->vstart; i < vl; i++) {                             \
2012         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2013         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2014     }                                                                \
2015     env->vstart = 0;                                                 \
2016 }
2017 
2018 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2019 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2020 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2021 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2022 
2023 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2024 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2025                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2026 {                                                                    \
2027     uint32_t vl = env->vl;                                           \
2028     uint32_t i;                                                      \
2029                                                                      \
2030     for (i = env->vstart; i < vl; i++) {                             \
2031         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2032         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2033                    (ETYPE)(target_long)s1);                          \
2034         *((ETYPE *)vd + H(i)) = d;                                   \
2035     }                                                                \
2036     env->vstart = 0;                                                 \
2037 }
2038 
2039 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2040 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2041 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2042 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
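/*
 * Note that for merge the mask register v0 acts as a selector rather than
 * an execution mask: element i of vd receives vs1[i] (or rs1) where the
 * mask bit is set and vs2[i] where it is clear, so no element is skipped.
 */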
2043 
2044 /*
2045  *** Vector Fixed-Point Arithmetic Instructions
2046  */
2047 
2048 /* Vector Single-Width Saturating Add and Subtract */
2049 
2050 /*
2051  * As fixed-point instructions generally involve a rounding mode and
2052  * saturation, define common macros for fixed point here.
2053  */
2054 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2055                           CPURISCVState *env, int vxrm);
2056 
2057 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2058 static inline void                                                  \
2059 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2060           CPURISCVState *env, int vxrm)                             \
2061 {                                                                   \
2062     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2063     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2064     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2065 }
2066 
2067 static inline void
2068 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2069              CPURISCVState *env,
2070              uint32_t vl, uint32_t vm, int vxrm,
2071              opivv2_rm_fn *fn)
2072 {
2073     for (uint32_t i = env->vstart; i < vl; i++) {
2074         if (!vm && !vext_elem_mask(v0, i)) {
2075             continue;
2076         }
2077         fn(vd, vs1, vs2, i, env, vxrm);
2078     }
2079     env->vstart = 0;
2080 }
2081 
2082 static inline void
2083 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2084              CPURISCVState *env,
2085              uint32_t desc,
2086              opivv2_rm_fn *fn)
2087 {
2088     uint32_t vm = vext_vm(desc);
2089     uint32_t vl = env->vl;
2090 
2091     switch (env->vxrm) {
2092     case 0: /* rnu */
2093         vext_vv_rm_1(vd, v0, vs1, vs2,
2094                      env, vl, vm, 0, fn);
2095         break;
2096     case 1: /* rne */
2097         vext_vv_rm_1(vd, v0, vs1, vs2,
2098                      env, vl, vm, 1, fn);
2099         break;
2100     case 2: /* rdn */
2101         vext_vv_rm_1(vd, v0, vs1, vs2,
2102                      env, vl, vm, 2, fn);
2103         break;
2104     default: /* rod */
2105         vext_vv_rm_1(vd, v0, vs1, vs2,
2106                      env, vl, vm, 3, fn);
2107         break;
2108     }
2109 }
2110 
2111 /* generate helpers for fixed point instructions with OPIVV format */
2112 #define GEN_VEXT_VV_RM(NAME)                                    \
2113 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2114                   CPURISCVState *env, uint32_t desc)            \
2115 {                                                               \
2116     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2117                  do_##NAME);                                    \
2118 }
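/*
 * The split into vext_vv_rm_1() and vext_vv_rm_2() means each call in the
 * switch passes vxrm as a constant, which presumably lets the compiler
 * fold the rounding-mode checks in get_round() once the per-element
 * function is inlined; functionally it is equivalent to passing env->vxrm
 * straight through.
 */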
2119 
2120 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2121 {
2122     uint8_t res = a + b;
2123     if (res < a) {
2124         res = UINT8_MAX;
2125         env->vxsat = 0x1;
2126     }
2127     return res;
2128 }
2129 
2130 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2131                                uint16_t b)
2132 {
2133     uint16_t res = a + b;
2134     if (res < a) {
2135         res = UINT16_MAX;
2136         env->vxsat = 0x1;
2137     }
2138     return res;
2139 }
2140 
2141 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2142                                uint32_t b)
2143 {
2144     uint32_t res = a + b;
2145     if (res < a) {
2146         res = UINT32_MAX;
2147         env->vxsat = 0x1;
2148     }
2149     return res;
2150 }
2151 
2152 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2153                                uint64_t b)
2154 {
2155     uint64_t res = a + b;
2156     if (res < a) {
2157         res = UINT64_MAX;
2158         env->vxsat = 0x1;
2159     }
2160     return res;
2161 }
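/*
 * The res < a test detects the wrap: if a + b overflows, the truncated
 * result is a + b - 2**N, which is necessarily smaller than a.  For
 * example, saddu8() with a = 200, b = 100 computes 300 & 0xff = 44 < 200,
 * so the result saturates to UINT8_MAX and vxsat is set.
 */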
2162 
2163 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2164 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2165 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2166 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2167 GEN_VEXT_VV_RM(vsaddu_vv_b)
2168 GEN_VEXT_VV_RM(vsaddu_vv_h)
2169 GEN_VEXT_VV_RM(vsaddu_vv_w)
2170 GEN_VEXT_VV_RM(vsaddu_vv_d)
2171 
2172 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2173                           CPURISCVState *env, int vxrm);
2174 
2175 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2176 static inline void                                                  \
2177 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2178           CPURISCVState *env, int vxrm)                             \
2179 {                                                                   \
2180     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2181     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2182 }
2183 
2184 static inline void
2185 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2186              CPURISCVState *env,
2187              uint32_t vl, uint32_t vm, int vxrm,
2188              opivx2_rm_fn *fn)
2189 {
2190     for (uint32_t i = env->vstart; i < vl; i++) {
2191         if (!vm && !vext_elem_mask(v0, i)) {
2192             continue;
2193         }
2194         fn(vd, s1, vs2, i, env, vxrm);
2195     }
2196     env->vstart = 0;
2197 }
2198 
2199 static inline void
2200 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2201              CPURISCVState *env,
2202              uint32_t desc,
2203              opivx2_rm_fn *fn)
2204 {
2205     uint32_t vm = vext_vm(desc);
2206     uint32_t vl = env->vl;
2207 
2208     switch (env->vxrm) {
2209     case 0: /* rnu */
2210         vext_vx_rm_1(vd, v0, s1, vs2,
2211                      env, vl, vm, 0, fn);
2212         break;
2213     case 1: /* rne */
2214         vext_vx_rm_1(vd, v0, s1, vs2,
2215                      env, vl, vm, 1, fn);
2216         break;
2217     case 2: /* rdn */
2218         vext_vx_rm_1(vd, v0, s1, vs2,
2219                      env, vl, vm, 2, fn);
2220         break;
2221     default: /* rod */
2222         vext_vx_rm_1(vd, v0, s1, vs2,
2223                      env, vl, vm, 3, fn);
2224         break;
2225     }
2226 }
2227 
2228 /* generate helpers for fixed point instructions with OPIVX format */
2229 #define GEN_VEXT_VX_RM(NAME)                              \
2230 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2231         void *vs2, CPURISCVState *env, uint32_t desc)     \
2232 {                                                         \
2233     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2234                  do_##NAME);                              \
2235 }
2236 
2237 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2238 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2239 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2240 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2241 GEN_VEXT_VX_RM(vsaddu_vx_b)
2242 GEN_VEXT_VX_RM(vsaddu_vx_h)
2243 GEN_VEXT_VX_RM(vsaddu_vx_w)
2244 GEN_VEXT_VX_RM(vsaddu_vx_d)
2245 
2246 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2247 {
2248     int8_t res = a + b;
2249     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2250         res = a > 0 ? INT8_MAX : INT8_MIN;
2251         env->vxsat = 0x1;
2252     }
2253     return res;
2254 }
2255 
2256 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2257 {
2258     int16_t res = a + b;
2259     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2260         res = a > 0 ? INT16_MAX : INT16_MIN;
2261         env->vxsat = 0x1;
2262     }
2263     return res;
2264 }
2265 
2266 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2267 {
2268     int32_t res = a + b;
2269     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2270         res = a > 0 ? INT32_MAX : INT32_MIN;
2271         env->vxsat = 0x1;
2272     }
2273     return res;
2274 }
2275 
2276 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2277 {
2278     int64_t res = a + b;
2279     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2280         res = a > 0 ? INT64_MAX : INT64_MIN;
2281         env->vxsat = 0x1;
2282     }
2283     return res;
2284 }
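/*
 * Signed addition overflows only when a and b have the same sign and res
 * has the opposite sign, which is exactly when the sign bit of
 * (res ^ a) & (res ^ b) is set.  For example, sadd8() with a = b = 100
 * wraps to res = -56, both XOR terms have their sign bits set, and the
 * result saturates to INT8_MAX.
 */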
2285 
2286 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2287 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2288 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2289 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2290 GEN_VEXT_VV_RM(vsadd_vv_b)
2291 GEN_VEXT_VV_RM(vsadd_vv_h)
2292 GEN_VEXT_VV_RM(vsadd_vv_w)
2293 GEN_VEXT_VV_RM(vsadd_vv_d)
2294 
2295 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2296 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2297 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2298 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2299 GEN_VEXT_VX_RM(vsadd_vx_b)
2300 GEN_VEXT_VX_RM(vsadd_vx_h)
2301 GEN_VEXT_VX_RM(vsadd_vx_w)
2302 GEN_VEXT_VX_RM(vsadd_vx_d)
2303 
2304 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2305 {
2306     uint8_t res = a - b;
2307     if (res > a) {
2308         res = 0;
2309         env->vxsat = 0x1;
2310     }
2311     return res;
2312 }
2313 
2314 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2315                                uint16_t b)
2316 {
2317     uint16_t res = a - b;
2318     if (res > a) {
2319         res = 0;
2320         env->vxsat = 0x1;
2321     }
2322     return res;
2323 }
2324 
2325 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2326                                uint32_t b)
2327 {
2328     uint32_t res = a - b;
2329     if (res > a) {
2330         res = 0;
2331         env->vxsat = 0x1;
2332     }
2333     return res;
2334 }
2335 
2336 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2337                                uint64_t b)
2338 {
2339     uint64_t res = a - b;
2340     if (res > a) {
2341         res = 0;
2342         env->vxsat = 0x1;
2343     }
2344     return res;
2345 }
2346 
2347 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2348 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2349 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2350 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2351 GEN_VEXT_VV_RM(vssubu_vv_b)
2352 GEN_VEXT_VV_RM(vssubu_vv_h)
2353 GEN_VEXT_VV_RM(vssubu_vv_w)
2354 GEN_VEXT_VV_RM(vssubu_vv_d)
2355 
2356 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2357 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2358 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2359 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2360 GEN_VEXT_VX_RM(vssubu_vx_b)
2361 GEN_VEXT_VX_RM(vssubu_vx_h)
2362 GEN_VEXT_VX_RM(vssubu_vx_w)
2363 GEN_VEXT_VX_RM(vssubu_vx_d)
2364 
2365 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2366 {
2367     int8_t res = a - b;
2368     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2369         res = a >= 0 ? INT8_MAX : INT8_MIN;
2370         env->vxsat = 0x1;
2371     }
2372     return res;
2373 }
2374 
2375 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2376 {
2377     int16_t res = a - b;
2378     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2379         res = a >= 0 ? INT16_MAX : INT16_MIN;
2380         env->vxsat = 0x1;
2381     }
2382     return res;
2383 }
2384 
2385 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2386 {
2387     int32_t res = a - b;
2388     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2389         res = a >= 0 ? INT32_MAX : INT32_MIN;
2390         env->vxsat = 0x1;
2391     }
2392     return res;
2393 }
2394 
2395 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2396 {
2397     int64_t res = a - b;
2398     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2399         res = a >= 0 ? INT64_MAX : INT64_MIN;
2400         env->vxsat = 0x1;
2401     }
2402     return res;
2403 }
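/*
 * For subtraction, overflow requires a and b to have different signs and
 * res to take the sign of b, hence the (res ^ a) & (a ^ b) test.  For
 * example, ssub8() with a = -100, b = 100 wraps to res = 56 and saturates
 * to INT8_MIN.
 */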
2404 
2405 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2406 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2407 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2408 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2409 GEN_VEXT_VV_RM(vssub_vv_b)
2410 GEN_VEXT_VV_RM(vssub_vv_h)
2411 GEN_VEXT_VV_RM(vssub_vv_w)
2412 GEN_VEXT_VV_RM(vssub_vv_d)
2413 
2414 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2415 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2416 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2417 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2418 GEN_VEXT_VX_RM(vssub_vx_b)
2419 GEN_VEXT_VX_RM(vssub_vx_h)
2420 GEN_VEXT_VX_RM(vssub_vx_w)
2421 GEN_VEXT_VX_RM(vssub_vx_d)
2422 
2423 /* Vector Single-Width Averaging Add and Subtract */
2424 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2425 {
2426     uint8_t d = extract64(v, shift, 1);
2427     uint8_t d1;
2428     uint64_t D1, D2;
2429 
2430     if (shift == 0 || shift > 64) {
2431         return 0;
2432     }
2433 
2434     d1 = extract64(v, shift - 1, 1);
2435     D1 = extract64(v, 0, shift);
2436     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2437         return d1;
2438     } else if (vxrm == 1) { /* round-to-nearest-even */
2439         if (shift > 1) {
2440             D2 = extract64(v, 0, shift - 1);
2441             return d1 & ((D2 != 0) | d);
2442         } else {
2443             return d1 & d;
2444         }
2445     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2446         return !d & (D1 != 0);
2447     }
2448     return 0; /* round-down (truncate) */
2449 }
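/*
 * For example, with v = 0b1011 (11) and shift = 2 the truncated result is
 * 2 and the discarded bits are 0b11 (0.75):
 *     rnu: round = d1 = 1            -> 3
 *     rne: round = 1 (0.75 > 0.5)    -> 3
 *     rdn: round = 0                 -> 2
 *     rod: round = 1 (force odd LSB) -> 3
 * A tie such as v = 0b010, shift = 2 (0.5) rounds to 0 under rne, because
 * the truncated result is already even (d = 0).
 */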
2450 
2451 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2452 {
2453     int64_t res = (int64_t)a + b;
2454     uint8_t round = get_round(vxrm, res, 1);
2455 
2456     return (res >> 1) + round;
2457 }
2458 
2459 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2460 {
2461     int64_t res = a + b;
2462     uint8_t round = get_round(vxrm, res, 1);
2463     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2464 
2465     /* With signed overflow, bit 64 is inverse of bit 63. */
2466     return ((res >> 1) ^ over) + round;
2467 }
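/*
 * The sum of two int64_t values needs 65 bits; over is non-zero exactly
 * when the 64-bit addition overflowed, and XOR-ing it into (res >> 1)
 * restores the correct top bit of the average.  For example, with
 * a = b = INT64_MAX, res wraps to -2, res >> 1 is -1, and the XOR with
 * INT64_MIN yields the exact average INT64_MAX.
 */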
2468 
2469 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2470 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2471 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2472 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2473 GEN_VEXT_VV_RM(vaadd_vv_b)
2474 GEN_VEXT_VV_RM(vaadd_vv_h)
2475 GEN_VEXT_VV_RM(vaadd_vv_w)
2476 GEN_VEXT_VV_RM(vaadd_vv_d)
2477 
2478 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2479 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2480 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2481 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2482 GEN_VEXT_VX_RM(vaadd_vx_b)
2483 GEN_VEXT_VX_RM(vaadd_vx_h)
2484 GEN_VEXT_VX_RM(vaadd_vx_w)
2485 GEN_VEXT_VX_RM(vaadd_vx_d)
2486 
2487 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2488                                uint32_t a, uint32_t b)
2489 {
2490     uint64_t res = (uint64_t)a + b;
2491     uint8_t round = get_round(vxrm, res, 1);
2492 
2493     return (res >> 1) + round;
2494 }
2495 
2496 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2497                                uint64_t a, uint64_t b)
2498 {
2499     uint64_t res = a + b;
2500     uint8_t round = get_round(vxrm, res, 1);
2501     uint64_t over = (uint64_t)(res < a) << 63;
2502 
2503     return ((res >> 1) | over) + round;
2504 }
2505 
2506 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2507 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2508 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2509 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2510 GEN_VEXT_VV_RM(vaaddu_vv_b)
2511 GEN_VEXT_VV_RM(vaaddu_vv_h)
2512 GEN_VEXT_VV_RM(vaaddu_vv_w)
2513 GEN_VEXT_VV_RM(vaaddu_vv_d)
2514 
2515 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2516 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2517 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2518 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2519 GEN_VEXT_VX_RM(vaaddu_vx_b)
2520 GEN_VEXT_VX_RM(vaaddu_vx_h)
2521 GEN_VEXT_VX_RM(vaaddu_vx_w)
2522 GEN_VEXT_VX_RM(vaaddu_vx_d)
2523 
2524 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2525 {
2526     int64_t res = (int64_t)a - b;
2527     uint8_t round = get_round(vxrm, res, 1);
2528 
2529     return (res >> 1) + round;
2530 }
2531 
2532 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2533 {
2534     int64_t res = (int64_t)a - b;
2535     uint8_t round = get_round(vxrm, res, 1);
2536     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2537 
2538     /* With signed overflow, bit 64 is inverse of bit 63. */
2539     return ((res >> 1) ^ over) + round;
2540 }
2541 
2542 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2543 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2544 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2545 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2546 GEN_VEXT_VV_RM(vasub_vv_b)
2547 GEN_VEXT_VV_RM(vasub_vv_h)
2548 GEN_VEXT_VV_RM(vasub_vv_w)
2549 GEN_VEXT_VV_RM(vasub_vv_d)
2550 
2551 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2552 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2553 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2554 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2555 GEN_VEXT_VX_RM(vasub_vx_b)
2556 GEN_VEXT_VX_RM(vasub_vx_h)
2557 GEN_VEXT_VX_RM(vasub_vx_w)
2558 GEN_VEXT_VX_RM(vasub_vx_d)
2559 
2560 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2561                                uint32_t a, uint32_t b)
2562 {
2563     int64_t res = (int64_t)a - b;
2564     uint8_t round = get_round(vxrm, res, 1);
2565 
2566     return (res >> 1) + round;
2567 }
2568 
2569 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2570                                uint64_t a, uint64_t b)
2571 {
2572     uint64_t res = (uint64_t)a - b;
2573     uint8_t round = get_round(vxrm, res, 1);
2574     uint64_t over = (uint64_t)(res > a) << 63;
2575 
2576     return ((res >> 1) | over) + round;
2577 }
2578 
2579 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2580 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2581 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2582 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2583 GEN_VEXT_VV_RM(vasubu_vv_b)
2584 GEN_VEXT_VV_RM(vasubu_vv_h)
2585 GEN_VEXT_VV_RM(vasubu_vv_w)
2586 GEN_VEXT_VV_RM(vasubu_vv_d)
2587 
2588 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2589 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2590 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2591 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2592 GEN_VEXT_VX_RM(vasubu_vx_b)
2593 GEN_VEXT_VX_RM(vasubu_vx_h)
2594 GEN_VEXT_VX_RM(vasubu_vx_w)
2595 GEN_VEXT_VX_RM(vasubu_vx_d)
2596 
2597 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2598 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2599 {
2600     uint8_t round;
2601     int16_t res;
2602 
2603     res = (int16_t)a * (int16_t)b;
2604     round = get_round(vxrm, res, 7);
2605     res   = (res >> 7) + round;
2606 
2607     if (res > INT8_MAX) {
2608         env->vxsat = 0x1;
2609         return INT8_MAX;
2610     } else if (res < INT8_MIN) {
2611         env->vxsat = 0x1;
2612         return INT8_MIN;
2613     } else {
2614         return res;
2615     }
2616 }
2617 
2618 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2619 {
2620     uint8_t round;
2621     int32_t res;
2622 
2623     res = (int32_t)a * (int32_t)b;
2624     round = get_round(vxrm, res, 15);
2625     res   = (res >> 15) + round;
2626 
2627     if (res > INT16_MAX) {
2628         env->vxsat = 0x1;
2629         return INT16_MAX;
2630     } else if (res < INT16_MIN) {
2631         env->vxsat = 0x1;
2632         return INT16_MIN;
2633     } else {
2634         return res;
2635     }
2636 }
2637 
2638 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2639 {
2640     uint8_t round;
2641     int64_t res;
2642 
2643     res = (int64_t)a * (int64_t)b;
2644     round = get_round(vxrm, res, 31);
2645     res   = (res >> 31) + round;
2646 
2647     if (res > INT32_MAX) {
2648         env->vxsat = 0x1;
2649         return INT32_MAX;
2650     } else if (res < INT32_MIN) {
2651         env->vxsat = 0x1;
2652         return INT32_MIN;
2653     } else {
2654         return res;
2655     }
2656 }
2657 
2658 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2659 {
2660     uint8_t round;
2661     uint64_t hi_64, lo_64;
2662     int64_t res;
2663 
2664     if (a == INT64_MIN && b == INT64_MIN) {
2665         env->vxsat = 1;
2666         return INT64_MAX;
2667     }
2668 
2669     muls64(&lo_64, &hi_64, a, b);
2670     round = get_round(vxrm, lo_64, 63);
2671     /*
2672      * Cannot overflow, as there are always
2673      * 2 sign bits after multiply.
2674      */
2675     res = (hi_64 << 1) | (lo_64 >> 63);
2676     if (round) {
2677         if (res == INT64_MAX) {
2678             env->vxsat = 1;
2679         } else {
2680             res += 1;
2681         }
2682     }
2683     return res;
2684 }
2685 
2686 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2687 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2688 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2689 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2690 GEN_VEXT_VV_RM(vsmul_vv_b)
2691 GEN_VEXT_VV_RM(vsmul_vv_h)
2692 GEN_VEXT_VV_RM(vsmul_vv_w)
2693 GEN_VEXT_VV_RM(vsmul_vv_d)
2694 
2695 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2696 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2697 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2698 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2699 GEN_VEXT_VX_RM(vsmul_vx_b)
2700 GEN_VEXT_VX_RM(vsmul_vx_h)
2701 GEN_VEXT_VX_RM(vsmul_vx_w)
2702 GEN_VEXT_VX_RM(vsmul_vx_d)
2703 
2704 /* Vector Single-Width Scaling Shift Instructions */
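/*
 * The scaling shifts take the shift amount from the low log2(SEW)
 * bits of the second operand, shift the first operand right
 * (logically for vssrl, arithmetically for vssra) and add the
 * rounding increment selected by vxrm; the result always fits in
 * SEW bits, so no saturation is needed here.
 */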
2705 static inline uint8_t
2706 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2707 {
2708     uint8_t round, shift = b & 0x7;
2709     uint8_t res;
2710 
2711     round = get_round(vxrm, a, shift);
2712     res   = (a >> shift)  + round;
2713     return res;
2714 }
2715 static inline uint16_t
2716 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2717 {
2718     uint8_t round, shift = b & 0xf;
2719     uint16_t res;
2720 
2721     round = get_round(vxrm, a, shift);
2722     res   = (a >> shift)  + round;
2723     return res;
2724 }
2725 static inline uint32_t
2726 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2727 {
2728     uint8_t round, shift = b & 0x1f;
2729     uint32_t res;
2730 
2731     round = get_round(vxrm, a, shift);
2732     res   = (a >> shift)  + round;
2733     return res;
2734 }
2735 static inline uint64_t
2736 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2737 {
2738     uint8_t round, shift = b & 0x3f;
2739     uint64_t res;
2740 
2741     round = get_round(vxrm, a, shift);
2742     res   = (a >> shift)  + round;
2743     return res;
2744 }
2745 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2746 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2747 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2748 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2749 GEN_VEXT_VV_RM(vssrl_vv_b)
2750 GEN_VEXT_VV_RM(vssrl_vv_h)
2751 GEN_VEXT_VV_RM(vssrl_vv_w)
2752 GEN_VEXT_VV_RM(vssrl_vv_d)
2753 
2754 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2755 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2756 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2757 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2758 GEN_VEXT_VX_RM(vssrl_vx_b)
2759 GEN_VEXT_VX_RM(vssrl_vx_h)
2760 GEN_VEXT_VX_RM(vssrl_vx_w)
2761 GEN_VEXT_VX_RM(vssrl_vx_d)
2762 
2763 static inline int8_t
2764 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2765 {
2766     uint8_t round, shift = b & 0x7;
2767     int8_t res;
2768 
2769     round = get_round(vxrm, a, shift);
2770     res   = (a >> shift)  + round;
2771     return res;
2772 }
2773 static inline int16_t
2774 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2775 {
2776     uint8_t round, shift = b & 0xf;
2777     int16_t res;
2778 
2779     round = get_round(vxrm, a, shift);
2780     res   = (a >> shift)  + round;
2781     return res;
2782 }
2783 static inline int32_t
2784 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2785 {
2786     uint8_t round, shift = b & 0x1f;
2787     int32_t res;
2788 
2789     round = get_round(vxrm, a, shift);
2790     res   = (a >> shift)  + round;
2791     return res;
2792 }
2793 static inline int64_t
2794 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2795 {
2796     uint8_t round, shift = b & 0x3f;
2797     int64_t res;
2798 
2799     round = get_round(vxrm, a, shift);
2800     res   = (a >> shift)  + round;
2801     return res;
2802 }
2803 
2804 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2805 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2806 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2807 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2808 GEN_VEXT_VV_RM(vssra_vv_b)
2809 GEN_VEXT_VV_RM(vssra_vv_h)
2810 GEN_VEXT_VV_RM(vssra_vv_w)
2811 GEN_VEXT_VV_RM(vssra_vv_d)
2812 
2813 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2814 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2815 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2816 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2817 GEN_VEXT_VX_RM(vssra_vx_b)
2818 GEN_VEXT_VX_RM(vssra_vx_h)
2819 GEN_VEXT_VX_RM(vssra_vx_w)
2820 GEN_VEXT_VX_RM(vssra_vx_d)
2821 
2822 /* Vector Narrowing Fixed-Point Clip Instructions */
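/*
 * The narrowing clips shift a 2*SEW-wide source element right by the
 * low log2(2*SEW) bits of the shift operand, round according to vxrm,
 * and then saturate to the SEW-wide destination (signed for vnclip,
 * unsigned for vnclipu), setting vxsat when clamping occurs.
 */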
2823 static inline int8_t
2824 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2825 {
2826     uint8_t round, shift = b & 0xf;
2827     int16_t res;
2828 
2829     round = get_round(vxrm, a, shift);
2830     res   = (a >> shift)  + round;
2831     if (res > INT8_MAX) {
2832         env->vxsat = 0x1;
2833         return INT8_MAX;
2834     } else if (res < INT8_MIN) {
2835         env->vxsat = 0x1;
2836         return INT8_MIN;
2837     } else {
2838         return res;
2839     }
2840 }
2841 
2842 static inline int16_t
2843 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2844 {
2845     uint8_t round, shift = b & 0x1f;
2846     int32_t res;
2847 
2848     round = get_round(vxrm, a, shift);
2849     res   = (a >> shift)  + round;
2850     if (res > INT16_MAX) {
2851         env->vxsat = 0x1;
2852         return INT16_MAX;
2853     } else if (res < INT16_MIN) {
2854         env->vxsat = 0x1;
2855         return INT16_MIN;
2856     } else {
2857         return res;
2858     }
2859 }
2860 
2861 static inline int32_t
2862 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2863 {
2864     uint8_t round, shift = b & 0x3f;
2865     int64_t res;
2866 
2867     round = get_round(vxrm, a, shift);
2868     res   = (a >> shift)  + round;
2869     if (res > INT32_MAX) {
2870         env->vxsat = 0x1;
2871         return INT32_MAX;
2872     } else if (res < INT32_MIN) {
2873         env->vxsat = 0x1;
2874         return INT32_MIN;
2875     } else {
2876         return res;
2877     }
2878 }
2879 
2880 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2881 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2882 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2883 GEN_VEXT_VV_RM(vnclip_wv_b)
2884 GEN_VEXT_VV_RM(vnclip_wv_h)
2885 GEN_VEXT_VV_RM(vnclip_wv_w)
2886 
2887 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2888 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2889 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2890 GEN_VEXT_VX_RM(vnclip_wx_b)
2891 GEN_VEXT_VX_RM(vnclip_wx_h)
2892 GEN_VEXT_VX_RM(vnclip_wx_w)
2893 
2894 static inline uint8_t
2895 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2896 {
2897     uint8_t round, shift = b & 0xf;
2898     uint16_t res;
2899 
2900     round = get_round(vxrm, a, shift);
2901     res   = (a >> shift)  + round;
2902     if (res > UINT8_MAX) {
2903         env->vxsat = 0x1;
2904         return UINT8_MAX;
2905     } else {
2906         return res;
2907     }
2908 }
2909 
2910 static inline uint16_t
2911 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2912 {
2913     uint8_t round, shift = b & 0x1f;
2914     uint32_t res;
2915 
2916     round = get_round(vxrm, a, shift);
2917     res   = (a >> shift)  + round;
2918     if (res > UINT16_MAX) {
2919         env->vxsat = 0x1;
2920         return UINT16_MAX;
2921     } else {
2922         return res;
2923     }
2924 }
2925 
2926 static inline uint32_t
2927 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2928 {
2929     uint8_t round, shift = b & 0x3f;
2930     uint64_t res;
2931 
2932     round = get_round(vxrm, a, shift);
2933     res   = (a >> shift)  + round;
2934     if (res > UINT32_MAX) {
2935         env->vxsat = 0x1;
2936         return UINT32_MAX;
2937     } else {
2938         return res;
2939     }
2940 }
2941 
2942 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2943 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2944 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2945 GEN_VEXT_VV_RM(vnclipu_wv_b)
2946 GEN_VEXT_VV_RM(vnclipu_wv_h)
2947 GEN_VEXT_VV_RM(vnclipu_wv_w)
2948 
2949 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2950 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2951 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2952 GEN_VEXT_VX_RM(vnclipu_wx_b)
2953 GEN_VEXT_VX_RM(vnclipu_wx_h)
2954 GEN_VEXT_VX_RM(vnclipu_wx_w)
2955 
2956 /*
2957  *** Vector Floating-Point Arithmetic Instructions
2958  */
2959 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
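/*
 * OPFVV2 defines the per-element operation for a .vv form (note the
 * operand order: the vs2 element is passed first) and GEN_VEXT_VV_ENV
 * emits the helper that walks elements from vstart to vl, skipping
 * inactive elements when vm == 0 and clearing vstart at the end.
 * Floating-point exception flags accumulate in env->fp_status.
 */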
2960 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2961 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2962                       CPURISCVState *env)                      \
2963 {                                                              \
2964     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2965     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2966     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2967 }
2968 
2969 #define GEN_VEXT_VV_ENV(NAME)                             \
2970 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2971                   void *vs2, CPURISCVState *env,          \
2972                   uint32_t desc)                          \
2973 {                                                         \
2974     uint32_t vm = vext_vm(desc);                          \
2975     uint32_t vl = env->vl;                                \
2976     uint32_t i;                                           \
2977                                                           \
2978     for (i = env->vstart; i < vl; i++) {                  \
2979         if (!vm && !vext_elem_mask(v0, i)) {              \
2980             continue;                                     \
2981         }                                                 \
2982         do_##NAME(vd, vs1, vs2, i, env);                  \
2983     }                                                     \
2984     env->vstart = 0;                                      \
2985 }
2986 
2987 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2988 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2989 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2990 GEN_VEXT_VV_ENV(vfadd_vv_h)
2991 GEN_VEXT_VV_ENV(vfadd_vv_w)
2992 GEN_VEXT_VV_ENV(vfadd_vv_d)
2993 
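/*
 * OPFVF2/GEN_VEXT_VF provide the matching .vf forms: s1 carries the
 * scalar f-register value, which the (TX1)(T1) cast narrows to the
 * element type before it is used as the second operand of OP.
 */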
2994 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2995 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2996                       CPURISCVState *env)                      \
2997 {                                                              \
2998     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2999     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3000 }
3001 
3002 #define GEN_VEXT_VF(NAME)                                 \
3003 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3004                   void *vs2, CPURISCVState *env,          \
3005                   uint32_t desc)                          \
3006 {                                                         \
3007     uint32_t vm = vext_vm(desc);                          \
3008     uint32_t vl = env->vl;                                \
3009     uint32_t i;                                           \
3010                                                           \
3011     for (i = env->vstart; i < vl; i++) {                  \
3012         if (!vm && !vext_elem_mask(v0, i)) {              \
3013             continue;                                     \
3014         }                                                 \
3015         do_##NAME(vd, s1, vs2, i, env);                   \
3016     }                                                     \
3017     env->vstart = 0;                                      \
3018 }
3019 
3020 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3021 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3022 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3023 GEN_VEXT_VF(vfadd_vf_h)
3024 GEN_VEXT_VF(vfadd_vf_w)
3025 GEN_VEXT_VF(vfadd_vf_d)
3026 
3027 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3028 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3029 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3030 GEN_VEXT_VV_ENV(vfsub_vv_h)
3031 GEN_VEXT_VV_ENV(vfsub_vv_w)
3032 GEN_VEXT_VV_ENV(vfsub_vv_d)
3033 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3034 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3035 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3036 GEN_VEXT_VF(vfsub_vf_h)
3037 GEN_VEXT_VF(vfsub_vf_w)
3038 GEN_VEXT_VF(vfsub_vf_d)
3039 
3040 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3041 {
3042     return float16_sub(b, a, s);
3043 }
3044 
3045 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3046 {
3047     return float32_sub(b, a, s);
3048 }
3049 
3050 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3051 {
3052     return float64_sub(b, a, s);
3053 }
3054 
3055 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3056 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3057 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3058 GEN_VEXT_VF(vfrsub_vf_h)
3059 GEN_VEXT_VF(vfrsub_vf_w)
3060 GEN_VEXT_VF(vfrsub_vf_d)
3061 
3062 /* Vector Widening Floating-Point Add/Subtract Instructions */
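/*
 * The widening forms convert their SEW-wide inputs to 2*SEW
 * (half precision to single, single to double) before operating; the
 * .wv/.wf variants further below take an already widened first
 * operand and only convert the second.
 */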
3063 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3064 {
3065     return float32_add(float16_to_float32(a, true, s),
3066             float16_to_float32(b, true, s), s);
3067 }
3068 
3069 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3070 {
3071     return float64_add(float32_to_float64(a, s),
3072             float32_to_float64(b, s), s);
3074 }
3075 
3076 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3077 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3078 GEN_VEXT_VV_ENV(vfwadd_vv_h)
3079 GEN_VEXT_VV_ENV(vfwadd_vv_w)
3080 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3081 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3082 GEN_VEXT_VF(vfwadd_vf_h)
3083 GEN_VEXT_VF(vfwadd_vf_w)
3084 
3085 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3086 {
3087     return float32_sub(float16_to_float32(a, true, s),
3088             float16_to_float32(b, true, s), s);
3089 }
3090 
3091 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3092 {
3093     return float64_sub(float32_to_float64(a, s),
3094             float32_to_float64(b, s), s);
3096 }
3097 
3098 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3099 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3100 GEN_VEXT_VV_ENV(vfwsub_vv_h)
3101 GEN_VEXT_VV_ENV(vfwsub_vv_w)
3102 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3103 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3104 GEN_VEXT_VF(vfwsub_vf_h)
3105 GEN_VEXT_VF(vfwsub_vf_w)
3106 
3107 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3108 {
3109     return float32_add(a, float16_to_float32(b, true, s), s);
3110 }
3111 
3112 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3113 {
3114     return float64_add(a, float32_to_float64(b, s), s);
3115 }
3116 
3117 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3118 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3119 GEN_VEXT_VV_ENV(vfwadd_wv_h)
3120 GEN_VEXT_VV_ENV(vfwadd_wv_w)
3121 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3122 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3123 GEN_VEXT_VF(vfwadd_wf_h)
3124 GEN_VEXT_VF(vfwadd_wf_w)
3125 
3126 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3127 {
3128     return float32_sub(a, float16_to_float32(b, true, s), s);
3129 }
3130 
3131 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3132 {
3133     return float64_sub(a, float32_to_float64(b, s), s);
3134 }
3135 
3136 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3137 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3138 GEN_VEXT_VV_ENV(vfwsub_wv_h)
3139 GEN_VEXT_VV_ENV(vfwsub_wv_w)
3140 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3141 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3142 GEN_VEXT_VF(vfwsub_wf_h)
3143 GEN_VEXT_VF(vfwsub_wf_w)
3144 
3145 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3146 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3147 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3148 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3149 GEN_VEXT_VV_ENV(vfmul_vv_h)
3150 GEN_VEXT_VV_ENV(vfmul_vv_w)
3151 GEN_VEXT_VV_ENV(vfmul_vv_d)
3152 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3153 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3154 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3155 GEN_VEXT_VF(vfmul_vf_h)
3156 GEN_VEXT_VF(vfmul_vf_w)
3157 GEN_VEXT_VF(vfmul_vf_d)
3158 
3159 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3160 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3161 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3162 GEN_VEXT_VV_ENV(vfdiv_vv_h)
3163 GEN_VEXT_VV_ENV(vfdiv_vv_w)
3164 GEN_VEXT_VV_ENV(vfdiv_vv_d)
3165 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3166 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3167 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3168 GEN_VEXT_VF(vfdiv_vf_h)
3169 GEN_VEXT_VF(vfdiv_vf_w)
3170 GEN_VEXT_VF(vfdiv_vf_d)
3171 
3172 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3173 {
3174     return float16_div(b, a, s);
3175 }
3176 
3177 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3178 {
3179     return float32_div(b, a, s);
3180 }
3181 
3182 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3183 {
3184     return float64_div(b, a, s);
3185 }
3186 
3187 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3188 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3189 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3190 GEN_VEXT_VF(vfrdiv_vf_h)
3191 GEN_VEXT_VF(vfrdiv_vf_w)
3192 GEN_VEXT_VF(vfrdiv_vf_d)
3193 
3194 /* Vector Widening Floating-Point Multiply */
3195 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3196 {
3197     return float32_mul(float16_to_float32(a, true, s),
3198             float16_to_float32(b, true, s), s);
3199 }
3200 
3201 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3202 {
3203     return float64_mul(float32_to_float64(a, s),
3204             float32_to_float64(b, s), s);
3206 }
3207 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3208 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3209 GEN_VEXT_VV_ENV(vfwmul_vv_h)
3210 GEN_VEXT_VV_ENV(vfwmul_vv_w)
3211 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3212 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3213 GEN_VEXT_VF(vfwmul_vf_h)
3214 GEN_VEXT_VF(vfwmul_vf_w)
3215 
3216 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
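/*
 * The fused multiply-add helpers receive (vs2, vs1, vd).  For the
 * *macc/*msac forms the destination is the addend,
 * vd = +-(vs1 * vs2) +- vd, while for the *madd/*msub forms the
 * destination is a multiplicand, vd = +-(vs1 * vd) +- vs2, which is
 * why e.g. fmadd16() multiplies d by b and adds a.  The negated
 * variants are expressed with float_muladd_negate_product and
 * float_muladd_negate_c.
 */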
3217 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3218 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3219         CPURISCVState *env)                                        \
3220 {                                                                  \
3221     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3222     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3223     TD d = *((TD *)vd + HD(i));                                    \
3224     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3225 }
3226 
3227 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3228 {
3229     return float16_muladd(a, b, d, 0, s);
3230 }
3231 
3232 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3233 {
3234     return float32_muladd(a, b, d, 0, s);
3235 }
3236 
3237 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3238 {
3239     return float64_muladd(a, b, d, 0, s);
3240 }
3241 
3242 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3243 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3244 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3245 GEN_VEXT_VV_ENV(vfmacc_vv_h)
3246 GEN_VEXT_VV_ENV(vfmacc_vv_w)
3247 GEN_VEXT_VV_ENV(vfmacc_vv_d)
3248 
3249 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3250 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3251         CPURISCVState *env)                                       \
3252 {                                                                 \
3253     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3254     TD d = *((TD *)vd + HD(i));                                   \
3255     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3256 }
3257 
3258 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3259 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3260 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3261 GEN_VEXT_VF(vfmacc_vf_h)
3262 GEN_VEXT_VF(vfmacc_vf_w)
3263 GEN_VEXT_VF(vfmacc_vf_d)
3264 
3265 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3266 {
3267     return float16_muladd(a, b, d,
3268             float_muladd_negate_c | float_muladd_negate_product, s);
3269 }
3270 
3271 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3272 {
3273     return float32_muladd(a, b, d,
3274             float_muladd_negate_c | float_muladd_negate_product, s);
3275 }
3276 
3277 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3278 {
3279     return float64_muladd(a, b, d,
3280             float_muladd_negate_c | float_muladd_negate_product, s);
3281 }
3282 
3283 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3284 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3285 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3286 GEN_VEXT_VV_ENV(vfnmacc_vv_h)
3287 GEN_VEXT_VV_ENV(vfnmacc_vv_w)
3288 GEN_VEXT_VV_ENV(vfnmacc_vv_d)
3289 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3290 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3291 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3292 GEN_VEXT_VF(vfnmacc_vf_h)
3293 GEN_VEXT_VF(vfnmacc_vf_w)
3294 GEN_VEXT_VF(vfnmacc_vf_d)
3295 
3296 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3297 {
3298     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3299 }
3300 
3301 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3302 {
3303     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3304 }
3305 
3306 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3307 {
3308     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3309 }
3310 
3311 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3312 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3313 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3314 GEN_VEXT_VV_ENV(vfmsac_vv_h)
3315 GEN_VEXT_VV_ENV(vfmsac_vv_w)
3316 GEN_VEXT_VV_ENV(vfmsac_vv_d)
3317 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3318 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3319 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3320 GEN_VEXT_VF(vfmsac_vf_h)
3321 GEN_VEXT_VF(vfmsac_vf_w)
3322 GEN_VEXT_VF(vfmsac_vf_d)
3323 
3324 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3325 {
3326     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3327 }
3328 
3329 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3330 {
3331     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3332 }
3333 
3334 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3335 {
3336     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3337 }
3338 
3339 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3340 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3341 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3342 GEN_VEXT_VV_ENV(vfnmsac_vv_h)
3343 GEN_VEXT_VV_ENV(vfnmsac_vv_w)
3344 GEN_VEXT_VV_ENV(vfnmsac_vv_d)
3345 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3346 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3347 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3348 GEN_VEXT_VF(vfnmsac_vf_h)
3349 GEN_VEXT_VF(vfnmsac_vf_w)
3350 GEN_VEXT_VF(vfnmsac_vf_d)
3351 
3352 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3353 {
3354     return float16_muladd(d, b, a, 0, s);
3355 }
3356 
3357 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3358 {
3359     return float32_muladd(d, b, a, 0, s);
3360 }
3361 
3362 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3363 {
3364     return float64_muladd(d, b, a, 0, s);
3365 }
3366 
3367 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3368 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3369 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3370 GEN_VEXT_VV_ENV(vfmadd_vv_h)
3371 GEN_VEXT_VV_ENV(vfmadd_vv_w)
3372 GEN_VEXT_VV_ENV(vfmadd_vv_d)
3373 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3374 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3375 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3376 GEN_VEXT_VF(vfmadd_vf_h)
3377 GEN_VEXT_VF(vfmadd_vf_w)
3378 GEN_VEXT_VF(vfmadd_vf_d)
3379 
3380 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3381 {
3382     return float16_muladd(d, b, a,
3383             float_muladd_negate_c | float_muladd_negate_product, s);
3384 }
3385 
3386 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3387 {
3388     return float32_muladd(d, b, a,
3389             float_muladd_negate_c | float_muladd_negate_product, s);
3390 }
3391 
3392 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3393 {
3394     return float64_muladd(d, b, a,
3395             float_muladd_negate_c | float_muladd_negate_product, s);
3396 }
3397 
3398 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3399 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3400 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3401 GEN_VEXT_VV_ENV(vfnmadd_vv_h)
3402 GEN_VEXT_VV_ENV(vfnmadd_vv_w)
3403 GEN_VEXT_VV_ENV(vfnmadd_vv_d)
3404 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3405 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3406 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3407 GEN_VEXT_VF(vfnmadd_vf_h)
3408 GEN_VEXT_VF(vfnmadd_vf_w)
3409 GEN_VEXT_VF(vfnmadd_vf_d)
3410 
3411 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3412 {
3413     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3414 }
3415 
3416 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3417 {
3418     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3419 }
3420 
3421 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3422 {
3423     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3424 }
3425 
3426 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3427 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3428 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3429 GEN_VEXT_VV_ENV(vfmsub_vv_h)
3430 GEN_VEXT_VV_ENV(vfmsub_vv_w)
3431 GEN_VEXT_VV_ENV(vfmsub_vv_d)
3432 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3433 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3434 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3435 GEN_VEXT_VF(vfmsub_vf_h)
3436 GEN_VEXT_VF(vfmsub_vf_w)
3437 GEN_VEXT_VF(vfmsub_vf_d)
3438 
3439 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3440 {
3441     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3442 }
3443 
3444 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3445 {
3446     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3447 }
3448 
3449 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3450 {
3451     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3452 }
3453 
3454 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3455 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3456 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3457 GEN_VEXT_VV_ENV(vfnmsub_vv_h)
3458 GEN_VEXT_VV_ENV(vfnmsub_vv_w)
3459 GEN_VEXT_VV_ENV(vfnmsub_vv_d)
3460 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3461 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3462 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3463 GEN_VEXT_VF(vfnmsub_vf_h)
3464 GEN_VEXT_VF(vfnmsub_vf_w)
3465 GEN_VEXT_VF(vfnmsub_vf_d)
3466 
3467 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
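/*
 * As with the widening add/subtract, the SEW-wide multiplicands are
 * converted to 2*SEW first; the accumulator d is already 2*SEW wide,
 * so only the product operands need widening.
 */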
3468 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3469 {
3470     return float32_muladd(float16_to_float32(a, true, s),
3471                         float16_to_float32(b, true, s), d, 0, s);
3472 }
3473 
3474 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3475 {
3476     return float64_muladd(float32_to_float64(a, s),
3477                         float32_to_float64(b, s), d, 0, s);
3478 }
3479 
3480 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3481 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3482 GEN_VEXT_VV_ENV(vfwmacc_vv_h)
3483 GEN_VEXT_VV_ENV(vfwmacc_vv_w)
3484 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3485 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3486 GEN_VEXT_VF(vfwmacc_vf_h)
3487 GEN_VEXT_VF(vfwmacc_vf_w)
3488 
3489 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3490 {
3491     return float32_muladd(float16_to_float32(a, true, s),
3492                         float16_to_float32(b, true, s), d,
3493                         float_muladd_negate_c | float_muladd_negate_product, s);
3494 }
3495 
3496 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3497 {
3498     return float64_muladd(float32_to_float64(a, s),
3499                         float32_to_float64(b, s), d,
3500                         float_muladd_negate_c | float_muladd_negate_product, s);
3501 }
3502 
3503 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3504 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3505 GEN_VEXT_VV_ENV(vfwnmacc_vv_h)
3506 GEN_VEXT_VV_ENV(vfwnmacc_vv_w)
3507 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3508 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3509 GEN_VEXT_VF(vfwnmacc_vf_h)
3510 GEN_VEXT_VF(vfwnmacc_vf_w)
3511 
3512 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3513 {
3514     return float32_muladd(float16_to_float32(a, true, s),
3515                         float16_to_float32(b, true, s), d,
3516                         float_muladd_negate_c, s);
3517 }
3518 
3519 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3520 {
3521     return float64_muladd(float32_to_float64(a, s),
3522                         float32_to_float64(b, s), d,
3523                         float_muladd_negate_c, s);
3524 }
3525 
3526 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3527 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3528 GEN_VEXT_VV_ENV(vfwmsac_vv_h)
3529 GEN_VEXT_VV_ENV(vfwmsac_vv_w)
3530 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3531 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3532 GEN_VEXT_VF(vfwmsac_vf_h)
3533 GEN_VEXT_VF(vfwmsac_vf_w)
3534 
3535 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3536 {
3537     return float32_muladd(float16_to_float32(a, true, s),
3538                         float16_to_float32(b, true, s), d,
3539                         float_muladd_negate_product, s);
3540 }
3541 
3542 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3543 {
3544     return float64_muladd(float32_to_float64(a, s),
3545                         float32_to_float64(b, s), d,
3546                         float_muladd_negate_product, s);
3547 }
3548 
3549 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3550 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3551 GEN_VEXT_VV_ENV(vfwnmsac_vv_h)
3552 GEN_VEXT_VV_ENV(vfwnmsac_vv_w)
3553 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3554 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3555 GEN_VEXT_VF(vfwnmsac_vf_h)
3556 GEN_VEXT_VF(vfwnmsac_vf_w)
3557 
3558 /* Vector Floating-Point Square-Root Instruction */
3559 /* (TD, T2, TX2) */
3560 #define OP_UU_H uint16_t, uint16_t, uint16_t
3561 #define OP_UU_W uint32_t, uint32_t, uint32_t
3562 #define OP_UU_D uint64_t, uint64_t, uint64_t
3563 
3564 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3565 static void do_##NAME(void *vd, void *vs2, int i,      \
3566         CPURISCVState *env)                            \
3567 {                                                      \
3568     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3569     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3570 }
3571 
3572 #define GEN_VEXT_V_ENV(NAME)                           \
3573 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3574         CPURISCVState *env, uint32_t desc)             \
3575 {                                                      \
3576     uint32_t vm = vext_vm(desc);                       \
3577     uint32_t vl = env->vl;                             \
3578     uint32_t i;                                        \
3579                                                        \
3580     if (vl == 0) {                                     \
3581         return;                                        \
3582     }                                                  \
3583     for (i = env->vstart; i < vl; i++) {               \
3584         if (!vm && !vext_elem_mask(v0, i)) {           \
3585             continue;                                  \
3586         }                                              \
3587         do_##NAME(vd, vs2, i, env);                    \
3588     }                                                  \
3589     env->vstart = 0;                                   \
3590 }
3591 
3592 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3593 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3594 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3595 GEN_VEXT_V_ENV(vfsqrt_v_h)
3596 GEN_VEXT_V_ENV(vfsqrt_v_w)
3597 GEN_VEXT_V_ENV(vfsqrt_v_d)
3598 
3599 /*
3600  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3601  *
3602  * Adapted from riscv-v-spec recip.c:
3603  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3604  */
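/*
 * frsqrt7() produces a 7-bit-accurate estimate of 1/sqrt(f):
 * subnormal inputs are first normalized, then the 128-entry table is
 * indexed by the low exponent bit and the top 6 fraction bits.  The
 * output exponent below evaluates, via the unsigned wrap-around of
 * ~exp, to (3 * bias - 1 - exp) / 2, i.e. roughly
 * bias - (exp - bias) / 2 as expected for a reciprocal square root.
 */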
3605 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3606 {
3607     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3608     uint64_t exp = extract64(f, frac_size, exp_size);
3609     uint64_t frac = extract64(f, 0, frac_size);
3610 
3611     const uint8_t lookup_table[] = {
3612         52, 51, 50, 48, 47, 46, 44, 43,
3613         42, 41, 40, 39, 38, 36, 35, 34,
3614         33, 32, 31, 30, 30, 29, 28, 27,
3615         26, 25, 24, 23, 23, 22, 21, 20,
3616         19, 19, 18, 17, 16, 16, 15, 14,
3617         14, 13, 12, 12, 11, 10, 10, 9,
3618         9, 8, 7, 7, 6, 6, 5, 4,
3619         4, 3, 3, 2, 2, 1, 1, 0,
3620         127, 125, 123, 121, 119, 118, 116, 114,
3621         113, 111, 109, 108, 106, 105, 103, 102,
3622         100, 99, 97, 96, 95, 93, 92, 91,
3623         90, 88, 87, 86, 85, 84, 83, 82,
3624         80, 79, 78, 77, 76, 75, 74, 73,
3625         72, 71, 70, 70, 69, 68, 67, 66,
3626         65, 64, 63, 63, 62, 61, 60, 59,
3627         59, 58, 57, 56, 56, 55, 54, 53
3628     };
3629     const int precision = 7;
3630 
3631     if (exp == 0 && frac != 0) { /* subnormal */
3632         /* Normalize the subnormal. */
3633         while (extract64(frac, frac_size - 1, 1) == 0) {
3634             exp--;
3635             frac <<= 1;
3636         }
3637 
3638         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3639     }
3640 
3641     int idx = ((exp & 1) << (precision - 1)) |
3642                 (frac >> (frac_size - precision + 1));
3643     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3644                             (frac_size - precision);
3645     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3646 
3647     uint64_t val = 0;
3648     val = deposit64(val, 0, frac_size, out_frac);
3649     val = deposit64(val, frac_size, exp_size, out_exp);
3650     val = deposit64(val, frac_size + exp_size, 1, sign);
3651     return val;
3652 }
3653 
3654 static float16 frsqrt7_h(float16 f, float_status *s)
3655 {
3656     int exp_size = 5, frac_size = 10;
3657     bool sign = float16_is_neg(f);
3658 
3659     /*
3660      * frsqrt7(sNaN) = canonical NaN
3661      * frsqrt7(-inf) = canonical NaN
3662      * frsqrt7(-normal) = canonical NaN
3663      * frsqrt7(-subnormal) = canonical NaN
3664      */
3665     if (float16_is_signaling_nan(f, s) ||
3666             (float16_is_infinity(f) && sign) ||
3667             (float16_is_normal(f) && sign) ||
3668             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3669         s->float_exception_flags |= float_flag_invalid;
3670         return float16_default_nan(s);
3671     }
3672 
3673     /* frsqrt7(qNaN) = canonical NaN */
3674     if (float16_is_quiet_nan(f, s)) {
3675         return float16_default_nan(s);
3676     }
3677 
3678     /* frsqrt7(+-0) = +-inf */
3679     if (float16_is_zero(f)) {
3680         s->float_exception_flags |= float_flag_divbyzero;
3681         return float16_set_sign(float16_infinity, sign);
3682     }
3683 
3684     /* frsqrt7(+inf) = +0 */
3685     if (float16_is_infinity(f) && !sign) {
3686         return float16_set_sign(float16_zero, sign);
3687     }
3688 
3689     /* +normal, +subnormal */
3690     uint64_t val = frsqrt7(f, exp_size, frac_size);
3691     return make_float16(val);
3692 }
3693 
3694 static float32 frsqrt7_s(float32 f, float_status *s)
3695 {
3696     int exp_size = 8, frac_size = 23;
3697     bool sign = float32_is_neg(f);
3698 
3699     /*
3700      * frsqrt7(sNaN) = canonical NaN
3701      * frsqrt7(-inf) = canonical NaN
3702      * frsqrt7(-normal) = canonical NaN
3703      * frsqrt7(-subnormal) = canonical NaN
3704      */
3705     if (float32_is_signaling_nan(f, s) ||
3706             (float32_is_infinity(f) && sign) ||
3707             (float32_is_normal(f) && sign) ||
3708             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3709         s->float_exception_flags |= float_flag_invalid;
3710         return float32_default_nan(s);
3711     }
3712 
3713     /* frsqrt7(qNaN) = canonical NaN */
3714     if (float32_is_quiet_nan(f, s)) {
3715         return float32_default_nan(s);
3716     }
3717 
3718     /* frsqrt7(+-0) = +-inf */
3719     if (float32_is_zero(f)) {
3720         s->float_exception_flags |= float_flag_divbyzero;
3721         return float32_set_sign(float32_infinity, sign);
3722     }
3723 
3724     /* frsqrt7(+inf) = +0 */
3725     if (float32_is_infinity(f) && !sign) {
3726         return float32_set_sign(float32_zero, sign);
3727     }
3728 
3729     /* +normal, +subnormal */
3730     uint64_t val = frsqrt7(f, exp_size, frac_size);
3731     return make_float32(val);
3732 }
3733 
3734 static float64 frsqrt7_d(float64 f, float_status *s)
3735 {
3736     int exp_size = 11, frac_size = 52;
3737     bool sign = float64_is_neg(f);
3738 
3739     /*
3740      * frsqrt7(sNaN) = canonical NaN
3741      * frsqrt7(-inf) = canonical NaN
3742      * frsqrt7(-normal) = canonical NaN
3743      * frsqrt7(-subnormal) = canonical NaN
3744      */
3745     if (float64_is_signaling_nan(f, s) ||
3746             (float64_is_infinity(f) && sign) ||
3747             (float64_is_normal(f) && sign) ||
3748             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3749         s->float_exception_flags |= float_flag_invalid;
3750         return float64_default_nan(s);
3751     }
3752 
3753     /* frsqrt7(qNaN) = canonical NaN */
3754     if (float64_is_quiet_nan(f, s)) {
3755         return float64_default_nan(s);
3756     }
3757 
3758     /* frsqrt7(+-0) = +-inf */
3759     if (float64_is_zero(f)) {
3760         s->float_exception_flags |= float_flag_divbyzero;
3761         return float64_set_sign(float64_infinity, sign);
3762     }
3763 
3764     /* frsqrt7(+inf) = +0 */
3765     if (float64_is_infinity(f) && !sign) {
3766         return float64_set_sign(float64_zero, sign);
3767     }
3768 
3769     /* +normal, +subnormal */
3770     uint64_t val = frsqrt7(f, exp_size, frac_size);
3771     return make_float64(val);
3772 }
3773 
3774 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3775 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3776 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3777 GEN_VEXT_V_ENV(vfrsqrt7_v_h)
3778 GEN_VEXT_V_ENV(vfrsqrt7_v_w)
3779 GEN_VEXT_V_ENV(vfrsqrt7_v_d)
3780 
3781 /*
3782  * Vector Floating-Point Reciprocal Estimate Instruction
3783  *
3784  * Adapted from riscv-v-spec recip.c:
3785  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3786  */
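/*
 * frec7() produces a 7-bit-accurate estimate of 1/f: the 128-entry
 * table is indexed by the top 7 fraction bits, and the output
 * exponent evaluates (via the unsigned wrap-around of ~exp) to
 * 2 * bias - 1 - exp.  Subnormal inputs too small for their
 * reciprocal to be representable overflow to +-inf or to the largest
 * finite value of the same sign, depending on the rounding mode,
 * while results whose exponent falls out of range at the bottom are
 * returned as subnormals without raising underflow.
 */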
3787 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3788                       float_status *s)
3789 {
3790     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3791     uint64_t exp = extract64(f, frac_size, exp_size);
3792     uint64_t frac = extract64(f, 0, frac_size);
3793 
3794     const uint8_t lookup_table[] = {
3795         127, 125, 123, 121, 119, 117, 116, 114,
3796         112, 110, 109, 107, 105, 104, 102, 100,
3797         99, 97, 96, 94, 93, 91, 90, 88,
3798         87, 85, 84, 83, 81, 80, 79, 77,
3799         76, 75, 74, 72, 71, 70, 69, 68,
3800         66, 65, 64, 63, 62, 61, 60, 59,
3801         58, 57, 56, 55, 54, 53, 52, 51,
3802         50, 49, 48, 47, 46, 45, 44, 43,
3803         42, 41, 40, 40, 39, 38, 37, 36,
3804         35, 35, 34, 33, 32, 31, 31, 30,
3805         29, 28, 28, 27, 26, 25, 25, 24,
3806         23, 23, 22, 21, 21, 20, 19, 19,
3807         18, 17, 17, 16, 15, 15, 14, 14,
3808         13, 12, 12, 11, 11, 10, 9, 9,
3809         8, 8, 7, 7, 6, 5, 5, 4,
3810         4, 3, 3, 2, 2, 1, 1, 0
3811     };
3812     const int precision = 7;
3813 
3814     if (exp == 0 && frac != 0) { /* subnormal */
3815         /* Normalize the subnormal. */
3816         while (extract64(frac, frac_size - 1, 1) == 0) {
3817             exp--;
3818             frac <<= 1;
3819         }
3820 
3821         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3822 
3823         if (exp != 0 && exp != UINT64_MAX) {
3824             /*
3825              * Overflow to inf or max value of same sign,
3826              * depending on sign and rounding mode.
3827              */
3828             s->float_exception_flags |= (float_flag_inexact |
3829                                          float_flag_overflow);
3830 
3831             if ((s->float_rounding_mode == float_round_to_zero) ||
3832                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3833                 ((s->float_rounding_mode == float_round_up) && sign)) {
3834                 /* Return the greatest-magnitude finite value of the same sign. */
3835                 return (sign << (exp_size + frac_size)) |
3836                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3837             } else {
3838                 /* Return +-inf. */
3839                 return (sign << (exp_size + frac_size)) |
3840                     MAKE_64BIT_MASK(frac_size, exp_size);
3841             }
3842         }
3843     }
3844 
3845     int idx = frac >> (frac_size - precision);
3846     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3847                             (frac_size - precision);
3848     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3849 
3850     if (out_exp == 0 || out_exp == UINT64_MAX) {
3851         /*
3852          * The result is subnormal, but don't raise the underflow exception,
3853          * because there's no additional loss of precision.
3854          */
3855         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3856         if (out_exp == UINT64_MAX) {
3857             out_frac >>= 1;
3858             out_exp = 0;
3859         }
3860     }
3861 
3862     uint64_t val = 0;
3863     val = deposit64(val, 0, frac_size, out_frac);
3864     val = deposit64(val, frac_size, exp_size, out_exp);
3865     val = deposit64(val, frac_size + exp_size, 1, sign);
3866     return val;
3867 }
3868 
3869 static float16 frec7_h(float16 f, float_status *s)
3870 {
3871     int exp_size = 5, frac_size = 10;
3872     bool sign = float16_is_neg(f);
3873 
3874     /* frec7(+-inf) = +-0 */
3875     if (float16_is_infinity(f)) {
3876         return float16_set_sign(float16_zero, sign);
3877     }
3878 
3879     /* frec7(+-0) = +-inf */
3880     if (float16_is_zero(f)) {
3881         s->float_exception_flags |= float_flag_divbyzero;
3882         return float16_set_sign(float16_infinity, sign);
3883     }
3884 
3885     /* frec7(sNaN) = canonical NaN */
3886     if (float16_is_signaling_nan(f, s)) {
3887         s->float_exception_flags |= float_flag_invalid;
3888         return float16_default_nan(s);
3889     }
3890 
3891     /* frec7(qNaN) = canonical NaN */
3892     if (float16_is_quiet_nan(f, s)) {
3893         return float16_default_nan(s);
3894     }
3895 
3896     /* +-normal, +-subnormal */
3897     uint64_t val = frec7(f, exp_size, frac_size, s);
3898     return make_float16(val);
3899 }
3900 
3901 static float32 frec7_s(float32 f, float_status *s)
3902 {
3903     int exp_size = 8, frac_size = 23;
3904     bool sign = float32_is_neg(f);
3905 
3906     /* frec7(+-inf) = +-0 */
3907     if (float32_is_infinity(f)) {
3908         return float32_set_sign(float32_zero, sign);
3909     }
3910 
3911     /* frec7(+-0) = +-inf */
3912     if (float32_is_zero(f)) {
3913         s->float_exception_flags |= float_flag_divbyzero;
3914         return float32_set_sign(float32_infinity, sign);
3915     }
3916 
3917     /* frec7(sNaN) = canonical NaN */
3918     if (float32_is_signaling_nan(f, s)) {
3919         s->float_exception_flags |= float_flag_invalid;
3920         return float32_default_nan(s);
3921     }
3922 
3923     /* frec7(qNaN) = canonical NaN */
3924     if (float32_is_quiet_nan(f, s)) {
3925         return float32_default_nan(s);
3926     }
3927 
3928     /* +-normal, +-subnormal */
3929     uint64_t val = frec7(f, exp_size, frac_size, s);
3930     return make_float32(val);
3931 }
3932 
3933 static float64 frec7_d(float64 f, float_status *s)
3934 {
3935     int exp_size = 11, frac_size = 52;
3936     bool sign = float64_is_neg(f);
3937 
3938     /* frec7(+-inf) = +-0 */
3939     if (float64_is_infinity(f)) {
3940         return float64_set_sign(float64_zero, sign);
3941     }
3942 
3943     /* frec7(+-0) = +-inf */
3944     if (float64_is_zero(f)) {
3945         s->float_exception_flags |= float_flag_divbyzero;
3946         return float64_set_sign(float64_infinity, sign);
3947     }
3948 
3949     /* frec7(sNaN) = canonical NaN */
3950     if (float64_is_signaling_nan(f, s)) {
3951         s->float_exception_flags |= float_flag_invalid;
3952         return float64_default_nan(s);
3953     }
3954 
3955     /* frec7(qNaN) = canonical NaN */
3956     if (float64_is_quiet_nan(f, s)) {
3957         return float64_default_nan(s);
3958     }
3959 
3960     /* +-normal, +-subnormal */
3961     uint64_t val = frec7(f, exp_size, frac_size, s);
3962     return make_float64(val);
3963 }
3964 
3965 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3966 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3967 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3968 GEN_VEXT_V_ENV(vfrec7_v_h)
3969 GEN_VEXT_V_ENV(vfrec7_v_w)
3970 GEN_VEXT_V_ENV(vfrec7_v_d)
3971 
3972 /* Vector Floating-Point MIN/MAX Instructions */
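/*
 * vfmin/vfmax implement the IEEE 754-2019 minimumNumber/maximumNumber
 * operations: a quiet NaN paired with a number yields the number.
 */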
3973 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3974 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3975 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3976 GEN_VEXT_VV_ENV(vfmin_vv_h)
3977 GEN_VEXT_VV_ENV(vfmin_vv_w)
3978 GEN_VEXT_VV_ENV(vfmin_vv_d)
3979 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3980 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3981 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3982 GEN_VEXT_VF(vfmin_vf_h)
3983 GEN_VEXT_VF(vfmin_vf_w)
3984 GEN_VEXT_VF(vfmin_vf_d)
3985 
3986 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3987 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3988 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3989 GEN_VEXT_VV_ENV(vfmax_vv_h)
3990 GEN_VEXT_VV_ENV(vfmax_vv_w)
3991 GEN_VEXT_VV_ENV(vfmax_vv_d)
3992 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3993 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3994 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3995 GEN_VEXT_VF(vfmax_vf_h)
3996 GEN_VEXT_VF(vfmax_vf_w)
3997 GEN_VEXT_VF(vfmax_vf_d)
3998 
3999 /* Vector Floating-Point Sign-Injection Instructions */
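/*
 * The sign-injection helpers operate on raw bit patterns and never
 * raise exceptions: deposit64(b, 0, SEW - 1, a) keeps the magnitude
 * bits of a (the vs2 element here) and the sign bit of b (vs1 or
 * rs1); the jn and jx variants invert or XOR the injected sign by
 * working on ~b or b ^ a instead.
 */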
4000 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4001 {
4002     return deposit64(b, 0, 15, a);
4003 }
4004 
4005 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4006 {
4007     return deposit64(b, 0, 31, a);
4008 }
4009 
4010 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4011 {
4012     return deposit64(b, 0, 63, a);
4013 }
4014 
4015 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4016 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4017 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4018 GEN_VEXT_VV_ENV(vfsgnj_vv_h)
4019 GEN_VEXT_VV_ENV(vfsgnj_vv_w)
4020 GEN_VEXT_VV_ENV(vfsgnj_vv_d)
4021 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4022 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4023 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4024 GEN_VEXT_VF(vfsgnj_vf_h)
4025 GEN_VEXT_VF(vfsgnj_vf_w)
4026 GEN_VEXT_VF(vfsgnj_vf_d)
4027 
4028 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4029 {
4030     return deposit64(~b, 0, 15, a);
4031 }
4032 
4033 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4034 {
4035     return deposit64(~b, 0, 31, a);
4036 }
4037 
4038 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4039 {
4040     return deposit64(~b, 0, 63, a);
4041 }
4042 
4043 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4044 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4045 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4046 GEN_VEXT_VV_ENV(vfsgnjn_vv_h)
4047 GEN_VEXT_VV_ENV(vfsgnjn_vv_w)
4048 GEN_VEXT_VV_ENV(vfsgnjn_vv_d)
4049 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4050 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4051 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4052 GEN_VEXT_VF(vfsgnjn_vf_h)
4053 GEN_VEXT_VF(vfsgnjn_vf_w)
4054 GEN_VEXT_VF(vfsgnjn_vf_d)
4055 
4056 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4057 {
4058     return deposit64(b ^ a, 0, 15, a);
4059 }
4060 
4061 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4062 {
4063     return deposit64(b ^ a, 0, 31, a);
4064 }
4065 
4066 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4067 {
4068     return deposit64(b ^ a, 0, 63, a);
4069 }
4070 
4071 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4072 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4073 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4074 GEN_VEXT_VV_ENV(vfsgnjx_vv_h)
4075 GEN_VEXT_VV_ENV(vfsgnjx_vv_w)
4076 GEN_VEXT_VV_ENV(vfsgnjx_vv_d)
4077 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4078 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4079 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4080 GEN_VEXT_VF(vfsgnjx_vf_h)
4081 GEN_VEXT_VF(vfsgnjx_vf_w)
4082 GEN_VEXT_VF(vfsgnjx_vf_d)
4083 
4084 /* Vector Floating-Point Compare Instructions */
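/*
 * The compare helpers write one mask bit per active element and skip
 * inactive elements entirely.  vmfeq/vmfne use the quiet comparison,
 * which raises invalid only for signaling NaNs, whereas the ordered
 * compares (vmflt, vmfle, vmfgt, ...) use the signaling comparison
 * and raise invalid for any NaN operand.
 */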
4085 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4086 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4087                   CPURISCVState *env, uint32_t desc)          \
4088 {                                                             \
4089     uint32_t vm = vext_vm(desc);                              \
4090     uint32_t vl = env->vl;                                    \
4091     uint32_t i;                                               \
4092                                                               \
4093     for (i = env->vstart; i < vl; i++) {                      \
4094         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4095         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4096         if (!vm && !vext_elem_mask(v0, i)) {                  \
4097             continue;                                         \
4098         }                                                     \
4099         vext_set_elem_mask(vd, i,                             \
4100                            DO_OP(s2, s1, &env->fp_status));   \
4101     }                                                         \
4102     env->vstart = 0;                                          \
4103 }
4104 
4105 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4106 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4107 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4108 
4109 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4110 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4111                   CPURISCVState *env, uint32_t desc)                \
4112 {                                                                   \
4113     uint32_t vm = vext_vm(desc);                                    \
4114     uint32_t vl = env->vl;                                          \
4115     uint32_t i;                                                     \
4116                                                                     \
4117     for (i = env->vstart; i < vl; i++) {                            \
4118         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4119         if (!vm && !vext_elem_mask(v0, i)) {                        \
4120             continue;                                               \
4121         }                                                           \
4122         vext_set_elem_mask(vd, i,                                   \
4123                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4124     }                                                               \
4125     env->vstart = 0;                                                \
4126 }
4127 
4128 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4129 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4130 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4131 
4132 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4133 {
4134     FloatRelation compare = float16_compare_quiet(a, b, s);
4135     return compare != float_relation_equal;
4136 }
4137 
4138 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4139 {
4140     FloatRelation compare = float32_compare_quiet(a, b, s);
4141     return compare != float_relation_equal;
4142 }
4143 
4144 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4145 {
4146     FloatRelation compare = float64_compare_quiet(a, b, s);
4147     return compare != float_relation_equal;
4148 }
4149 
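/*
 * vmfne is built on the quiet compare: quiet NaN operands raise no
 * exception, and any NaN input compares as unordered, i.e. not equal,
 * so the result bit is 1 (vmfeq yields 0 in the same case).
 */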
4150 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4151 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4152 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4153 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4154 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4155 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4156 
4157 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4158 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4159 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4160 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4161 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4162 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4163 
4164 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4165 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4166 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4167 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4168 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4169 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4170 
4171 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4172 {
4173     FloatRelation compare = float16_compare(a, b, s);
4174     return compare == float_relation_greater;
4175 }
4176 
4177 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4178 {
4179     FloatRelation compare = float32_compare(a, b, s);
4180     return compare == float_relation_greater;
4181 }
4182 
4183 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4184 {
4185     FloatRelation compare = float64_compare(a, b, s);
4186     return compare == float_relation_greater;
4187 }
4188 
4189 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4190 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4191 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4192 
4193 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4194 {
4195     FloatRelation compare = float16_compare(a, b, s);
4196     return compare == float_relation_greater ||
4197            compare == float_relation_equal;
4198 }
4199 
4200 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4201 {
4202     FloatRelation compare = float32_compare(a, b, s);
4203     return compare == float_relation_greater ||
4204            compare == float_relation_equal;
4205 }
4206 
4207 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4208 {
4209     FloatRelation compare = float64_compare(a, b, s);
4210     return compare == float_relation_greater ||
4211            compare == float_relation_equal;
4212 }
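/*
 * vmfgt/vmfge use the signaling compare: any NaN operand raises the
 * invalid flag and the result bit is 0, matching the scalar FLT/FLE
 * behaviour.
 */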
4213 
4214 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4215 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4216 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4217 
4218 /* Vector Floating-Point Classify Instruction */
4219 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4220 static void do_##NAME(void *vd, void *vs2, int i)      \
4221 {                                                      \
4222     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4223     *((TD *)vd + HD(i)) = OP(s2);                      \
4224 }
4225 
4226 #define GEN_VEXT_V(NAME)                               \
4227 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4228                   CPURISCVState *env, uint32_t desc)   \
4229 {                                                      \
4230     uint32_t vm = vext_vm(desc);                       \
4231     uint32_t vl = env->vl;                             \
4232     uint32_t i;                                        \
4233                                                        \
4234     for (i = env->vstart; i < vl; i++) {               \
4235         if (!vm && !vext_elem_mask(v0, i)) {           \
4236             continue;                                  \
4237         }                                              \
4238         do_##NAME(vd, vs2, i);                         \
4239     }                                                  \
4240     env->vstart = 0;                                   \
4241 }
4242 
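/*
 * fclass returns a one-hot class mask, as for the scalar FCLASS
 * instruction:
 *   bit 0: -infinity      bit 5: +subnormal
 *   bit 1: -normal        bit 6: +normal
 *   bit 2: -subnormal     bit 7: +infinity
 *   bit 3: -0             bit 8: signaling NaN
 *   bit 4: +0             bit 9: quiet NaN
 */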
4243 target_ulong fclass_h(uint64_t frs1)
4244 {
4245     float16 f = frs1;
4246     bool sign = float16_is_neg(f);
4247 
4248     if (float16_is_infinity(f)) {
4249         return sign ? 1 << 0 : 1 << 7;
4250     } else if (float16_is_zero(f)) {
4251         return sign ? 1 << 3 : 1 << 4;
4252     } else if (float16_is_zero_or_denormal(f)) {
4253         return sign ? 1 << 2 : 1 << 5;
4254     } else if (float16_is_any_nan(f)) {
4255         float_status s = { }; /* for snan_bit_is_one */
4256         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4257     } else {
4258         return sign ? 1 << 1 : 1 << 6;
4259     }
4260 }
4261 
4262 target_ulong fclass_s(uint64_t frs1)
4263 {
4264     float32 f = frs1;
4265     bool sign = float32_is_neg(f);
4266 
4267     if (float32_is_infinity(f)) {
4268         return sign ? 1 << 0 : 1 << 7;
4269     } else if (float32_is_zero(f)) {
4270         return sign ? 1 << 3 : 1 << 4;
4271     } else if (float32_is_zero_or_denormal(f)) {
4272         return sign ? 1 << 2 : 1 << 5;
4273     } else if (float32_is_any_nan(f)) {
4274         float_status s = { }; /* for snan_bit_is_one */
4275         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4276     } else {
4277         return sign ? 1 << 1 : 1 << 6;
4278     }
4279 }
4280 
4281 target_ulong fclass_d(uint64_t frs1)
4282 {
4283     float64 f = frs1;
4284     bool sign = float64_is_neg(f);
4285 
4286     if (float64_is_infinity(f)) {
4287         return sign ? 1 << 0 : 1 << 7;
4288     } else if (float64_is_zero(f)) {
4289         return sign ? 1 << 3 : 1 << 4;
4290     } else if (float64_is_zero_or_denormal(f)) {
4291         return sign ? 1 << 2 : 1 << 5;
4292     } else if (float64_is_any_nan(f)) {
4293         float_status s = { }; /* for snan_bit_is_one */
4294         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4295     } else {
4296         return sign ? 1 << 1 : 1 << 6;
4297     }
4298 }
4299 
4300 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4301 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4302 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4303 GEN_VEXT_V(vfclass_v_h)
4304 GEN_VEXT_V(vfclass_v_w)
4305 GEN_VEXT_V(vfclass_v_d)
4306 
4307 /* Vector Floating-Point Merge Instruction */
4308 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4309 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4310                   CPURISCVState *env, uint32_t desc)          \
4311 {                                                             \
4312     uint32_t vm = vext_vm(desc);                              \
4313     uint32_t vl = env->vl;                                    \
4314     uint32_t i;                                               \
4315                                                               \
4316     for (i = env->vstart; i < vl; i++) {                      \
4317         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4318         *((ETYPE *)vd + H(i))                                 \
4319           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4320     }                                                         \
4321     env->vstart = 0;                                          \
4322 }
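/*
 * vd[i] = v0.mask[i] ? f[rs1] : vs2[i]; with vm=1 this degenerates to
 * splatting f[rs1] (vfmv.v.f).  Unlike most masked ops, masked-off
 * elements receive vs2[i] rather than being left undisturbed.
 */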
4323 
4324 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4325 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4326 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4327 
4328 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4329 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4330 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4331 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4332 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4333 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h)
4334 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w)
4335 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d)
4336 
4337 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4338 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4339 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4340 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4341 GEN_VEXT_V_ENV(vfcvt_x_f_v_h)
4342 GEN_VEXT_V_ENV(vfcvt_x_f_v_w)
4343 GEN_VEXT_V_ENV(vfcvt_x_f_v_d)
4344 
4345 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4346 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4347 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4348 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4349 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h)
4350 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w)
4351 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d)
4352 
4353 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4354 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4355 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4356 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4357 GEN_VEXT_V_ENV(vfcvt_f_x_v_h)
4358 GEN_VEXT_V_ENV(vfcvt_f_x_v_w)
4359 GEN_VEXT_V_ENV(vfcvt_f_x_v_d)
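/*
 * These conversions go through the softfloat routines with
 * &env->fp_status (see the _ENV generators above), so they honour the
 * active rounding mode and accumulate fflags; out-of-range inputs
 * saturate and raise the invalid flag, as for the scalar FCVT.
 */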
4360 
4361 /* Widening Floating-Point/Integer Type-Convert Instructions */
4362 /* (TD, T2, TX2) */
4363 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4364 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4365 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4366 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
4367 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4368 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4369 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h)
4370 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w)
4371 
4372 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4373 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4374 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4375 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h)
4376 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w)
4377 
4378 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float. */
4379 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4380 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4381 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4382 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b)
4383 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h)
4384 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w)
4385 
4386 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4387 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4388 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4389 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4390 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b)
4391 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h)
4392 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w)
4393 
4394 /*
4395  * vfwcvt.f.f.v vd, vs2, vm
4396  * Convert single-width float to double-width float.
4397  */
4398 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4399 {
4400     return float16_to_float32(a, true, s);
4401 }
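/*
 * The 'true' argument selects IEEE half-precision (rather than the ARM
 * alternative format) in softfloat's float16<->float32 conversions.
 */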
4402 
4403 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4404 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4405 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h)
4406 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w)
4407 
4408 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4409 /* (TD, T2, TX2) */
4410 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4411 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4412 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4413 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4414 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4415 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4416 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4417 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b)
4418 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h)
4419 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w)
4420 
4421 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4422 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4423 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4424 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4425 GEN_VEXT_V_ENV(vfncvt_x_f_w_b)
4426 GEN_VEXT_V_ENV(vfncvt_x_f_w_h)
4427 GEN_VEXT_V_ENV(vfncvt_x_f_w_w)
4428 
4429 /* vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float. */
4430 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4431 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4432 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h)
4433 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w)
4434 
4435 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4436 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4437 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4438 GEN_VEXT_V_ENV(vfncvt_f_x_w_h)
4439 GEN_VEXT_V_ENV(vfncvt_f_x_w_w)
4440 
4441 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4442 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4443 {
4444     return float32_to_float16(a, true, s);
4445 }
4446 
4447 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4448 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4449 GEN_VEXT_V_ENV(vfncvt_f_f_w_h)
4450 GEN_VEXT_V_ENV(vfncvt_f_f_w_w)
4451 
4452 /*
4453  *** Vector Reduction Operations
4454  */
4455 /* Vector Single-Width Integer Reduction Instructions */
4456 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4457 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4458         void *vs2, CPURISCVState *env, uint32_t desc)     \
4459 {                                                         \
4460     uint32_t vm = vext_vm(desc);                          \
4461     uint32_t vl = env->vl;                                \
4462     uint32_t i;                                           \
4463     TD s1 = *((TD *)vs1 + HD(0));                         \
4464                                                           \
4465     for (i = env->vstart; i < vl; i++) {                  \
4466         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4467         if (!vm && !vext_elem_mask(v0, i)) {              \
4468             continue;                                     \
4469         }                                                 \
4470         s1 = OP(s1, (TD)s2);                              \
4471     }                                                     \
4472     *((TD *)vd + HD(0)) = s1;                             \
4473     env->vstart = 0;                                      \
4474 }
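/*
 * The reductions are scalar: the accumulator starts from vs1[0], folds
 * in every active element of vs2 in element order, and the result is
 * written to vd[0] only.  Masked-off elements are simply skipped.
 */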
4475 
4476 /* vd[0] = sum(vs1[0], vs2[*]) */
4477 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4478 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4479 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4480 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4481 
4482 /* vd[0] = maxu(vs1[0], vs2[*]) */
4483 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4484 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4485 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4486 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4487 
4488 /* vd[0] = max(vs1[0], vs2[*]) */
4489 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4490 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4491 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4492 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4493 
4494 /* vd[0] = minu(vs1[0], vs2[*]) */
4495 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4496 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4497 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4498 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4499 
4500 /* vd[0] = min(vs1[0], vs2[*]) */
4501 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4502 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4503 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4504 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4505 
4506 /* vd[0] = and(vs1[0], vs2[*]) */
4507 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4508 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4509 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4510 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4511 
4512 /* vd[0] = or(vs1[0], vs2[*]) */
4513 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4514 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4515 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4516 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4517 
4518 /* vd[0] = xor(vs1[0], vs2[*]) */
4519 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4520 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4521 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4522 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4523 
4524 /* Vector Widening Integer Reduction Instructions */
4525 /* signed sum reduction into double-width accumulator */
4526 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4527 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4528 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4529 
4530 /* Unsigned sum reduction into double-width accumulator */
4531 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4532 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4533 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4534 
4535 /* Vector Single-Width Floating-Point Reduction Instructions */
4536 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4537 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4538                   void *vs2, CPURISCVState *env,           \
4539                   uint32_t desc)                           \
4540 {                                                          \
4541     uint32_t vm = vext_vm(desc);                           \
4542     uint32_t vl = env->vl;                                 \
4543     uint32_t i;                                            \
4544     TD s1 = *((TD *)vs1 + HD(0));                          \
4545                                                            \
4546     for (i = env->vstart; i < vl; i++) {                   \
4547         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4548         if (!vm && !vext_elem_mask(v0, i)) {               \
4549             continue;                                      \
4550         }                                                  \
4551         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4552     }                                                      \
4553     *((TD *)vd + HD(0)) = s1;                              \
4554     env->vstart = 0;                                       \
4555 }
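/*
 * Same scheme as above, but each step goes through softfloat with the
 * guest rounding mode.  Accumulating strictly in element order is one
 * legal ordering for the "unordered" sum, and the *_maximum_number /
 * *_minimum_number ops return the non-NaN operand when only one input
 * is a quiet NaN.
 */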
4556 
4557 /* Unordered sum */
4558 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4559 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4560 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4561 
4562 /* Maximum value */
4563 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4564 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4565 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4566 
4567 /* Minimum value */
4568 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4569 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4570 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4571 
4572 /* Vector Widening Floating-Point Reduction Instructions */
4573 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4574 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4575                             void *vs2, CPURISCVState *env, uint32_t desc)
4576 {
4577     uint32_t vm = vext_vm(desc);
4578     uint32_t vl = env->vl;
4579     uint32_t i;
4580     uint32_t s1 = *((uint32_t *)vs1 + H4(0));
4581 
4582     for (i = env->vstart; i < vl; i++) {
4583         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4584         if (!vm && !vext_elem_mask(v0, i)) {
4585             continue;
4586         }
4587         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4588                          &env->fp_status);
4589     }
4590     *((uint32_t *)vd + H4(0)) = s1;
4591     env->vstart = 0;
4592 }
4593 
4594 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4595                             void *vs2, CPURISCVState *env, uint32_t desc)
4596 {
4597     uint32_t vm = vext_vm(desc);
4598     uint32_t vl = env->vl;
4599     uint32_t i;
4600     uint64_t s1 = *((uint64_t *)vs1);
4601 
4602     for (i = env->vstart; i < vl; i++) {
4603         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4604         if (!vm && !vext_elem_mask(v0, i)) {
4605             continue;
4606         }
4607         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4608                          &env->fp_status);
4609     }
4610     *((uint64_t *)vd) = s1;
4611     env->vstart = 0;
4612 }
4613 
4614 /*
4615  *** Vector Mask Operations
4616  */
4617 /* Vector Mask-Register Logical Instructions */
4618 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4619 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4620                   void *vs2, CPURISCVState *env,          \
4621                   uint32_t desc)                          \
4622 {                                                         \
4623     uint32_t vl = env->vl;                                \
4624     uint32_t i;                                           \
4625     int a, b;                                             \
4626                                                           \
4627     for (i = env->vstart; i < vl; i++) {                  \
4628         a = vext_elem_mask(vs1, i);                       \
4629         b = vext_elem_mask(vs2, i);                       \
4630         vext_set_elem_mask(vd, i, OP(b, a));              \
4631     }                                                     \
4632     env->vstart = 0;                                      \
4633 }
4634 
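/*
 * a and b are single mask bits (0 or 1), so plain logical negation in
 * the macros below behaves like a one-bit NOT.
 */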
4635 #define DO_NAND(N, M)  (!(N & M))
4636 #define DO_ANDNOT(N, M)  (N & !M)
4637 #define DO_NOR(N, M)  (!(N | M))
4638 #define DO_ORNOT(N, M)  (N | !M)
4639 #define DO_XNOR(N, M)  (!(N ^ M))
4640 
4641 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4642 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4643 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4644 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4645 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4646 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4647 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4648 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4649 
4650 /* Vector count population in mask vcpop */
4651 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4652                              uint32_t desc)
4653 {
4654     target_ulong cnt = 0;
4655     uint32_t vm = vext_vm(desc);
4656     uint32_t vl = env->vl;
4657     int i;
4658 
4659     for (i = env->vstart; i < vl; i++) {
4660         if (vm || vext_elem_mask(v0, i)) {
4661             if (vext_elem_mask(vs2, i)) {
4662                 cnt++;
4663             }
4664         }
4665     }
4666     env->vstart = 0;
4667     return cnt;
4668 }
4669 
4670 /* vfirst find-first-set mask bit */
4671 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4672                               uint32_t desc)
4673 {
4674     uint32_t vm = vext_vm(desc);
4675     uint32_t vl = env->vl;
4676     int i;
4677 
4678     for (i = env->vstart; i < vl; i++) {
4679         if (vm || vext_elem_mask(v0, i)) {
4680             if (vext_elem_mask(vs2, i)) {
4681                 return i;
4682             }
4683         }
4684     }
4685     env->vstart = 0;
4686     return -1LL;
4687 }
4688 
4689 enum set_mask_type {
4690     ONLY_FIRST = 1,
4691     INCLUDE_FIRST,
4692     BEFORE_FIRST,
4693 };
4694 
4695 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4696                    uint32_t desc, enum set_mask_type type)
4697 {
4698     uint32_t vm = vext_vm(desc);
4699     uint32_t vl = env->vl;
4700     int i;
4701     bool first_mask_bit = false;
4702 
4703     for (i = env->vstart; i < vl; i++) {
4704         if (!vm && !vext_elem_mask(v0, i)) {
4705             continue;
4706         }
4707         /* write a zero to all following active elements */
4708         if (first_mask_bit) {
4709             vext_set_elem_mask(vd, i, 0);
4710             continue;
4711         }
4712         if (vext_elem_mask(vs2, i)) {
4713             first_mask_bit = true;
4714             if (type == BEFORE_FIRST) {
4715                 vext_set_elem_mask(vd, i, 0);
4716             } else {
4717                 vext_set_elem_mask(vd, i, 1);
4718             }
4719         } else {
4720             if (type == ONLY_FIRST) {
4721                 vext_set_elem_mask(vd, i, 0);
4722             } else {
4723                 vext_set_elem_mask(vd, i, 1);
4724             }
4725         }
4726     }
4727     env->vstart = 0;
4728 }
4729 
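/*
 * E.g. with all elements active and vs2 mask bits (element 0 first)
 * 0 0 1 0 1 ...:
 *   vmsbf.m -> 1 1 0 0 0 ...   (set before first)
 *   vmsif.m -> 1 1 1 0 0 ...   (set including first)
 *   vmsof.m -> 0 0 1 0 0 ...   (set only first)
 */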
4730 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4731                      uint32_t desc)
4732 {
4733     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4734 }
4735 
4736 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4737                      uint32_t desc)
4738 {
4739     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4740 }
4741 
4742 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4743                      uint32_t desc)
4744 {
4745     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4746 }
4747 
4748 /* Vector Iota Instruction */
4749 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4750 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4751                   uint32_t desc)                                          \
4752 {                                                                         \
4753     uint32_t vm = vext_vm(desc);                                          \
4754     uint32_t vl = env->vl;                                                \
4755     uint32_t sum = 0;                                                     \
4756     int i;                                                                \
4757                                                                           \
4758     for (i = env->vstart; i < vl; i++) {                                  \
4759         if (!vm && !vext_elem_mask(v0, i)) {                              \
4760             continue;                                                     \
4761         }                                                                 \
4762         *((ETYPE *)vd + H(i)) = sum;                                      \
4763         if (vext_elem_mask(vs2, i)) {                                     \
4764             sum++;                                                        \
4765         }                                                                 \
4766     }                                                                     \
4767     env->vstart = 0;                                                      \
4768 }
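/*
 * viota writes the exclusive prefix sum of the mask bits in vs2 to the
 * active elements of vd, e.g. vs2 = 1,0,1,1,0 -> vd = 0,1,1,2,3 when
 * all elements are active.
 */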
4769 
4770 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4771 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4772 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4773 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4774 
4775 /* Vector Element Index Instruction */
4776 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4777 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4778 {                                                                         \
4779     uint32_t vm = vext_vm(desc);                                          \
4780     uint32_t vl = env->vl;                                                \
4781     int i;                                                                \
4782                                                                           \
4783     for (i = env->vstart; i < vl; i++) {                                  \
4784         if (!vm && !vext_elem_mask(v0, i)) {                              \
4785             continue;                                                     \
4786         }                                                                 \
4787         *((ETYPE *)vd + H(i)) = i;                                        \
4788     }                                                                     \
4789     env->vstart = 0;                                                      \
4790 }
4791 
4792 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4793 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4794 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4795 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4796 
4797 /*
4798  *** Vector Permutation Instructions
4799  */
4800 
4801 /* Vector Slide Instructions */
4802 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4803 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4804                   CPURISCVState *env, uint32_t desc)                      \
4805 {                                                                         \
4806     uint32_t vm = vext_vm(desc);                                          \
4807     uint32_t vl = env->vl;                                                \
4808     target_ulong offset = s1, i_min, i;                                   \
4809                                                                           \
4810     i_min = MAX(env->vstart, offset);                                     \
4811     for (i = i_min; i < vl; i++) {                                        \
4812         if (!vm && !vext_elem_mask(v0, i)) {                              \
4813             continue;                                                     \
4814         }                                                                 \
4815         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4816     }                                                                     \
    env->vstart = 0;                                                      \
4817 }
4818 
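/*
 * The loop above starts at max(vstart, OFFSET), so destination elements
 * below the slide amount are never written: vslideup leaves
 * vd[0..OFFSET-1] undisturbed, as the spec requires.
 */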
4819 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4820 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4821 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4822 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4823 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4824 
4825 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4826 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4827                   CPURISCVState *env, uint32_t desc)                      \
4828 {                                                                         \
4829     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4830     uint32_t vm = vext_vm(desc);                                          \
4831     uint32_t vl = env->vl;                                                \
4832     target_ulong i_max, i;                                                \
4833                                                                           \
4834     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4835     for (i = env->vstart; i < i_max; ++i) {                               \
4836         if (vm || vext_elem_mask(v0, i)) {                                \
4837             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4838         }                                                                 \
4839     }                                                                     \
4840                                                                           \
4841     for (i = i_max; i < vl; ++i) {                                        \
4842         if (vm || vext_elem_mask(v0, i)) {                                \
4843             *((ETYPE *)vd + H(i)) = 0;                                    \
4844         }                                                                 \
4845     }                                                                     \
4846                                                                           \
4847     env->vstart = 0;                                                      \
4848 }
4849 
4850 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4851 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4852 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4853 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4854 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4855 
4856 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
4857 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4858                      void *vs2, CPURISCVState *env, uint32_t desc)          \
4859 {                                                                           \
4860     typedef uint##BITWIDTH##_t ETYPE;                                       \
4861     uint32_t vm = vext_vm(desc);                                            \
4862     uint32_t vl = env->vl;                                                  \
4863     uint32_t i;                                                             \
4864                                                                             \
4865     for (i = env->vstart; i < vl; i++) {                                    \
4866         if (!vm && !vext_elem_mask(v0, i)) {                                \
4867             continue;                                                       \
4868         }                                                                   \
4869         if (i == 0) {                                                       \
4870             *((ETYPE *)vd + H(i)) = s1;                                     \
4871         } else {                                                            \
4872             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4873         }                                                                   \
4874     }                                                                       \
4875     env->vstart = 0;                                                        \
4876 }
4877 
4878 GEN_VEXT_VSLIDE1UP(8,  H1)
4879 GEN_VEXT_VSLIDE1UP(16, H2)
4880 GEN_VEXT_VSLIDE1UP(32, H4)
4881 GEN_VEXT_VSLIDE1UP(64, H8)
4882 
4883 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4884 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4885                   CPURISCVState *env, uint32_t desc)              \
4886 {                                                                 \
4887     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4888 }
4889 
4890 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4891 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4892 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4893 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4894 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4895 
4896 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4897 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4898                        void *vs2, CPURISCVState *env, uint32_t desc)          \
4899 {                                                                             \
4900     typedef uint##BITWIDTH##_t ETYPE;                                         \
4901     uint32_t vm = vext_vm(desc);                                              \
4902     uint32_t vl = env->vl;                                                    \
4903     uint32_t i;                                                               \
4904                                                                               \
4905     for (i = env->vstart; i < vl; i++) {                                      \
4906         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4907             continue;                                                         \
4908         }                                                                     \
4909         if (i == vl - 1) {                                                    \
4910             *((ETYPE *)vd + H(i)) = s1;                                       \
4911         } else {                                                              \
4912             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4913         }                                                                     \
4914     }                                                                         \
4915     env->vstart = 0;                                                          \
4916 }
4917 
4918 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4919 GEN_VEXT_VSLIDE1DOWN(16, H2)
4920 GEN_VEXT_VSLIDE1DOWN(32, H4)
4921 GEN_VEXT_VSLIDE1DOWN(64, H8)
4922 
4923 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4924 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4925                   CPURISCVState *env, uint32_t desc)              \
4926 {                                                                 \
4927     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4928 }
4929 
4930 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4931 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4932 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4933 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4934 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4935 
4936 /* Vector Floating-Point Slide Instructions */
4937 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4938 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4939                   CPURISCVState *env, uint32_t desc)          \
4940 {                                                             \
4941     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4942 }
4943 
4944 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4945 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4946 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4947 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4948 
4949 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4950 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4951                   CPURISCVState *env, uint32_t desc)          \
4952 {                                                             \
4953     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4954 }
4955 
4956 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4957 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4958 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4959 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4960 
4961 /* Vector Register Gather Instruction */
4962 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4963 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4964                   CPURISCVState *env, uint32_t desc)                      \
4965 {                                                                         \
4966     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4967     uint32_t vm = vext_vm(desc);                                          \
4968     uint32_t vl = env->vl;                                                \
4969     uint64_t index;                                                       \
4970     uint32_t i;                                                           \
4971                                                                           \
4972     for (i = env->vstart; i < vl; i++) {                                  \
4973         if (!vm && !vext_elem_mask(v0, i)) {                              \
4974             continue;                                                     \
4975         }                                                                 \
4976         index = *((TS1 *)vs1 + HS1(i));                                   \
4977         if (index >= vlmax) {                                             \
4978             *((TS2 *)vd + HS2(i)) = 0;                                    \
4979         } else {                                                          \
4980             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
4981         }                                                                 \
4982     }                                                                     \
4983     env->vstart = 0;                                                      \
4984 }
4985 
4986 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
4987 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
4988 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
4989 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
4990 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
4991 
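/* vrgatherei16.vv: the index elements are always EEW=16, whatever SEW is. */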
4992 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
4993 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
4994 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
4995 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
4996 
4997 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
4998 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4999                   CPURISCVState *env, uint32_t desc)                      \
5000 {                                                                         \
5001     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5002     uint32_t vm = vext_vm(desc);                                          \
5003     uint32_t vl = env->vl;                                                \
5004     uint64_t index = s1;                                                  \
5005     uint32_t i;                                                           \
5006                                                                           \
5007     for (i = env->vstart; i < vl; i++) {                                  \
5008         if (!vm && !vext_elem_mask(v0, i)) {                              \
5009             continue;                                                     \
5010         }                                                                 \
5011         if (index >= vlmax) {                                             \
5012             *((ETYPE *)vd + H(i)) = 0;                                    \
5013         } else {                                                          \
5014             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5015         }                                                                 \
5016     }                                                                     \
5017     env->vstart = 0;                                                      \
5018 }
5019 
5020 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5021 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5022 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5023 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5024 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5025 
5026 /* Vector Compress Instruction */
5027 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5028 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5029                   CPURISCVState *env, uint32_t desc)                      \
5030 {                                                                         \
5031     uint32_t vl = env->vl;                                                \
5032     uint32_t num = 0, i;                                                  \
5033                                                                           \
5034     for (i = env->vstart; i < vl; i++) {                                  \
5035         if (!vext_elem_mask(vs1, i)) {                                    \
5036             continue;                                                     \
5037         }                                                                 \
5038         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5039         num++;                                                            \
5040     }                                                                     \
5041     env->vstart = 0;                                                      \
5042 }
5043 
5044 /* Compress the elements of vs2 whose vs1 mask bit is set into vd */
5045 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5046 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5047 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5048 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5049 
5050 /* Vector Whole Register Move */
5051 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5052 {
5053     /* EEW = SEW */
5054     uint32_t maxsz = simd_maxsz(desc);
5055     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5056     uint32_t startb = env->vstart * sewb;
5057     uint32_t i = startb;
5058 
5059     memcpy((uint8_t *)vd + H1(i),
5060            (uint8_t *)vs2 + H1(i),
5061            maxsz - startb);
5062 
5063     env->vstart = 0;
5064 }
5065 
5066 /* Vector Integer Extension */
5067 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5068 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5069                   CPURISCVState *env, uint32_t desc)             \
5070 {                                                                \
5071     uint32_t vl = env->vl;                                       \
5072     uint32_t vm = vext_vm(desc);                                 \
5073     uint32_t i;                                                  \
5074                                                                  \
5075     for (i = env->vstart; i < vl; i++) {                         \
5076         if (!vm && !vext_elem_mask(v0, i)) {                     \
5077             continue;                                            \
5078         }                                                        \
5079         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5080     }                                                            \
5081     env->vstart = 0;                                             \
5082 }
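/*
 * Each destination element of width SEW is the zero- (vzext) or
 * sign-extension (vsext) of a source element of width SEW/2, SEW/4 or
 * SEW/8, e.g. vzext_vf4_w widens uint8_t source elements to uint32_t.
 */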
5083 
5084 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5085 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5086 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5087 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5088 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5089 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5090 
5091 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5092 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5093 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5094 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5095 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5096 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5097