xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 7b1bff41)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
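
/*
 * Worked example (illustrative only, not taken from the source): assume
 * VLEN = 128 and a vsetvli requesting SEW = 32 (vsew = 2) and LMUL = 2
 * (vlmul = 001).  Then VLMAX = LMUL * VLEN / SEW = 2 * 128 / 32 = 8.
 * A requested AVL (s1) of 10 is clamped to vl = 8, while an AVL of 5 is
 * granted in full (vl = 5).  Illegal combinations, e.g. SEW larger than
 * ELEN, only set vill and zero vtype/vl as above.
 */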
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that need a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
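
/*
 * Illustrative example: on a big-endian host, logical byte element 0 lives
 * in byte 7 of the first 64-bit chunk, so H1(0) = 0 ^ 7 = 7 and H1(8) = 15,
 * while halfword element 0 maps to halfword slot 3 (H2(0) = 3).  Accessing
 * *((uint8_t *)vd + H1(i)) therefore reaches the same logical element
 * regardless of host endianness.
 */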
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vta_all_1s(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
133 }
134 
135 /*
136  * Get the maximum number of elements that can be operated on.
137  *
138  * log2_esz: log2 of element size in bytes.
139  */
140 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
141 {
142     /*
143      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
144      * so vlen in bytes (vlenb) is encoded as maxsz.
145      */
146     uint32_t vlenb = simd_maxsz(desc);
147 
148     /* Return VLMAX */
149     int scale = vext_lmul(desc) - log2_esz;
150     return scale < 0 ? vlenb >> -scale : vlenb << scale;
151 }
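
/*
 * Worked example (illustrative only): with VLEN = 128 the descriptor
 * carries vlenb = 16.  For SEW = 32 (log2_esz = 2) and LMUL = 1 (lmul = 0),
 * scale = 0 - 2 = -2 and VLMAX = 16 >> 2 = 4.  For LMUL = 1/2
 * (vlmul = 111, decoded above as lmul = -1), scale = -3 and VLMAX = 2.
 */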
152 
153 /*
154  * Get the total number of elements, including prestart, body and tail elements.
155  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
156  * are held in the same vector register.
157  */
158 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
159                                             uint32_t esz)
160 {
161     uint32_t vlenb = simd_maxsz(desc);
162     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
163     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
164                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
165     return (vlenb << emul) / esz;
166 }
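
/*
 * Worked example (illustrative only): with vlenb = 16, SEW = 32 and an
 * EEW of 32 bits (esz = 4) at LMUL = 1/2, emul clamps to 0, so the total
 * is (16 << 0) / 4 = 4 elements even though VLMAX is only 2: the tail
 * past VLMAX that shares the register is counted, as noted above.
 */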
167 
168 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
169 {
170     return (addr & env->cur_pmmask) | env->cur_pmbase;
171 }
172 
173 /*
174  * This function checks the watchpoint before the real load operation.
175  *
176  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
177  * In user mode, there is no watchpoint support now.
178  *
179  * It will trigger an exception if there is no mapping in the TLB
180  * and the page table walk can't fill the TLB entry. Then the guest
181  * software can return here after processing the exception, or never return.
182  */
183 static void probe_pages(CPURISCVState *env, target_ulong addr,
184                         target_ulong len, uintptr_t ra,
185                         MMUAccessType access_type)
186 {
187     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
188     target_ulong curlen = MIN(pagelen, len);
189 
190     probe_access(env, adjust_addr(env, addr), curlen, access_type,
191                  cpu_mmu_index(env, false), ra);
192     if (len > curlen) {
193         addr += curlen;
194         curlen = len - curlen;
195         probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                      cpu_mmu_index(env, false), ra);
197     }
198 }
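
/*
 * Illustrative example (assuming 4 KiB target pages): probing 16 bytes at
 * addr = 0x1ff8 first probes the 8 bytes up to the page boundary, then
 * probes the remaining 8 bytes starting at 0x2000, so a missing mapping in
 * either page is reported up front.
 */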
199 
200 /* set agnostic elements to 1s */
201 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
202                               uint32_t tot)
203 {
204     if (is_agnostic == 0) {
205         /* policy undisturbed */
206         return;
207     }
208     if (tot - cnt == 0) {
209         return;
210     }
211     memset(base + cnt, -1, tot - cnt);
212 }
213 
214 static inline void vext_set_elem_mask(void *v0, int index,
215                                       uint8_t value)
216 {
217     int idx = index / 64;
218     int pos = index % 64;
219     uint64_t old = ((uint64_t *)v0)[idx];
220     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
221 }
222 
223 /*
224  * Earlier designs (pre-0.9) had a varying number of bits
225  * per mask value (MLEN). In the 0.9 design, MLEN=1.
226  * (Section 4.5)
227  */
228 static inline int vext_elem_mask(void *v0, int index)
229 {
230     int idx = index / 64;
231     int pos = index % 64;
232     return (((uint64_t *)v0)[idx] >> pos) & 1;
233 }
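
/*
 * Illustrative example: mask bits are packed one per element in 64-bit
 * words, so the bit for element 70 is bit 6 of word 1 (70 / 64 = 1,
 * 70 % 64 = 6), and vext_elem_mask(v0, 70) simply shifts and masks it out.
 */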
234 
235 /* elements operations for load and store */
236 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
237                                uint32_t idx, void *vd, uintptr_t retaddr);
238 
239 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
240 static void NAME(CPURISCVState *env, abi_ptr addr,         \
241                  uint32_t idx, void *vd, uintptr_t retaddr)\
242 {                                                          \
243     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
244     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
245 }
246 
247 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
248 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
249 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
250 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
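
/*
 * For reference, GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl) expands
 * (modulo whitespace) to:
 *
 *     static void lde_w(CPURISCVState *env, abi_ptr addr,
 *                       uint32_t idx, void *vd, uintptr_t retaddr)
 *     {
 *         int32_t *cur = ((int32_t *)vd + H4(idx));
 *         *cur = cpu_ldl_data_ra(env, addr, retaddr);
 *     }
 */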
251 
252 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
253 static void NAME(CPURISCVState *env, abi_ptr addr,         \
254                  uint32_t idx, void *vd, uintptr_t retaddr)\
255 {                                                          \
256     ETYPE data = *((ETYPE *)vd + H(idx));                  \
257     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
258 }
259 
260 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
261 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
262 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
263 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
264 
265 /*
266  *** stride: access vector element from strided memory
267  */
268 static void
269 vext_ldst_stride(void *vd, void *v0, target_ulong base,
270                  target_ulong stride, CPURISCVState *env,
271                  uint32_t desc, uint32_t vm,
272                  vext_ldst_elem_fn *ldst_elem,
273                  uint32_t log2_esz, uintptr_t ra)
274 {
275     uint32_t i, k;
276     uint32_t nf = vext_nf(desc);
277     uint32_t max_elems = vext_max_elems(desc, log2_esz);
278     uint32_t esz = 1 << log2_esz;
279     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
280     uint32_t vta = vext_vta(desc);
281 
282     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
283         if (!vm && !vext_elem_mask(v0, i)) {
284             continue;
285         }
286 
287         k = 0;
288         while (k < nf) {
289             target_ulong addr = base + stride * i + (k << log2_esz);
290             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
291             k++;
292         }
293     }
294     env->vstart = 0;
295     /* set tail elements to 1s */
296     for (k = 0; k < nf; ++k) {
297         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
298                           (k * max_elems + max_elems) * esz);
299     }
300     if (nf * max_elems % total_elems != 0) {
301         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
302         uint32_t registers_used =
303             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
304         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
305                           registers_used * vlenb);
306     }
307 }
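
/*
 * Illustrative example (assumed values): for vlse32_v with nf = 1,
 * stride = 8 and vl = 4, the loop above loads 32-bit elements from
 * base + 0, base + 8, base + 16 and base + 24 into elements 0..3 of vd;
 * with nf = 2 each iteration also loads the second field from
 * base + stride * i + 4 into element i + max_elems of the destination.
 */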
308 
309 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
310 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
311                   target_ulong stride, CPURISCVState *env,              \
312                   uint32_t desc)                                        \
313 {                                                                       \
314     uint32_t vm = vext_vm(desc);                                        \
315     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
316                      ctzl(sizeof(ETYPE)), GETPC());                     \
317 }
318 
319 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
320 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
321 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
322 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
323 
324 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
325 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
326                   target_ulong stride, CPURISCVState *env,              \
327                   uint32_t desc)                                        \
328 {                                                                       \
329     uint32_t vm = vext_vm(desc);                                        \
330     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
331                      ctzl(sizeof(ETYPE)), GETPC());                     \
332 }
333 
334 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
335 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
336 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
337 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
338 
339 /*
340  *** unit-stride: access elements stored contiguously in memory
341  */
342 
343 /* unmasked unit-stride load and store operation */
344 static void
345 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
346              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
347              uintptr_t ra)
348 {
349     uint32_t i, k;
350     uint32_t nf = vext_nf(desc);
351     uint32_t max_elems = vext_max_elems(desc, log2_esz);
352     uint32_t esz = 1 << log2_esz;
353     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
354     uint32_t vta = vext_vta(desc);
355 
356     /* load/store bytes from/to guest memory */
357     for (i = env->vstart; i < evl; i++, env->vstart++) {
358         k = 0;
359         while (k < nf) {
360             target_ulong addr = base + ((i * nf + k) << log2_esz);
361             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
362             k++;
363         }
364     }
365     env->vstart = 0;
366     /* set tail elements to 1s */
367     for (k = 0; k < nf; ++k) {
368         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
369                           (k * max_elems + max_elems) * esz);
370     }
371     if (nf * max_elems % total_elems != 0) {
372         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
373         uint32_t registers_used =
374             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
375         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
376                           registers_used * vlenb);
377     }
378 }
379 
380 /*
381  * A masked unit-stride load or store operation is a special case of the
382  * strided operation, with stride = NF * sizeof(MTYPE).
383  */
384 
385 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
386 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
387                          CPURISCVState *env, uint32_t desc)             \
388 {                                                                       \
389     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
390     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
391                      ctzl(sizeof(ETYPE)), GETPC());                     \
392 }                                                                       \
393                                                                         \
394 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
395                   CPURISCVState *env, uint32_t desc)                    \
396 {                                                                       \
397     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
398                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
399 }
400 
401 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
402 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
403 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
404 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
405 
406 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
407 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
408                          CPURISCVState *env, uint32_t desc)              \
409 {                                                                        \
410     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
411     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
412                      ctzl(sizeof(ETYPE)), GETPC());                      \
413 }                                                                        \
414                                                                          \
415 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
416                   CPURISCVState *env, uint32_t desc)                     \
417 {                                                                        \
418     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
419                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
420 }
421 
422 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
423 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
424 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
425 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
426 
427 /*
428  *** unit stride mask load and store, EEW = 1
429  */
430 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
431                     CPURISCVState *env, uint32_t desc)
432 {
433     /* evl = ceil(vl/8) */
434     uint8_t evl = (env->vl + 7) >> 3;
435     vext_ldst_us(vd, base, env, desc, lde_b,
436                  0, evl, GETPC());
437 }
438 
439 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, ste_b,
445                  0, evl, GETPC());
446 }
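
/*
 * Example: with vl = 17 the mask occupies ceil(17 / 8) = 3 bytes, so
 * evl = (17 + 7) >> 3 = 3 and exactly three bytes are transferred.
 */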
447 
448 /*
449  *** index: access vector element from indexed memory
450  */
451 typedef target_ulong vext_get_index_addr(target_ulong base,
452         uint32_t idx, void *vs2);
453 
454 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
455 static target_ulong NAME(target_ulong base,            \
456                          uint32_t idx, void *vs2)      \
457 {                                                      \
458     return (base + *((ETYPE *)vs2 + H(idx)));          \
459 }
460 
461 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
462 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
463 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
464 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
465 
466 static inline void
467 vext_ldst_index(void *vd, void *v0, target_ulong base,
468                 void *vs2, CPURISCVState *env, uint32_t desc,
469                 vext_get_index_addr get_index_addr,
470                 vext_ldst_elem_fn *ldst_elem,
471                 uint32_t log2_esz, uintptr_t ra)
472 {
473     uint32_t i, k;
474     uint32_t nf = vext_nf(desc);
475     uint32_t vm = vext_vm(desc);
476     uint32_t max_elems = vext_max_elems(desc, log2_esz);
477     uint32_t esz = 1 << log2_esz;
478     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
479     uint32_t vta = vext_vta(desc);
480 
481     /* load/store bytes from/to guest memory */
482     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
483         if (!vm && !vext_elem_mask(v0, i)) {
484             continue;
485         }
486 
487         k = 0;
488         while (k < nf) {
489             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
490             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
491             k++;
492         }
493     }
494     env->vstart = 0;
495     /* set tail elements to 1s */
496     for (k = 0; k < nf; ++k) {
497         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
498                           (k * max_elems + max_elems) * esz);
499     }
500     if (nf * max_elems % total_elems != 0) {
501         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
502         uint32_t registers_used =
503             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
504         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
505                           registers_used * vlenb);
506     }
507 }
508 
509 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
510 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
511                   void *vs2, CPURISCVState *env, uint32_t desc)            \
512 {                                                                          \
513     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
514                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
515 }
516 
517 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
518 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
519 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
520 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
521 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
522 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
523 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
524 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
525 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
526 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
527 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
528 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
529 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
530 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
531 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
532 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
533 
534 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
535 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
536                   void *vs2, CPURISCVState *env, uint32_t desc)  \
537 {                                                                \
538     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
539                     STORE_FN, ctzl(sizeof(ETYPE)),               \
540                     GETPC());                                    \
541 }
542 
543 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
544 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
545 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
546 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
547 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
548 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
549 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
550 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
551 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
552 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
553 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
554 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
555 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
556 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
557 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
558 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
559 
560 /*
561  *** unit-stride fault-only-first load instructions
562  */
563 static inline void
564 vext_ldff(void *vd, void *v0, target_ulong base,
565           CPURISCVState *env, uint32_t desc,
566           vext_ldst_elem_fn *ldst_elem,
567           uint32_t log2_esz, uintptr_t ra)
568 {
569     void *host;
570     uint32_t i, k, vl = 0;
571     uint32_t nf = vext_nf(desc);
572     uint32_t vm = vext_vm(desc);
573     uint32_t max_elems = vext_max_elems(desc, log2_esz);
574     uint32_t esz = 1 << log2_esz;
575     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
576     uint32_t vta = vext_vta(desc);
577     target_ulong addr, offset, remain;
578 
579     /* probe every access */
580     for (i = env->vstart; i < env->vl; i++) {
581         if (!vm && !vext_elem_mask(v0, i)) {
582             continue;
583         }
584         addr = adjust_addr(env, base + i * (nf << log2_esz));
585         if (i == 0) {
586             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
587         } else {
588             /* if it triggers an exception, no need to check watchpoint */
589             remain = nf << log2_esz;
590             while (remain > 0) {
591                 offset = -(addr | TARGET_PAGE_MASK);
592                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
593                                          cpu_mmu_index(env, false));
594                 if (host) {
595 #ifdef CONFIG_USER_ONLY
596                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
597                         vl = i;
598                         goto ProbeSuccess;
599                     }
600 #else
601                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
602 #endif
603                 } else {
604                     vl = i;
605                     goto ProbeSuccess;
606                 }
607                 if (remain <= offset) {
608                     break;
609                 }
610                 remain -= offset;
611                 addr = adjust_addr(env, addr + offset);
612             }
613         }
614     }
615 ProbeSuccess:
616     /* load bytes from guest memory */
617     if (vl != 0) {
618         env->vl = vl;
619     }
620     for (i = env->vstart; i < env->vl; i++) {
621         k = 0;
622         if (!vm && !vext_elem_mask(v0, i)) {
623             continue;
624         }
625         while (k < nf) {
626             target_ulong addr = base + ((i * nf + k) << log2_esz);
627             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
628             k++;
629         }
630     }
631     env->vstart = 0;
632     /* set tail elements to 1s */
633     for (k = 0; k < nf; ++k) {
634         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
635                           (k * max_elems + max_elems) * esz);
636     }
637     if (nf * max_elems % total_elems != 0) {
638         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
639         uint32_t registers_used =
640             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
641         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
642                           registers_used * vlenb);
643     }
644 }
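
/*
 * Fault-only-first semantics in short: a fault on element 0 traps as
 * usual, but if probing finds that some later element i would fault,
 * vl is reduced to i (see "vl = i" above) and only elements 0 .. i-1
 * are loaded, letting software retry from the shortened vl.
 */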
645 
646 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
647 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
648                   CPURISCVState *env, uint32_t desc)      \
649 {                                                         \
650     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
651               ctzl(sizeof(ETYPE)), GETPC());              \
652 }
653 
654 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
655 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
656 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
657 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
658 
659 #define DO_SWAP(N, M) (M)
660 #define DO_AND(N, M)  (N & M)
661 #define DO_XOR(N, M)  (N ^ M)
662 #define DO_OR(N, M)   (N | M)
663 #define DO_ADD(N, M)  (N + M)
664 
665 /* Signed min/max */
666 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
667 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
668 
669 /* Unsigned min/max */
670 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
671 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
672 
673 /*
674  *** load and store whole register instructions
675  */
676 static void
677 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
678                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
679 {
680     uint32_t i, k, off, pos;
681     uint32_t nf = vext_nf(desc);
682     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
683     uint32_t max_elems = vlenb >> log2_esz;
684 
685     k = env->vstart / max_elems;
686     off = env->vstart % max_elems;
687 
688     if (off) {
689         /* load/store rest of elements of current segment pointed to by vstart */
690         for (pos = off; pos < max_elems; pos++, env->vstart++) {
691             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
692             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
693         }
694         k++;
695     }
696 
697     /* load/store elements for rest of segments */
698     for (; k < nf; k++) {
699         for (i = 0; i < max_elems; i++, env->vstart++) {
700             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
701             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
702         }
703     }
704 
705     env->vstart = 0;
706 }
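
/*
 * Illustrative example (assumed VLEN = 128, so vlenb = 16): vl2re32_v has
 * nf = 2 and log2_esz = 2, giving max_elems = 4 elements per register; it
 * transfers 2 * 16 = 32 bytes regardless of the current vl, resuming
 * mid-register when vstart is non-zero.
 */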
707 
708 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
709 void HELPER(NAME)(void *vd, target_ulong base,       \
710                   CPURISCVState *env, uint32_t desc) \
711 {                                                    \
712     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
713                     ctzl(sizeof(ETYPE)), GETPC());   \
714 }
715 
716 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
717 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
718 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
719 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
720 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
721 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
722 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
723 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
724 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
725 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
726 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
727 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
728 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
729 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
730 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
731 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
732 
733 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
734 void HELPER(NAME)(void *vd, target_ulong base,       \
735                   CPURISCVState *env, uint32_t desc) \
736 {                                                    \
737     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
738                     ctzl(sizeof(ETYPE)), GETPC());   \
739 }
740 
741 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
742 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
743 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
744 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
745 
746 /*
747  *** Vector Integer Arithmetic Instructions
748  */
749 
750 /* expand macro args before macro */
751 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
752 
753 /* (TD, T1, T2, TX1, TX2) */
754 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
755 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
756 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
757 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
758 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
759 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
760 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
761 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
762 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
763 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
764 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
765 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
766 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
767 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
768 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
769 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
770 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
771 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
772 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
773 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
774 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
775 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
776 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
777 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
778 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
779 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
780 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
781 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
782 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
783 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
784 
785 /* operation of two vector elements */
786 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
787 
788 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
789 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
790 {                                                               \
791     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
792     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
793     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
794 }
795 #define DO_SUB(N, M) (N - M)
796 #define DO_RSUB(N, M) (M - N)
797 
798 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
799 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
800 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
801 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
802 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
803 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
804 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
805 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
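
/*
 * For reference, RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
 * expands (modulo whitespace) to:
 *
 *     static void do_vadd_vv_w(void *vd, void *vs1, void *vs2, int i)
 *     {
 *         int32_t s1 = *((int32_t *)vs1 + H4(i));
 *         int32_t s2 = *((int32_t *)vs2 + H4(i));
 *         *((int32_t *)vd + H4(i)) = (s2 + s1);
 *     }
 */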
806 
807 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
808                        CPURISCVState *env, uint32_t desc,
809                        opivv2_fn *fn, uint32_t esz)
810 {
811     uint32_t vm = vext_vm(desc);
812     uint32_t vl = env->vl;
813     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
814     uint32_t vta = vext_vta(desc);
815     uint32_t i;
816 
817     for (i = env->vstart; i < vl; i++) {
818         if (!vm && !vext_elem_mask(v0, i)) {
819             continue;
820         }
821         fn(vd, vs1, vs2, i);
822     }
823     env->vstart = 0;
824     /* set tail elements to 1s */
825     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
826 }
827 
828 /* generate the helpers for OPIVV */
829 #define GEN_VEXT_VV(NAME, ESZ)                            \
830 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
831                   void *vs2, CPURISCVState *env,          \
832                   uint32_t desc)                          \
833 {                                                         \
834     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
835                do_##NAME, ESZ);                           \
836 }
837 
838 GEN_VEXT_VV(vadd_vv_b, 1)
839 GEN_VEXT_VV(vadd_vv_h, 2)
840 GEN_VEXT_VV(vadd_vv_w, 4)
841 GEN_VEXT_VV(vadd_vv_d, 8)
842 GEN_VEXT_VV(vsub_vv_b, 1)
843 GEN_VEXT_VV(vsub_vv_h, 2)
844 GEN_VEXT_VV(vsub_vv_w, 4)
845 GEN_VEXT_VV(vsub_vv_d, 8)
846 
847 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
848 
849 /*
850  * (T1)s1 gives the real operand type.
851  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
852  */
853 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
854 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
855 {                                                                   \
856     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
857     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
858 }
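
/*
 * Example of the cast chain above: for a widening op using WOP_SSS_B
 * (T1 = int8_t, TX1 = int16_t), (T1)s1 truncates the scalar register to
 * 8 bits and (TX1)(T1)s1 then sign-extends that value to 16 bits, so OP
 * sees the scalar at the operand width the instruction expects.
 */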
859 
860 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
861 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
862 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
863 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
864 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
865 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
866 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
867 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
868 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
869 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
870 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
871 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
872 
873 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
874                        CPURISCVState *env, uint32_t desc,
875                        opivx2_fn fn, uint32_t esz)
876 {
877     uint32_t vm = vext_vm(desc);
878     uint32_t vl = env->vl;
879     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
880     uint32_t vta = vext_vta(desc);
881     uint32_t i;
882 
883     for (i = env->vstart; i < vl; i++) {
884         if (!vm && !vext_elem_mask(v0, i)) {
885             continue;
886         }
887         fn(vd, s1, vs2, i);
888     }
889     env->vstart = 0;
890     /* set tail elements to 1s */
891     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
892 }
893 
894 /* generate the helpers for OPIVX */
895 #define GEN_VEXT_VX(NAME, ESZ)                            \
896 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
897                   void *vs2, CPURISCVState *env,          \
898                   uint32_t desc)                          \
899 {                                                         \
900     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
901                do_##NAME, ESZ);                           \
902 }
903 
904 GEN_VEXT_VX(vadd_vx_b, 1)
905 GEN_VEXT_VX(vadd_vx_h, 2)
906 GEN_VEXT_VX(vadd_vx_w, 4)
907 GEN_VEXT_VX(vadd_vx_d, 8)
908 GEN_VEXT_VX(vsub_vx_b, 1)
909 GEN_VEXT_VX(vsub_vx_h, 2)
910 GEN_VEXT_VX(vsub_vx_w, 4)
911 GEN_VEXT_VX(vsub_vx_d, 8)
912 GEN_VEXT_VX(vrsub_vx_b, 1)
913 GEN_VEXT_VX(vrsub_vx_h, 2)
914 GEN_VEXT_VX(vrsub_vx_w, 4)
915 GEN_VEXT_VX(vrsub_vx_d, 8)
916 
917 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
918 {
919     intptr_t oprsz = simd_oprsz(desc);
920     intptr_t i;
921 
922     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
923         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
924     }
925 }
926 
927 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
928 {
929     intptr_t oprsz = simd_oprsz(desc);
930     intptr_t i;
931 
932     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
933         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
934     }
935 }
936 
937 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
938 {
939     intptr_t oprsz = simd_oprsz(desc);
940     intptr_t i;
941 
942     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
943         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
944     }
945 }
946 
947 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
948 {
949     intptr_t oprsz = simd_oprsz(desc);
950     intptr_t i;
951 
952     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
953         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
954     }
955 }
956 
957 /* Vector Widening Integer Add/Subtract */
964 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
965 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
966 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
967 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
968 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
969 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
970 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
971 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
972 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
973 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
974 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
975 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
976 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
977 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
978 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
979 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
980 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
981 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
982 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
983 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
984 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
985 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
986 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
987 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
988 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
989 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
990 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
991 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
992 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
993 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
994 GEN_VEXT_VV(vwaddu_vv_b, 2)
995 GEN_VEXT_VV(vwaddu_vv_h, 4)
996 GEN_VEXT_VV(vwaddu_vv_w, 8)
997 GEN_VEXT_VV(vwsubu_vv_b, 2)
998 GEN_VEXT_VV(vwsubu_vv_h, 4)
999 GEN_VEXT_VV(vwsubu_vv_w, 8)
1000 GEN_VEXT_VV(vwadd_vv_b, 2)
1001 GEN_VEXT_VV(vwadd_vv_h, 4)
1002 GEN_VEXT_VV(vwadd_vv_w, 8)
1003 GEN_VEXT_VV(vwsub_vv_b, 2)
1004 GEN_VEXT_VV(vwsub_vv_h, 4)
1005 GEN_VEXT_VV(vwsub_vv_w, 8)
1006 GEN_VEXT_VV(vwaddu_wv_b, 2)
1007 GEN_VEXT_VV(vwaddu_wv_h, 4)
1008 GEN_VEXT_VV(vwaddu_wv_w, 8)
1009 GEN_VEXT_VV(vwsubu_wv_b, 2)
1010 GEN_VEXT_VV(vwsubu_wv_h, 4)
1011 GEN_VEXT_VV(vwsubu_wv_w, 8)
1012 GEN_VEXT_VV(vwadd_wv_b, 2)
1013 GEN_VEXT_VV(vwadd_wv_h, 4)
1014 GEN_VEXT_VV(vwadd_wv_w, 8)
1015 GEN_VEXT_VV(vwsub_wv_b, 2)
1016 GEN_VEXT_VV(vwsub_wv_h, 4)
1017 GEN_VEXT_VV(vwsub_wv_w, 8)
1018 
1019 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1020 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1021 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1022 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1023 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1024 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1025 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1026 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1027 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1028 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1029 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1030 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1031 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1032 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1033 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1034 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1035 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1036 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1037 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1038 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1039 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1040 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1041 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1042 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1043 GEN_VEXT_VX(vwaddu_vx_b, 2)
1044 GEN_VEXT_VX(vwaddu_vx_h, 4)
1045 GEN_VEXT_VX(vwaddu_vx_w, 8)
1046 GEN_VEXT_VX(vwsubu_vx_b, 2)
1047 GEN_VEXT_VX(vwsubu_vx_h, 4)
1048 GEN_VEXT_VX(vwsubu_vx_w, 8)
1049 GEN_VEXT_VX(vwadd_vx_b, 2)
1050 GEN_VEXT_VX(vwadd_vx_h, 4)
1051 GEN_VEXT_VX(vwadd_vx_w, 8)
1052 GEN_VEXT_VX(vwsub_vx_b, 2)
1053 GEN_VEXT_VX(vwsub_vx_h, 4)
1054 GEN_VEXT_VX(vwsub_vx_w, 8)
1055 GEN_VEXT_VX(vwaddu_wx_b, 2)
1056 GEN_VEXT_VX(vwaddu_wx_h, 4)
1057 GEN_VEXT_VX(vwaddu_wx_w, 8)
1058 GEN_VEXT_VX(vwsubu_wx_b, 2)
1059 GEN_VEXT_VX(vwsubu_wx_h, 4)
1060 GEN_VEXT_VX(vwsubu_wx_w, 8)
1061 GEN_VEXT_VX(vwadd_wx_b, 2)
1062 GEN_VEXT_VX(vwadd_wx_h, 4)
1063 GEN_VEXT_VX(vwadd_wx_w, 8)
1064 GEN_VEXT_VX(vwsub_wx_b, 2)
1065 GEN_VEXT_VX(vwsub_wx_h, 4)
1066 GEN_VEXT_VX(vwsub_wx_w, 8)
1067 
1068 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1069 #define DO_VADC(N, M, C) (N + M + C)
1070 #define DO_VSBC(N, M, C) (N - M - C)
1071 
1072 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1073 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1074                   CPURISCVState *env, uint32_t desc)          \
1075 {                                                             \
1076     uint32_t vl = env->vl;                                    \
1077     uint32_t esz = sizeof(ETYPE);                             \
1078     uint32_t total_elems =                                    \
1079         vext_get_total_elems(env, desc, esz);                 \
1080     uint32_t vta = vext_vta(desc);                            \
1081     uint32_t i;                                               \
1082                                                               \
1083     for (i = env->vstart; i < vl; i++) {                      \
1084         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1085         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1086         ETYPE carry = vext_elem_mask(v0, i);                  \
1087                                                               \
1088         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1089     }                                                         \
1090     env->vstart = 0;                                          \
1091     /* set tail elements to 1s */                             \
1092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1093 }
1094 
1095 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1096 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1097 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1098 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1099 
1100 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1101 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1102 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1103 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1104 
1105 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1106 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1107                   CPURISCVState *env, uint32_t desc)                     \
1108 {                                                                        \
1109     uint32_t vl = env->vl;                                               \
1110     uint32_t esz = sizeof(ETYPE);                                        \
1111     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1112     uint32_t vta = vext_vta(desc);                                       \
1113     uint32_t i;                                                          \
1114                                                                          \
1115     for (i = env->vstart; i < vl; i++) {                                 \
1116         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1117         ETYPE carry = vext_elem_mask(v0, i);                             \
1118                                                                          \
1119         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1120     }                                                                    \
1121     env->vstart = 0;                                          \
1122     /* set tail elements to 1s */                                        \
1123     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1124 }
1125 
1126 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1127 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1128 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1129 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1130 
1131 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1132 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1133 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1134 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1135 
1136 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1137                           (__typeof(N))(N + M) < N)
1138 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
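
/*
 * Example of the carry-out computation: for uint8_t operands N = 200,
 * M = 100 and C = 0, (uint8_t)(N + M) wraps to 44, and 44 < 200 reports
 * a carry out of 1; DO_MSBC likewise reports a borrow when N < M
 * (or N <= M when a borrow comes in).
 */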
1139 
1140 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1141 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1142                   CPURISCVState *env, uint32_t desc)          \
1143 {                                                             \
1144     uint32_t vl = env->vl;                                    \
1145     uint32_t vm = vext_vm(desc);                              \
1146     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1147     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1148     uint32_t i;                                               \
1149                                                               \
1150     for (i = env->vstart; i < vl; i++) {                      \
1151         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1152         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1153         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1154         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1155     }                                                         \
1156     env->vstart = 0;                                          \
1157     /* mask destination registers are always tail-agnostic */ \
1158     /* set tail elements to 1s */                             \
1159     if (vta_all_1s) {                                         \
1160         for (; i < total_elems; i++) {                        \
1161             vext_set_elem_mask(vd, i, 1);                     \
1162         }                                                     \
1163     }                                                         \
1164 }
1165 
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1167 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1168 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1169 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1170 
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1172 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1173 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1174 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1175 
1176 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1177 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1178                   void *vs2, CPURISCVState *env, uint32_t desc) \
1179 {                                                               \
1180     uint32_t vl = env->vl;                                      \
1181     uint32_t vm = vext_vm(desc);                                \
1182     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1183     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1184     uint32_t i;                                                 \
1185                                                                 \
1186     for (i = env->vstart; i < vl; i++) {                        \
1187         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1188         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1189         vext_set_elem_mask(vd, i,                               \
1190                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1191     }                                                           \
1192     env->vstart = 0;                                            \
1193     /* mask destination registers are always tail-agnostic */   \
1194     /* set tail elements to 1s */                               \
1195     if (vta_all_1s) {                                           \
1196         for (; i < total_elems; i++) {                          \
1197             vext_set_elem_mask(vd, i, 1);                       \
1198         }                                                       \
1199     }                                                           \
1200 }
1201 
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1204 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1205 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1206 
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1209 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1210 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1211 
1212 /* Vector Bitwise Logical Instructions */
1213 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1215 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1216 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1217 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1219 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1220 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1221 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1223 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1224 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1225 GEN_VEXT_VV(vand_vv_b, 1)
1226 GEN_VEXT_VV(vand_vv_h, 2)
1227 GEN_VEXT_VV(vand_vv_w, 4)
1228 GEN_VEXT_VV(vand_vv_d, 8)
1229 GEN_VEXT_VV(vor_vv_b, 1)
1230 GEN_VEXT_VV(vor_vv_h, 2)
1231 GEN_VEXT_VV(vor_vv_w, 4)
1232 GEN_VEXT_VV(vor_vv_d, 8)
1233 GEN_VEXT_VV(vxor_vv_b, 1)
1234 GEN_VEXT_VV(vxor_vv_h, 2)
1235 GEN_VEXT_VV(vxor_vv_w, 4)
1236 GEN_VEXT_VV(vxor_vv_d, 8)
1237 
1238 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1240 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1241 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1242 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1244 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1245 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1246 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1248 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1249 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1250 GEN_VEXT_VX(vand_vx_b, 1)
1251 GEN_VEXT_VX(vand_vx_h, 2)
1252 GEN_VEXT_VX(vand_vx_w, 4)
1253 GEN_VEXT_VX(vand_vx_d, 8)
1254 GEN_VEXT_VX(vor_vx_b, 1)
1255 GEN_VEXT_VX(vor_vx_h, 2)
1256 GEN_VEXT_VX(vor_vx_w, 4)
1257 GEN_VEXT_VX(vor_vx_d, 8)
1258 GEN_VEXT_VX(vxor_vx_b, 1)
1259 GEN_VEXT_VX(vxor_vx_h, 2)
1260 GEN_VEXT_VX(vxor_vx_w, 4)
1261 GEN_VEXT_VX(vxor_vx_d, 8)
1262 
1263 /* Vector Single-Width Bit Shift Instructions */
1264 #define DO_SLL(N, M)  (N << (M))
1265 #define DO_SRL(N, M)  (N >> (M))
1266 
1267 /* generate the helpers for shift instructions with two vector operators */
1268 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1269 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1270                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1271 {                                                                         \
1272     uint32_t vm = vext_vm(desc);                                          \
1273     uint32_t vl = env->vl;                                                \
1274     uint32_t esz = sizeof(TS1);                                           \
1275     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1276     uint32_t vta = vext_vta(desc);                                        \
1277     uint32_t i;                                                           \
1278                                                                           \
1279     for (i = env->vstart; i < vl; i++) {                                  \
1280         if (!vm && !vext_elem_mask(v0, i)) {                              \
1281             continue;                                                     \
1282         }                                                                 \
1283         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1284         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1285         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1286     }                                                                     \
1287     env->vstart = 0;                                                      \
1288     /* set tail elements to 1s */                                         \
1289     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1290 }
1291 
1292 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1293 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1294 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1295 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1296 
1297 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1298 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1299 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1300 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1301 
1302 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1303 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1304 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1305 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
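/*
 * Illustrative note (not from the original source): vsra reuses DO_SRL
 * because TS2 is a signed type here, so "s2 >> shift" is an arithmetic
 * (sign-propagating) shift on the compilers QEMU supports.  MASK keeps
 * the shift amount within 0..SEW-1, e.g. for SEW = 8:
 *     s1 = 0x0b, MASK = 0x7          -> effective shift = 3
 *     s2 = (int8_t)0xf0 (-16) >> 3   -> (int8_t)0xfe (-2)
 */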
1306 
1307 /* generate the helpers for shift instructions with one vector and one scalar */
1308 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1309 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1310         void *vs2, CPURISCVState *env, uint32_t desc)       \
1311 {                                                           \
1312     uint32_t vm = vext_vm(desc);                            \
1313     uint32_t vl = env->vl;                                  \
1314     uint32_t esz = sizeof(TD);                              \
1315     uint32_t total_elems =                                  \
1316         vext_get_total_elems(env, desc, esz);               \
1317     uint32_t vta = vext_vta(desc);                          \
1318     uint32_t i;                                             \
1319                                                             \
1320     for (i = env->vstart; i < vl; i++) {                    \
1321         if (!vm && !vext_elem_mask(v0, i)) {                \
1322             continue;                                       \
1323         }                                                   \
1324         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1325         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1326     }                                                       \
1327     env->vstart = 0;                                        \
1328     /* set tail elements to 1s */                           \
1329     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1330 }
1331 
1332 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1333 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1334 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1335 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1336 
1337 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1338 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1339 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1341 
1342 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1343 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1346 
1347 /* Vector Narrowing Integer Right Shift Instructions */
1348 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1349 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1350 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1351 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1352 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1353 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1354 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1355 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1356 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1357 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1358 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1359 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
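/*
 * Illustrative note (not from the original source): the narrowing shifts
 * reuse the macros above with a 2*SEW-wide source type and a SEW-wide
 * destination, so the store truncates the shifted value; the shift amount
 * is masked to 0..2*SEW-1.  E.g. vnsrl with SEW = 8:
 *     source = 0x1234 (uint16_t), shift = 4
 *     0x1234 >> 4 = 0x0123           -> stored as (uint8_t)0x23
 */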
1360 
1361 /* Vector Integer Comparison Instructions */
1362 #define DO_MSEQ(N, M) (N == M)
1363 #define DO_MSNE(N, M) (N != M)
1364 #define DO_MSLT(N, M) (N < M)
1365 #define DO_MSLE(N, M) (N <= M)
1366 #define DO_MSGT(N, M) (N > M)
1367 
1368 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1369 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1370                   CPURISCVState *env, uint32_t desc)          \
1371 {                                                             \
1372     uint32_t vm = vext_vm(desc);                              \
1373     uint32_t vl = env->vl;                                    \
1374     uint32_t i;                                               \
1375                                                               \
1376     for (i = env->vstart; i < vl; i++) {                      \
1377         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1378         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1379         if (!vm && !vext_elem_mask(v0, i)) {                  \
1380             continue;                                         \
1381         }                                                     \
1382         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1383     }                                                         \
1384     env->vstart = 0;                                          \
1385 }
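/*
 * Note (added for clarity): the comparison result is a mask, one bit per
 * element written with vext_set_elem_mask(); elements skipped because of
 * the v0 mask keep whatever bit was previously in vd.  E.g. for i = 3,
 * DO_MSEQ(5, 5) is true, so bit 3 of vd is set.
 */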
1386 
1387 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1388 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1389 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1390 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1391 
1392 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1393 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1394 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1395 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1396 
1397 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1398 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1399 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1400 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1401 
1402 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1403 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1404 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1405 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1406 
1407 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1408 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1409 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1410 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1411 
1412 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1413 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1414 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1415 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1416 
1417 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1418 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1419                   CPURISCVState *env, uint32_t desc)                \
1420 {                                                                   \
1421     uint32_t vm = vext_vm(desc);                                    \
1422     uint32_t vl = env->vl;                                          \
1423     uint32_t i;                                                     \
1424                                                                     \
1425     for (i = env->vstart; i < vl; i++) {                            \
1426         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1427         if (!vm && !vext_elem_mask(v0, i)) {                        \
1428             continue;                                               \
1429         }                                                           \
1430         vext_set_elem_mask(vd, i,                                   \
1431                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1432     }                                                               \
1433     env->vstart = 0;                                                \
1434 }
1435 
1436 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1437 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1438 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1439 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1440 
1441 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1442 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1443 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1444 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1445 
1446 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1447 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1448 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1449 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1450 
1451 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1452 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1453 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1454 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1455 
1456 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1457 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1458 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1459 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1460 
1461 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1462 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1463 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1464 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1465 
1466 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1467 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1468 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1469 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1470 
1471 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1472 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1473 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1474 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1475 
1476 /* Vector Integer Min/Max Instructions */
1477 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1478 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1479 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1480 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1481 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1482 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1483 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1484 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1485 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1486 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1487 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1488 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1489 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1490 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1491 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1492 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1493 GEN_VEXT_VV(vminu_vv_b, 1)
1494 GEN_VEXT_VV(vminu_vv_h, 2)
1495 GEN_VEXT_VV(vminu_vv_w, 4)
1496 GEN_VEXT_VV(vminu_vv_d, 8)
1497 GEN_VEXT_VV(vmin_vv_b, 1)
1498 GEN_VEXT_VV(vmin_vv_h, 2)
1499 GEN_VEXT_VV(vmin_vv_w, 4)
1500 GEN_VEXT_VV(vmin_vv_d, 8)
1501 GEN_VEXT_VV(vmaxu_vv_b, 1)
1502 GEN_VEXT_VV(vmaxu_vv_h, 2)
1503 GEN_VEXT_VV(vmaxu_vv_w, 4)
1504 GEN_VEXT_VV(vmaxu_vv_d, 8)
1505 GEN_VEXT_VV(vmax_vv_b, 1)
1506 GEN_VEXT_VV(vmax_vv_h, 2)
1507 GEN_VEXT_VV(vmax_vv_w, 4)
1508 GEN_VEXT_VV(vmax_vv_d, 8)
1509 
1510 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1511 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1512 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1513 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1514 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1515 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1516 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1517 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1518 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1519 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1520 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1521 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1522 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1523 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1524 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1525 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1526 GEN_VEXT_VX(vminu_vx_b, 1)
1527 GEN_VEXT_VX(vminu_vx_h, 2)
1528 GEN_VEXT_VX(vminu_vx_w, 4)
1529 GEN_VEXT_VX(vminu_vx_d, 8)
1530 GEN_VEXT_VX(vmin_vx_b, 1)
1531 GEN_VEXT_VX(vmin_vx_h, 2)
1532 GEN_VEXT_VX(vmin_vx_w, 4)
1533 GEN_VEXT_VX(vmin_vx_d, 8)
1534 GEN_VEXT_VX(vmaxu_vx_b, 1)
1535 GEN_VEXT_VX(vmaxu_vx_h, 2)
1536 GEN_VEXT_VX(vmaxu_vx_w, 4)
1537 GEN_VEXT_VX(vmaxu_vx_d, 8)
1538 GEN_VEXT_VX(vmax_vx_b, 1)
1539 GEN_VEXT_VX(vmax_vx_h, 2)
1540 GEN_VEXT_VX(vmax_vx_w, 4)
1541 GEN_VEXT_VX(vmax_vx_d, 8)
1542 
1543 /* Vector Single-Width Integer Multiply Instructions */
1544 #define DO_MUL(N, M) (N * M)
1545 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1546 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1547 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1548 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1549 GEN_VEXT_VV(vmul_vv_b, 1)
1550 GEN_VEXT_VV(vmul_vv_h, 2)
1551 GEN_VEXT_VV(vmul_vv_w, 4)
1552 GEN_VEXT_VV(vmul_vv_d, 8)
1553 
1554 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1555 {
1556     return (int16_t)s2 * (int16_t)s1 >> 8;
1557 }
1558 
1559 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1560 {
1561     return (int32_t)s2 * (int32_t)s1 >> 16;
1562 }
1563 
1564 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1565 {
1566     return (int64_t)s2 * (int64_t)s1 >> 32;
1567 }
1568 
1569 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1570 {
1571     uint64_t hi_64, lo_64;
1572 
1573     muls64(&lo_64, &hi_64, s1, s2);
1574     return hi_64;
1575 }
1576 
1577 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1578 {
1579     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1580 }
1581 
1582 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1583 {
1584     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1585 }
1586 
1587 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1588 {
1589     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1590 }
1591 
1592 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1593 {
1594     uint64_t hi_64, lo_64;
1595 
1596     mulu64(&lo_64, &hi_64, s2, s1);
1597     return hi_64;
1598 }
1599 
1600 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1601 {
1602     return (int16_t)s2 * (uint16_t)s1 >> 8;
1603 }
1604 
1605 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1606 {
1607     return (int32_t)s2 * (uint32_t)s1 >> 16;
1608 }
1609 
1610 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1611 {
1612     return (int64_t)s2 * (uint64_t)s1 >> 32;
1613 }
1614 
1615 /*
1616  * Signed (s2) by unsigned (s1) high-half multiply via mulu64:
1617  *
1618  * Let  A = signed operand (s2),
1619  *      B = unsigned operand (s1),
1620  *      P = mulu64(A, B), the full unsigned product.
1621  *
1622  * When A < 0 its unsigned bit pattern has the value A + 2 ** 64, so
1623  *      P  = (A + 2 ** 64) * B
1624  *         = A * B + 2 ** 64 * B
1625  * and the signed product is
1626  *      SP = A * B = P - 2 ** 64 * B.
1627  * When A >= 0, SP = P.
1628  *
1629  * Subtracting 2 ** 64 * B only affects the upper 64 bits, so the high
1630  * half of the signed product follows from the unsigned one by
1631  *      HI_P -= (A < 0 ? B : 0)
1632  */
1633 
1634 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1635 {
1636     uint64_t hi_64, lo_64;
1637 
1638     mulu64(&lo_64, &hi_64, s2, s1);
1639 
1640     hi_64 -= s2 < 0 ? s1 : 0;
1641     return hi_64;
1642 }
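/*
 * Worked example of the correction above, scaled down to 8-bit operands
 * for readability (the 64-bit case is structurally identical):
 *     A = -2 (bit pattern 0xfe), B = 3
 *     unsigned product: 0xfe * 0x03 = 0x02fa          -> hi = 0x02
 *     signed product:   -2 * 3 = -6 = 0xfffa          -> hi = 0xff
 *     correction:       0x02 - B = 0x02 - 0x03 = 0xff (mod 2 ** 8)
 */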
1643 
1644 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1645 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1646 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1647 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1648 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1649 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1650 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1651 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1652 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1653 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1654 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1655 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1656 GEN_VEXT_VV(vmulh_vv_b, 1)
1657 GEN_VEXT_VV(vmulh_vv_h, 2)
1658 GEN_VEXT_VV(vmulh_vv_w, 4)
1659 GEN_VEXT_VV(vmulh_vv_d, 8)
1660 GEN_VEXT_VV(vmulhu_vv_b, 1)
1661 GEN_VEXT_VV(vmulhu_vv_h, 2)
1662 GEN_VEXT_VV(vmulhu_vv_w, 4)
1663 GEN_VEXT_VV(vmulhu_vv_d, 8)
1664 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1665 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1666 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1667 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1668 
1669 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1670 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1671 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1672 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1673 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1674 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1675 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1676 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1677 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1678 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1679 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1680 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1681 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1682 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1683 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1684 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1685 GEN_VEXT_VX(vmul_vx_b, 1)
1686 GEN_VEXT_VX(vmul_vx_h, 2)
1687 GEN_VEXT_VX(vmul_vx_w, 4)
1688 GEN_VEXT_VX(vmul_vx_d, 8)
1689 GEN_VEXT_VX(vmulh_vx_b, 1)
1690 GEN_VEXT_VX(vmulh_vx_h, 2)
1691 GEN_VEXT_VX(vmulh_vx_w, 4)
1692 GEN_VEXT_VX(vmulh_vx_d, 8)
1693 GEN_VEXT_VX(vmulhu_vx_b, 1)
1694 GEN_VEXT_VX(vmulhu_vx_h, 2)
1695 GEN_VEXT_VX(vmulhu_vx_w, 4)
1696 GEN_VEXT_VX(vmulhu_vx_d, 8)
1697 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1698 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1699 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1700 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1701 
1702 /* Vector Integer Divide Instructions */
1703 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1704 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1705 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1706         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1707 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1708         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
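/*
 * Note (added for clarity): these macros implement the base-ISA division
 * semantics that the vector instructions follow:
 *     divide by zero:  quotient = -1 (all bits set), remainder = N
 *     INT_MIN / -1:    quotient = N (INT_MIN),       remainder = 0
 * The test (N == -N) is true only for 0 and the most negative value, so
 * together with M == -1 it flags the signed-overflow case; catching
 * 0 / -1 as well is harmless, since both branches then yield 0.
 */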
1709 
1710 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1711 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1712 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1713 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1714 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1715 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1716 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1717 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1718 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1719 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1720 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1721 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1722 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1723 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1724 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1725 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1726 GEN_VEXT_VV(vdivu_vv_b, 1)
1727 GEN_VEXT_VV(vdivu_vv_h, 2)
1728 GEN_VEXT_VV(vdivu_vv_w, 4)
1729 GEN_VEXT_VV(vdivu_vv_d, 8)
1730 GEN_VEXT_VV(vdiv_vv_b, 1)
1731 GEN_VEXT_VV(vdiv_vv_h, 2)
1732 GEN_VEXT_VV(vdiv_vv_w, 4)
1733 GEN_VEXT_VV(vdiv_vv_d, 8)
1734 GEN_VEXT_VV(vremu_vv_b, 1)
1735 GEN_VEXT_VV(vremu_vv_h, 2)
1736 GEN_VEXT_VV(vremu_vv_w, 4)
1737 GEN_VEXT_VV(vremu_vv_d, 8)
1738 GEN_VEXT_VV(vrem_vv_b, 1)
1739 GEN_VEXT_VV(vrem_vv_h, 2)
1740 GEN_VEXT_VV(vrem_vv_w, 4)
1741 GEN_VEXT_VV(vrem_vv_d, 8)
1742 
1743 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1744 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1745 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1746 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1747 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1748 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1749 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1750 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1751 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1752 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1753 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1754 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1755 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1756 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1757 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1758 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1759 GEN_VEXT_VX(vdivu_vx_b, 1)
1760 GEN_VEXT_VX(vdivu_vx_h, 2)
1761 GEN_VEXT_VX(vdivu_vx_w, 4)
1762 GEN_VEXT_VX(vdivu_vx_d, 8)
1763 GEN_VEXT_VX(vdiv_vx_b, 1)
1764 GEN_VEXT_VX(vdiv_vx_h, 2)
1765 GEN_VEXT_VX(vdiv_vx_w, 4)
1766 GEN_VEXT_VX(vdiv_vx_d, 8)
1767 GEN_VEXT_VX(vremu_vx_b, 1)
1768 GEN_VEXT_VX(vremu_vx_h, 2)
1769 GEN_VEXT_VX(vremu_vx_w, 4)
1770 GEN_VEXT_VX(vremu_vx_d, 8)
1771 GEN_VEXT_VX(vrem_vx_b, 1)
1772 GEN_VEXT_VX(vrem_vx_h, 2)
1773 GEN_VEXT_VX(vrem_vx_w, 4)
1774 GEN_VEXT_VX(vrem_vx_d, 8)
1775 
1776 /* Vector Widening Integer Multiply Instructions */
1777 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1778 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1779 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1780 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1781 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1782 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1783 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1784 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1785 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1786 GEN_VEXT_VV(vwmul_vv_b, 2)
1787 GEN_VEXT_VV(vwmul_vv_h, 4)
1788 GEN_VEXT_VV(vwmul_vv_w, 8)
1789 GEN_VEXT_VV(vwmulu_vv_b, 2)
1790 GEN_VEXT_VV(vwmulu_vv_h, 4)
1791 GEN_VEXT_VV(vwmulu_vv_w, 8)
1792 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1793 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1794 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1795 
1796 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1797 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1798 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1799 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1800 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1801 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1802 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1803 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1804 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1805 GEN_VEXT_VX(vwmul_vx_b, 2)
1806 GEN_VEXT_VX(vwmul_vx_h, 4)
1807 GEN_VEXT_VX(vwmul_vx_w, 8)
1808 GEN_VEXT_VX(vwmulu_vx_b, 2)
1809 GEN_VEXT_VX(vwmulu_vx_h, 4)
1810 GEN_VEXT_VX(vwmulu_vx_w, 8)
1811 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1812 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1813 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1814 
1815 /* Vector Single-Width Integer Multiply-Add Instructions */
1816 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1817 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1818 {                                                                  \
1819     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1820     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1821     TD d = *((TD *)vd + HD(i));                                    \
1822     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1823 }
1824 
1825 #define DO_MACC(N, M, D) (M * N + D)
1826 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1827 #define DO_MADD(N, M, D) (M * D + N)
1828 #define DO_NMSUB(N, M, D) (-(M * D) + N)
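/*
 * Note on operand roles (added for clarity): OP is invoked as
 * OP(s2, s1, d), i.e. N = vs2, M = vs1/rs1, D = vd, which gives
 *     vmacc:  vd = vs1 * vs2 + vd       (DO_MACC  = M * N + D)
 *     vnmsac: vd = -(vs1 * vs2) + vd    (DO_NMSAC)
 *     vmadd:  vd = vs1 * vd + vs2       (DO_MADD  = M * D + N)
 *     vnmsub: vd = -(vs1 * vd) + vs2    (DO_NMSUB)
 */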
1829 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1830 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1831 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1832 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1833 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1834 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1835 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1836 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1837 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1838 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1839 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1840 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1841 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1842 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1843 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1844 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1845 GEN_VEXT_VV(vmacc_vv_b, 1)
1846 GEN_VEXT_VV(vmacc_vv_h, 2)
1847 GEN_VEXT_VV(vmacc_vv_w, 4)
1848 GEN_VEXT_VV(vmacc_vv_d, 8)
1849 GEN_VEXT_VV(vnmsac_vv_b, 1)
1850 GEN_VEXT_VV(vnmsac_vv_h, 2)
1851 GEN_VEXT_VV(vnmsac_vv_w, 4)
1852 GEN_VEXT_VV(vnmsac_vv_d, 8)
1853 GEN_VEXT_VV(vmadd_vv_b, 1)
1854 GEN_VEXT_VV(vmadd_vv_h, 2)
1855 GEN_VEXT_VV(vmadd_vv_w, 4)
1856 GEN_VEXT_VV(vmadd_vv_d, 8)
1857 GEN_VEXT_VV(vnmsub_vv_b, 1)
1858 GEN_VEXT_VV(vnmsub_vv_h, 2)
1859 GEN_VEXT_VV(vnmsub_vv_w, 4)
1860 GEN_VEXT_VV(vnmsub_vv_d, 8)
1861 
1862 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1863 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1864 {                                                                   \
1865     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1866     TD d = *((TD *)vd + HD(i));                                     \
1867     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1868 }
1869 
1870 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1871 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1872 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1873 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1874 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1875 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1876 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1877 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1878 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1879 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1880 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1881 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1882 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1883 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1884 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1885 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1886 GEN_VEXT_VX(vmacc_vx_b, 1)
1887 GEN_VEXT_VX(vmacc_vx_h, 2)
1888 GEN_VEXT_VX(vmacc_vx_w, 4)
1889 GEN_VEXT_VX(vmacc_vx_d, 8)
1890 GEN_VEXT_VX(vnmsac_vx_b, 1)
1891 GEN_VEXT_VX(vnmsac_vx_h, 2)
1892 GEN_VEXT_VX(vnmsac_vx_w, 4)
1893 GEN_VEXT_VX(vnmsac_vx_d, 8)
1894 GEN_VEXT_VX(vmadd_vx_b, 1)
1895 GEN_VEXT_VX(vmadd_vx_h, 2)
1896 GEN_VEXT_VX(vmadd_vx_w, 4)
1897 GEN_VEXT_VX(vmadd_vx_d, 8)
1898 GEN_VEXT_VX(vnmsub_vx_b, 1)
1899 GEN_VEXT_VX(vnmsub_vx_h, 2)
1900 GEN_VEXT_VX(vnmsub_vx_w, 4)
1901 GEN_VEXT_VX(vnmsub_vx_d, 8)
1902 
1903 /* Vector Widening Integer Multiply-Add Instructions */
1904 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1905 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1906 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1907 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1908 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1909 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1910 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1911 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1912 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1913 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1914 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1915 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1916 GEN_VEXT_VV(vwmacc_vv_b, 2)
1917 GEN_VEXT_VV(vwmacc_vv_h, 4)
1918 GEN_VEXT_VV(vwmacc_vv_w, 8)
1919 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1920 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1921 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1922 
1923 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1924 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1925 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1926 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1927 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1928 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1929 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1930 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1931 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1932 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1933 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1934 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1935 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1936 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1937 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1938 GEN_VEXT_VX(vwmacc_vx_b, 2)
1939 GEN_VEXT_VX(vwmacc_vx_h, 4)
1940 GEN_VEXT_VX(vwmacc_vx_w, 8)
1941 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1942 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1943 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1944 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1945 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1946 GEN_VEXT_VX(vwmaccus_vx_w, 8)
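/*
 * Note (added for clarity): vwmaccus only exists in the .vx form (scalar
 * unsigned times vector signed), which is why there is no corresponding
 * entry in the vector-vector group above.
 */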
1947 
1948 /* Vector Integer Merge and Move Instructions */
1949 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1950 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1951                   uint32_t desc)                                     \
1952 {                                                                    \
1953     uint32_t vl = env->vl;                                           \
1954     uint32_t i;                                                      \
1955                                                                      \
1956     for (i = env->vstart; i < vl; i++) {                             \
1957         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1958         *((ETYPE *)vd + H(i)) = s1;                                  \
1959     }                                                                \
1960     env->vstart = 0;                                                 \
1961 }
1962 
1963 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1964 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1965 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1966 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1967 
1968 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1969 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1970                   uint32_t desc)                                     \
1971 {                                                                    \
1972     uint32_t vl = env->vl;                                           \
1973     uint32_t i;                                                      \
1974                                                                      \
1975     for (i = env->vstart; i < vl; i++) {                             \
1976         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1977     }                                                                \
1978     env->vstart = 0;                                                 \
1979 }
1980 
1981 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1982 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1983 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1984 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1985 
1986 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1987 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1988                   CPURISCVState *env, uint32_t desc)                 \
1989 {                                                                    \
1990     uint32_t vl = env->vl;                                           \
1991     uint32_t i;                                                      \
1992                                                                      \
1993     for (i = env->vstart; i < vl; i++) {                             \
1994         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1995         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1996     }                                                                \
1997     env->vstart = 0;                                                 \
1998 }
1999 
2000 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2001 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2002 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2003 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2004 
2005 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2006 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2007                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2008 {                                                                    \
2009     uint32_t vl = env->vl;                                           \
2010     uint32_t i;                                                      \
2011                                                                      \
2012     for (i = env->vstart; i < vl; i++) {                             \
2013         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2014         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2015                    (ETYPE)(target_long)s1);                          \
2016         *((ETYPE *)vd + H(i)) = d;                                   \
2017     }                                                                \
2018     env->vstart = 0;                                                 \
2019 }
2020 
2021 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2022 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2023 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2024 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2025 
2026 /*
2027  *** Vector Fixed-Point Arithmetic Instructions
2028  */
2029 
2030 /* Vector Single-Width Saturating Add and Subtract */
2031 
2032 /*
2033  * Fixed-point instructions carry a rounding mode and may saturate, so
2034  * define the common fixed-point macros here.
2035  */
2036 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2037                           CPURISCVState *env, int vxrm);
2038 
2039 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2040 static inline void                                                  \
2041 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2042           CPURISCVState *env, int vxrm)                             \
2043 {                                                                   \
2044     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2045     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2046     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2047 }
2048 
2049 static inline void
2050 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2051              CPURISCVState *env,
2052              uint32_t vl, uint32_t vm, int vxrm,
2053              opivv2_rm_fn *fn)
2054 {
2055     for (uint32_t i = env->vstart; i < vl; i++) {
2056         if (!vm && !vext_elem_mask(v0, i)) {
2057             continue;
2058         }
2059         fn(vd, vs1, vs2, i, env, vxrm);
2060     }
2061     env->vstart = 0;
2062 }
2063 
2064 static inline void
2065 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2066              CPURISCVState *env,
2067              uint32_t desc,
2068              opivv2_rm_fn *fn)
2069 {
2070     uint32_t vm = vext_vm(desc);
2071     uint32_t vl = env->vl;
2072 
2073     switch (env->vxrm) {
2074     case 0: /* rnu */
2075         vext_vv_rm_1(vd, v0, vs1, vs2,
2076                      env, vl, vm, 0, fn);
2077         break;
2078     case 1: /* rne */
2079         vext_vv_rm_1(vd, v0, vs1, vs2,
2080                      env, vl, vm, 1, fn);
2081         break;
2082     case 2: /* rdn */
2083         vext_vv_rm_1(vd, v0, vs1, vs2,
2084                      env, vl, vm, 2, fn);
2085         break;
2086     default: /* rod */
2087         vext_vv_rm_1(vd, v0, vs1, vs2,
2088                      env, vl, vm, 3, fn);
2089         break;
2090     }
2091 }
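/*
 * Note (added for clarity): env->vxrm is read once here and handed down
 * as a literal constant per case rather than being re-read for every
 * element, presumably so each specialization of the loop can fold the
 * rounding-mode checks.
 */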
2092 
2093 /* generate helpers for fixed point instructions with OPIVV format */
2094 #define GEN_VEXT_VV_RM(NAME)                                    \
2095 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2096                   CPURISCVState *env, uint32_t desc)            \
2097 {                                                               \
2098     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2099                  do_##NAME);                                    \
2100 }
2101 
2102 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2103 {
2104     uint8_t res = a + b;
2105     if (res < a) {
2106         res = UINT8_MAX;
2107         env->vxsat = 0x1;
2108     }
2109     return res;
2110 }
2111 
2112 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2113                                uint16_t b)
2114 {
2115     uint16_t res = a + b;
2116     if (res < a) {
2117         res = UINT16_MAX;
2118         env->vxsat = 0x1;
2119     }
2120     return res;
2121 }
2122 
2123 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2124                                uint32_t b)
2125 {
2126     uint32_t res = a + b;
2127     if (res < a) {
2128         res = UINT32_MAX;
2129         env->vxsat = 0x1;
2130     }
2131     return res;
2132 }
2133 
2134 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2135                                uint64_t b)
2136 {
2137     uint64_t res = a + b;
2138     if (res < a) {
2139         res = UINT64_MAX;
2140         env->vxsat = 0x1;
2141     }
2142     return res;
2143 }
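/*
 * Worked example (added, 8-bit case): unsigned overflow is detected by
 * the wrapped sum being smaller than an operand:
 *     a = 0xf0, b = 0x20 -> res = 0x10 < a -> res = UINT8_MAX, vxsat = 1
 */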
2144 
2145 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2146 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2147 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2148 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2149 GEN_VEXT_VV_RM(vsaddu_vv_b)
2150 GEN_VEXT_VV_RM(vsaddu_vv_h)
2151 GEN_VEXT_VV_RM(vsaddu_vv_w)
2152 GEN_VEXT_VV_RM(vsaddu_vv_d)
2153 
2154 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2155                           CPURISCVState *env, int vxrm);
2156 
2157 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2158 static inline void                                                  \
2159 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2160           CPURISCVState *env, int vxrm)                             \
2161 {                                                                   \
2162     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2163     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2164 }
2165 
2166 static inline void
2167 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2168              CPURISCVState *env,
2169              uint32_t vl, uint32_t vm, int vxrm,
2170              opivx2_rm_fn *fn)
2171 {
2172     for (uint32_t i = env->vstart; i < vl; i++) {
2173         if (!vm && !vext_elem_mask(v0, i)) {
2174             continue;
2175         }
2176         fn(vd, s1, vs2, i, env, vxrm);
2177     }
2178     env->vstart = 0;
2179 }
2180 
2181 static inline void
2182 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2183              CPURISCVState *env,
2184              uint32_t desc,
2185              opivx2_rm_fn *fn)
2186 {
2187     uint32_t vm = vext_vm(desc);
2188     uint32_t vl = env->vl;
2189 
2190     switch (env->vxrm) {
2191     case 0: /* rnu */
2192         vext_vx_rm_1(vd, v0, s1, vs2,
2193                      env, vl, vm, 0, fn);
2194         break;
2195     case 1: /* rne */
2196         vext_vx_rm_1(vd, v0, s1, vs2,
2197                      env, vl, vm, 1, fn);
2198         break;
2199     case 2: /* rdn */
2200         vext_vx_rm_1(vd, v0, s1, vs2,
2201                      env, vl, vm, 2, fn);
2202         break;
2203     default: /* rod */
2204         vext_vx_rm_1(vd, v0, s1, vs2,
2205                      env, vl, vm, 3, fn);
2206         break;
2207     }
2208 }
2209 
2210 /* generate helpers for fixed point instructions with OPIVX format */
2211 #define GEN_VEXT_VX_RM(NAME)                              \
2212 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2213         void *vs2, CPURISCVState *env, uint32_t desc)     \
2214 {                                                         \
2215     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2216                  do_##NAME);                              \
2217 }
2218 
2219 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2220 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2221 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2222 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2223 GEN_VEXT_VX_RM(vsaddu_vx_b)
2224 GEN_VEXT_VX_RM(vsaddu_vx_h)
2225 GEN_VEXT_VX_RM(vsaddu_vx_w)
2226 GEN_VEXT_VX_RM(vsaddu_vx_d)
2227 
2228 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2229 {
2230     int8_t res = a + b;
2231     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2232         res = a > 0 ? INT8_MAX : INT8_MIN;
2233         env->vxsat = 0x1;
2234     }
2235     return res;
2236 }
2237 
2238 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2239 {
2240     int16_t res = a + b;
2241     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2242         res = a > 0 ? INT16_MAX : INT16_MIN;
2243         env->vxsat = 0x1;
2244     }
2245     return res;
2246 }
2247 
2248 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2249 {
2250     int32_t res = a + b;
2251     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2252         res = a > 0 ? INT32_MAX : INT32_MIN;
2253         env->vxsat = 0x1;
2254     }
2255     return res;
2256 }
2257 
2258 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2259 {
2260     int64_t res = a + b;
2261     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2262         res = a > 0 ? INT64_MAX : INT64_MIN;
2263         env->vxsat = 0x1;
2264     }
2265     return res;
2266 }
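/*
 * Worked example (added, 8-bit case): (res ^ a) & (res ^ b) & INT_MIN is
 * nonzero exactly when a and b share a sign and res has the opposite one:
 *     a = 100, b = 100 -> res = -56 (0xc8)
 *     (0xc8 ^ 0x64) & (0xc8 ^ 0x64) & 0x80 = 0x80 -> saturate to 127
 */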
2267 
2268 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2269 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2270 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2271 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2272 GEN_VEXT_VV_RM(vsadd_vv_b)
2273 GEN_VEXT_VV_RM(vsadd_vv_h)
2274 GEN_VEXT_VV_RM(vsadd_vv_w)
2275 GEN_VEXT_VV_RM(vsadd_vv_d)
2276 
2277 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2278 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2279 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2280 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2281 GEN_VEXT_VX_RM(vsadd_vx_b)
2282 GEN_VEXT_VX_RM(vsadd_vx_h)
2283 GEN_VEXT_VX_RM(vsadd_vx_w)
2284 GEN_VEXT_VX_RM(vsadd_vx_d)
2285 
2286 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2287 {
2288     uint8_t res = a - b;
2289     if (res > a) {
2290         res = 0;
2291         env->vxsat = 0x1;
2292     }
2293     return res;
2294 }
2295 
2296 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2297                                uint16_t b)
2298 {
2299     uint16_t res = a - b;
2300     if (res > a) {
2301         res = 0;
2302         env->vxsat = 0x1;
2303     }
2304     return res;
2305 }
2306 
2307 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2308                                uint32_t b)
2309 {
2310     uint32_t res = a - b;
2311     if (res > a) {
2312         res = 0;
2313         env->vxsat = 0x1;
2314     }
2315     return res;
2316 }
2317 
2318 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2319                                uint64_t b)
2320 {
2321     uint64_t res = a - b;
2322     if (res > a) {
2323         res = 0;
2324         env->vxsat = 0x1;
2325     }
2326     return res;
2327 }
2328 
2329 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2330 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2331 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2332 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2333 GEN_VEXT_VV_RM(vssubu_vv_b)
2334 GEN_VEXT_VV_RM(vssubu_vv_h)
2335 GEN_VEXT_VV_RM(vssubu_vv_w)
2336 GEN_VEXT_VV_RM(vssubu_vv_d)
2337 
2338 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2339 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2340 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2341 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2342 GEN_VEXT_VX_RM(vssubu_vx_b)
2343 GEN_VEXT_VX_RM(vssubu_vx_h)
2344 GEN_VEXT_VX_RM(vssubu_vx_w)
2345 GEN_VEXT_VX_RM(vssubu_vx_d)
2346 
2347 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2348 {
2349     int8_t res = a - b;
2350     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2351         res = a >= 0 ? INT8_MAX : INT8_MIN;
2352         env->vxsat = 0x1;
2353     }
2354     return res;
2355 }
2356 
2357 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2358 {
2359     int16_t res = a - b;
2360     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2361         res = a >= 0 ? INT16_MAX : INT16_MIN;
2362         env->vxsat = 0x1;
2363     }
2364     return res;
2365 }
2366 
2367 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2368 {
2369     int32_t res = a - b;
2370     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2371         res = a >= 0 ? INT32_MAX : INT32_MIN;
2372         env->vxsat = 0x1;
2373     }
2374     return res;
2375 }
2376 
2377 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2378 {
2379     int64_t res = a - b;
2380     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2381         res = a >= 0 ? INT64_MAX : INT64_MIN;
2382         env->vxsat = 0x1;
2383     }
2384     return res;
2385 }
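/*
 * Worked example (added, 8-bit case): for subtraction the overflow test
 * uses (a ^ b), i.e. the operands must differ in sign:
 *     a = 100, b = -100 -> res = -56 (0xc8)
 *     (0xc8 ^ 0x64) & (0x64 ^ 0x9c) & 0x80 = 0x80 -> saturate to 127
 */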
2386 
2387 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2388 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2389 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2390 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2391 GEN_VEXT_VV_RM(vssub_vv_b)
2392 GEN_VEXT_VV_RM(vssub_vv_h)
2393 GEN_VEXT_VV_RM(vssub_vv_w)
2394 GEN_VEXT_VV_RM(vssub_vv_d)
2395 
2396 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2397 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2398 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2399 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2400 GEN_VEXT_VX_RM(vssub_vx_b)
2401 GEN_VEXT_VX_RM(vssub_vx_h)
2402 GEN_VEXT_VX_RM(vssub_vx_w)
2403 GEN_VEXT_VX_RM(vssub_vx_d)
2404 
2405 /* Vector Single-Width Averaging Add and Subtract */
2406 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2407 {
2408     uint8_t d = extract64(v, shift, 1);
2409     uint8_t d1;
2410     uint64_t D1, D2;
2411 
2412     if (shift == 0 || shift > 64) {
2413         return 0;
2414     }
2415 
2416     d1 = extract64(v, shift - 1, 1);
2417     D1 = extract64(v, 0, shift);
2418     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2419         return d1;
2420     } else if (vxrm == 1) { /* round-to-nearest-even */
2421         if (shift > 1) {
2422             D2 = extract64(v, 0, shift - 1);
2423             return d1 & ((D2 != 0) | d);
2424         } else {
2425             return d1 & d;
2426         }
2427     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2428         return !d & (D1 != 0);
2429     }
2430     return 0; /* round-down (truncate) */
2431 }
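/*
 * Worked example (added): v = 0b1011 (11), shift = 2, i.e. 11 / 4 = 2.75,
 * with d = bit 2 = 0, d1 = bit 1 = 1, D1 = 0b11, D2 = bit 0 = 1:
 *     rnu: d1 = 1                   -> 2 + 1 = 3
 *     rne: d1 & ((D2 != 0) | d) = 1 -> 2 + 1 = 3
 *     rdn: 0                        -> 2
 *     rod: !d & (D1 != 0) = 1       -> 2 + 1 = 3  (jam to odd)
 * For a tie such as 6 / 4 = 1.5, rne yields 2, the even neighbor.
 */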
2432 
2433 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2434 {
2435     int64_t res = (int64_t)a + b;
2436     uint8_t round = get_round(vxrm, res, 1);
2437 
2438     return (res >> 1) + round;
2439 }
2440 
2441 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2442 {
2443     int64_t res = a + b;
2444     uint8_t round = get_round(vxrm, res, 1);
2445     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2446 
2447     /* With signed overflow, bit 64 is inverse of bit 63. */
2448     return ((res >> 1) ^ over) + round;
2449 }
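/*
 * Worked example (added, 8-bit analogue): a = b = 100
 *     res = 100 + 100 wraps to -56 (0xc8), over = 0x80
 *     res >> 1 = -28 (0xe4); 0xe4 ^ 0x80 = 0x64 = 100
 *     round (rnu) = bit 0 of res = 0, so the average is 100 as expected.
 * The XOR with over flips the top bit of res >> 1 back to the true bit 64
 * of the full-width sum that was lost to the wrap-around.
 */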
2450 
2451 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2452 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2453 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2454 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2455 GEN_VEXT_VV_RM(vaadd_vv_b)
2456 GEN_VEXT_VV_RM(vaadd_vv_h)
2457 GEN_VEXT_VV_RM(vaadd_vv_w)
2458 GEN_VEXT_VV_RM(vaadd_vv_d)
2459 
2460 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2461 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2462 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2463 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2464 GEN_VEXT_VX_RM(vaadd_vx_b)
2465 GEN_VEXT_VX_RM(vaadd_vx_h)
2466 GEN_VEXT_VX_RM(vaadd_vx_w)
2467 GEN_VEXT_VX_RM(vaadd_vx_d)
2468 
2469 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2470                                uint32_t a, uint32_t b)
2471 {
2472     uint64_t res = (uint64_t)a + b;
2473     uint8_t round = get_round(vxrm, res, 1);
2474 
2475     return (res >> 1) + round;
2476 }
2477 
2478 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2479                                uint64_t a, uint64_t b)
2480 {
2481     uint64_t res = a + b;
2482     uint8_t round = get_round(vxrm, res, 1);
2483     uint64_t over = (uint64_t)(res < a) << 63;
2484 
2485     return ((res >> 1) | over) + round;
2486 }
2487 
2488 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2489 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2490 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2491 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2492 GEN_VEXT_VV_RM(vaaddu_vv_b)
2493 GEN_VEXT_VV_RM(vaaddu_vv_h)
2494 GEN_VEXT_VV_RM(vaaddu_vv_w)
2495 GEN_VEXT_VV_RM(vaaddu_vv_d)
2496 
2497 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2498 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2499 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2500 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2501 GEN_VEXT_VX_RM(vaaddu_vx_b)
2502 GEN_VEXT_VX_RM(vaaddu_vx_h)
2503 GEN_VEXT_VX_RM(vaaddu_vx_w)
2504 GEN_VEXT_VX_RM(vaaddu_vx_d)
2505 
2506 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2507 {
2508     int64_t res = (int64_t)a - b;
2509     uint8_t round = get_round(vxrm, res, 1);
2510 
2511     return (res >> 1) + round;
2512 }
2513 
2514 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2515 {
2516     int64_t res = (int64_t)a - b;
2517     uint8_t round = get_round(vxrm, res, 1);
2518     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2519 
2520     /* With signed overflow, bit 64 is inverse of bit 63. */
2521     return ((res >> 1) ^ over) + round;
2522 }
2523 
2524 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2525 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2526 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2527 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2528 GEN_VEXT_VV_RM(vasub_vv_b)
2529 GEN_VEXT_VV_RM(vasub_vv_h)
2530 GEN_VEXT_VV_RM(vasub_vv_w)
2531 GEN_VEXT_VV_RM(vasub_vv_d)
2532 
2533 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2534 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2535 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2536 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2537 GEN_VEXT_VX_RM(vasub_vx_b)
2538 GEN_VEXT_VX_RM(vasub_vx_h)
2539 GEN_VEXT_VX_RM(vasub_vx_w)
2540 GEN_VEXT_VX_RM(vasub_vx_d)
2541 
2542 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2543                                uint32_t a, uint32_t b)
2544 {
2545     int64_t res = (int64_t)a - b;
2546     uint8_t round = get_round(vxrm, res, 1);
2547 
2548     return (res >> 1) + round;
2549 }
2550 
2551 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2552                                uint64_t a, uint64_t b)
2553 {
2554     uint64_t res = (uint64_t)a - b;
2555     uint8_t round = get_round(vxrm, res, 1);
2556     uint64_t over = (uint64_t)(res > a) << 63;
2557 
2558     return ((res >> 1) | over) + round;
2559 }
2560 
2561 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2562 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2563 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2564 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2565 GEN_VEXT_VV_RM(vasubu_vv_b)
2566 GEN_VEXT_VV_RM(vasubu_vv_h)
2567 GEN_VEXT_VV_RM(vasubu_vv_w)
2568 GEN_VEXT_VV_RM(vasubu_vv_d)
2569 
2570 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2571 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2572 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2573 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2574 GEN_VEXT_VX_RM(vasubu_vx_b)
2575 GEN_VEXT_VX_RM(vasubu_vx_h)
2576 GEN_VEXT_VX_RM(vasubu_vx_w)
2577 GEN_VEXT_VX_RM(vasubu_vx_d)
2578 
2579 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2580 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2581 {
2582     uint8_t round;
2583     int16_t res;
2584 
2585     res = (int16_t)a * (int16_t)b;
2586     round = get_round(vxrm, res, 7);
2587     res   = (res >> 7) + round;
2588 
2589     if (res > INT8_MAX) {
2590         env->vxsat = 0x1;
2591         return INT8_MAX;
2592     } else if (res < INT8_MIN) {
2593         env->vxsat = 0x1;
2594         return INT8_MIN;
2595     } else {
2596         return res;
2597     }
2598 }
2599 
2600 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2601 {
2602     uint8_t round;
2603     int32_t res;
2604 
2605     res = (int32_t)a * (int32_t)b;
2606     round = get_round(vxrm, res, 15);
2607     res   = (res >> 15) + round;
2608 
2609     if (res > INT16_MAX) {
2610         env->vxsat = 0x1;
2611         return INT16_MAX;
2612     } else if (res < INT16_MIN) {
2613         env->vxsat = 0x1;
2614         return INT16_MIN;
2615     } else {
2616         return res;
2617     }
2618 }
2619 
2620 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2621 {
2622     uint8_t round;
2623     int64_t res;
2624 
2625     res = (int64_t)a * (int64_t)b;
2626     round = get_round(vxrm, res, 31);
2627     res   = (res >> 31) + round;
2628 
2629     if (res > INT32_MAX) {
2630         env->vxsat = 0x1;
2631         return INT32_MAX;
2632     } else if (res < INT32_MIN) {
2633         env->vxsat = 0x1;
2634         return INT32_MIN;
2635     } else {
2636         return res;
2637     }
2638 }
2639 
2640 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2641 {
2642     uint8_t round;
2643     uint64_t hi_64, lo_64;
2644     int64_t res;
2645 
2646     if (a == INT64_MIN && b == INT64_MIN) {
2647         env->vxsat = 1;
2648         return INT64_MAX;
2649     }
2650 
2651     muls64(&lo_64, &hi_64, a, b);
2652     round = get_round(vxrm, lo_64, 63);
2653     /*
2654      * The left shift below cannot overflow: the 128-bit product of
2655      * two int64_t values always has at least two sign bits.
2656      */
2657     res = (hi_64 << 1) | (lo_64 >> 63);
2658     if (round) {
2659         if (res == INT64_MAX) {
2660             env->vxsat = 1;
2661         } else {
2662             res += 1;
2663         }
2664     }
2665     return res;
2666 }
2667 
2668 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2669 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2670 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2671 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2672 GEN_VEXT_VV_RM(vsmul_vv_b)
2673 GEN_VEXT_VV_RM(vsmul_vv_h)
2674 GEN_VEXT_VV_RM(vsmul_vv_w)
2675 GEN_VEXT_VV_RM(vsmul_vv_d)
2676 
2677 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2678 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2679 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2680 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2681 GEN_VEXT_VX_RM(vsmul_vx_b)
2682 GEN_VEXT_VX_RM(vsmul_vx_h)
2683 GEN_VEXT_VX_RM(vsmul_vx_w)
2684 GEN_VEXT_VX_RM(vsmul_vx_d)
2685 
2686 /* Vector Single-Width Scaling Shift Instructions */
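/*
 * vssrl/vssra shift right by the low log2(SEW) bits of the second operand
 * and add the rounding increment selected by vxrm; there is no
 * saturation, so vxsat is left untouched.
 */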
2687 static inline uint8_t
2688 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2689 {
2690     uint8_t round, shift = b & 0x7;
2691     uint8_t res;
2692 
2693     round = get_round(vxrm, a, shift);
2694     res   = (a >> shift)  + round;
2695     return res;
2696 }
2697 static inline uint16_t
2698 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2699 {
2700     uint8_t round, shift = b & 0xf;
2701     uint16_t res;
2702 
2703     round = get_round(vxrm, a, shift);
2704     res   = (a >> shift)  + round;
2705     return res;
2706 }
2707 static inline uint32_t
2708 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2709 {
2710     uint8_t round, shift = b & 0x1f;
2711     uint32_t res;
2712 
2713     round = get_round(vxrm, a, shift);
2714     res   = (a >> shift)  + round;
2715     return res;
2716 }
2717 static inline uint64_t
2718 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2719 {
2720     uint8_t round, shift = b & 0x3f;
2721     uint64_t res;
2722 
2723     round = get_round(vxrm, a, shift);
2724     res   = (a >> shift)  + round;
2725     return res;
2726 }
2727 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2728 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2729 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2730 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2731 GEN_VEXT_VV_RM(vssrl_vv_b)
2732 GEN_VEXT_VV_RM(vssrl_vv_h)
2733 GEN_VEXT_VV_RM(vssrl_vv_w)
2734 GEN_VEXT_VV_RM(vssrl_vv_d)
2735 
2736 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2737 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2738 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2739 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2740 GEN_VEXT_VX_RM(vssrl_vx_b)
2741 GEN_VEXT_VX_RM(vssrl_vx_h)
2742 GEN_VEXT_VX_RM(vssrl_vx_w)
2743 GEN_VEXT_VX_RM(vssrl_vx_d)
2744 
2745 static inline int8_t
2746 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2747 {
2748     uint8_t round, shift = b & 0x7;
2749     int8_t res;
2750 
2751     round = get_round(vxrm, a, shift);
2752     res   = (a >> shift)  + round;
2753     return res;
2754 }
2755 static inline int16_t
2756 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2757 {
2758     uint8_t round, shift = b & 0xf;
2759     int16_t res;
2760 
2761     round = get_round(vxrm, a, shift);
2762     res   = (a >> shift)  + round;
2763     return res;
2764 }
2765 static inline int32_t
2766 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2767 {
2768     uint8_t round, shift = b & 0x1f;
2769     int32_t res;
2770 
2771     round = get_round(vxrm, a, shift);
2772     res   = (a >> shift)  + round;
2773     return res;
2774 }
2775 static inline int64_t
2776 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2777 {
2778     uint8_t round, shift = b & 0x3f;
2779     int64_t res;
2780 
2781     round = get_round(vxrm, a, shift);
2782     res   = (a >> shift)  + round;
2783     return res;
2784 }
2785 
2786 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2787 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2788 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2789 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2790 GEN_VEXT_VV_RM(vssra_vv_b)
2791 GEN_VEXT_VV_RM(vssra_vv_h)
2792 GEN_VEXT_VV_RM(vssra_vv_w)
2793 GEN_VEXT_VV_RM(vssra_vv_d)
2794 
2795 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2796 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2797 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2798 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2799 GEN_VEXT_VX_RM(vssra_vx_b)
2800 GEN_VEXT_VX_RM(vssra_vx_h)
2801 GEN_VEXT_VX_RM(vssra_vx_w)
2802 GEN_VEXT_VX_RM(vssra_vx_d)
2803 
2804 /* Vector Narrowing Fixed-Point Clip Instructions */
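/*
 * vnclip/vnclipu take a 2*SEW-wide source, shift it right by the low
 * log2(2*SEW) bits of the second operand, round per vxrm, and saturate
 * the result into SEW bits, setting vxsat when the value is clipped.
 */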
2805 static inline int8_t
2806 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2807 {
2808     uint8_t round, shift = b & 0xf;
2809     int16_t res;
2810 
2811     round = get_round(vxrm, a, shift);
2812     res   = (a >> shift)  + round;
2813     if (res > INT8_MAX) {
2814         env->vxsat = 0x1;
2815         return INT8_MAX;
2816     } else if (res < INT8_MIN) {
2817         env->vxsat = 0x1;
2818         return INT8_MIN;
2819     } else {
2820         return res;
2821     }
2822 }
2823 
2824 static inline int16_t
2825 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2826 {
2827     uint8_t round, shift = b & 0x1f;
2828     int32_t res;
2829 
2830     round = get_round(vxrm, a, shift);
2831     res   = (a >> shift)  + round;
2832     if (res > INT16_MAX) {
2833         env->vxsat = 0x1;
2834         return INT16_MAX;
2835     } else if (res < INT16_MIN) {
2836         env->vxsat = 0x1;
2837         return INT16_MIN;
2838     } else {
2839         return res;
2840     }
2841 }
2842 
2843 static inline int32_t
2844 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2845 {
2846     uint8_t round, shift = b & 0x3f;
2847     int64_t res;
2848 
2849     round = get_round(vxrm, a, shift);
2850     res   = (a >> shift)  + round;
2851     if (res > INT32_MAX) {
2852         env->vxsat = 0x1;
2853         return INT32_MAX;
2854     } else if (res < INT32_MIN) {
2855         env->vxsat = 0x1;
2856         return INT32_MIN;
2857     } else {
2858         return res;
2859     }
2860 }
2861 
2862 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2863 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2864 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2865 GEN_VEXT_VV_RM(vnclip_wv_b)
2866 GEN_VEXT_VV_RM(vnclip_wv_h)
2867 GEN_VEXT_VV_RM(vnclip_wv_w)
2868 
2869 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2870 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2871 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2872 GEN_VEXT_VX_RM(vnclip_wx_b)
2873 GEN_VEXT_VX_RM(vnclip_wx_h)
2874 GEN_VEXT_VX_RM(vnclip_wx_w)
2875 
2876 static inline uint8_t
2877 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2878 {
2879     uint8_t round, shift = b & 0xf;
2880     uint16_t res;
2881 
2882     round = get_round(vxrm, a, shift);
2883     res   = (a >> shift)  + round;
2884     if (res > UINT8_MAX) {
2885         env->vxsat = 0x1;
2886         return UINT8_MAX;
2887     } else {
2888         return res;
2889     }
2890 }
2891 
2892 static inline uint16_t
2893 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2894 {
2895     uint8_t round, shift = b & 0x1f;
2896     uint32_t res;
2897 
2898     round = get_round(vxrm, a, shift);
2899     res   = (a >> shift)  + round;
2900     if (res > UINT16_MAX) {
2901         env->vxsat = 0x1;
2902         return UINT16_MAX;
2903     } else {
2904         return res;
2905     }
2906 }
2907 
2908 static inline uint32_t
2909 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2910 {
2911     uint8_t round, shift = b & 0x3f;
2912     uint64_t res;
2913 
2914     round = get_round(vxrm, a, shift);
2915     res   = (a >> shift)  + round;
2916     if (res > UINT32_MAX) {
2917         env->vxsat = 0x1;
2918         return UINT32_MAX;
2919     } else {
2920         return res;
2921     }
2922 }
2923 
2924 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2925 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2926 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2927 GEN_VEXT_VV_RM(vnclipu_wv_b)
2928 GEN_VEXT_VV_RM(vnclipu_wv_h)
2929 GEN_VEXT_VV_RM(vnclipu_wv_w)
2930 
2931 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2932 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2933 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2934 GEN_VEXT_VX_RM(vnclipu_wx_b)
2935 GEN_VEXT_VX_RM(vnclipu_wx_h)
2936 GEN_VEXT_VX_RM(vnclipu_wx_w)
2937 
2938 /*
2939  *** Vector Floating-Point Arithmetic Instructions
2940  */
2941 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
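/*
 * OPFVV2 defines the per-element operation; note the operand order
 * OP(s2, s1, &env->fp_status), so softfloat sees vs2 as its first
 * operand.  GEN_VEXT_VV_ENV emits the helper body, which walks elements
 * from vstart to vl, skips masked-off elements when vm == 0, and clears
 * vstart on completion.
 */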
2942 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2943 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2944                       CPURISCVState *env)                      \
2945 {                                                              \
2946     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2947     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2948     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2949 }
2950 
2951 #define GEN_VEXT_VV_ENV(NAME)                             \
2952 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2953                   void *vs2, CPURISCVState *env,          \
2954                   uint32_t desc)                          \
2955 {                                                         \
2956     uint32_t vm = vext_vm(desc);                          \
2957     uint32_t vl = env->vl;                                \
2958     uint32_t i;                                           \
2959                                                           \
2960     for (i = env->vstart; i < vl; i++) {                  \
2961         if (!vm && !vext_elem_mask(v0, i)) {              \
2962             continue;                                     \
2963         }                                                 \
2964         do_##NAME(vd, vs1, vs2, i, env);                  \
2965     }                                                     \
2966     env->vstart = 0;                                      \
2967 }
2968 
2969 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2970 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2971 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2972 GEN_VEXT_VV_ENV(vfadd_vv_h)
2973 GEN_VEXT_VV_ENV(vfadd_vv_w)
2974 GEN_VEXT_VV_ENV(vfadd_vv_d)
2975 
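/*
 * The vector-scalar (.vf) variants are identical except that s1 arrives
 * as a 64-bit container holding the scalar f-register value; the
 * (TX1)(T1)s1 cast truncates it to the element type before use.
 */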
2976 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2977 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2978                       CPURISCVState *env)                      \
2979 {                                                              \
2980     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2981     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2982 }
2983 
2984 #define GEN_VEXT_VF(NAME)                                 \
2985 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2986                   void *vs2, CPURISCVState *env,          \
2987                   uint32_t desc)                          \
2988 {                                                         \
2989     uint32_t vm = vext_vm(desc);                          \
2990     uint32_t vl = env->vl;                                \
2991     uint32_t i;                                           \
2992                                                           \
2993     for (i = env->vstart; i < vl; i++) {                  \
2994         if (!vm && !vext_elem_mask(v0, i)) {              \
2995             continue;                                     \
2996         }                                                 \
2997         do_##NAME(vd, s1, vs2, i, env);                   \
2998     }                                                     \
2999     env->vstart = 0;                                      \
3000 }
3001 
3002 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3003 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3004 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3005 GEN_VEXT_VF(vfadd_vf_h)
3006 GEN_VEXT_VF(vfadd_vf_w)
3007 GEN_VEXT_VF(vfadd_vf_d)
3008 
3009 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3010 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3011 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3012 GEN_VEXT_VV_ENV(vfsub_vv_h)
3013 GEN_VEXT_VV_ENV(vfsub_vv_w)
3014 GEN_VEXT_VV_ENV(vfsub_vv_d)
3015 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3016 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3017 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3018 GEN_VEXT_VF(vfsub_vf_h)
3019 GEN_VEXT_VF(vfsub_vf_w)
3020 GEN_VEXT_VF(vfsub_vf_d)
3021 
3022 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3023 {
3024     return float16_sub(b, a, s);
3025 }
3026 
3027 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3028 {
3029     return float32_sub(b, a, s);
3030 }
3031 
3032 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3033 {
3034     return float64_sub(b, a, s);
3035 }
3036 
3037 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3038 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3039 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3040 GEN_VEXT_VF(vfrsub_vf_h)
3041 GEN_VEXT_VF(vfrsub_vf_w)
3042 GEN_VEXT_VF(vfrsub_vf_d)
3043 
3044 /* Vector Widening Floating-Point Add/Subtract Instructions */
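/*
 * The widening forms convert both narrow operands up to 2*SEW with
 * float16_to_float32(x, true, s) / float32_to_float64() and perform the
 * arithmetic at the wider width; the _wv/_wf variants further below keep
 * vs2 at the wider type and only convert the narrow operand.
 */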
3045 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3046 {
3047     return float32_add(float16_to_float32(a, true, s),
3048             float16_to_float32(b, true, s), s);
3049 }
3050 
3051 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3052 {
3053     return float64_add(float32_to_float64(a, s),
3054             float32_to_float64(b, s), s);
3056 }
3057 
3058 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3059 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3060 GEN_VEXT_VV_ENV(vfwadd_vv_h)
3061 GEN_VEXT_VV_ENV(vfwadd_vv_w)
3062 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3063 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3064 GEN_VEXT_VF(vfwadd_vf_h)
3065 GEN_VEXT_VF(vfwadd_vf_w)
3066 
3067 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3068 {
3069     return float32_sub(float16_to_float32(a, true, s),
3070             float16_to_float32(b, true, s), s);
3071 }
3072 
3073 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3074 {
3075     return float64_sub(float32_to_float64(a, s),
3076             float32_to_float64(b, s), s);
3078 }
3079 
3080 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3081 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3082 GEN_VEXT_VV_ENV(vfwsub_vv_h)
3083 GEN_VEXT_VV_ENV(vfwsub_vv_w)
3084 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3085 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3086 GEN_VEXT_VF(vfwsub_vf_h)
3087 GEN_VEXT_VF(vfwsub_vf_w)
3088 
3089 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3090 {
3091     return float32_add(a, float16_to_float32(b, true, s), s);
3092 }
3093 
3094 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3095 {
3096     return float64_add(a, float32_to_float64(b, s), s);
3097 }
3098 
3099 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3100 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3101 GEN_VEXT_VV_ENV(vfwadd_wv_h)
3102 GEN_VEXT_VV_ENV(vfwadd_wv_w)
3103 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3104 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3105 GEN_VEXT_VF(vfwadd_wf_h)
3106 GEN_VEXT_VF(vfwadd_wf_w)
3107 
3108 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3109 {
3110     return float32_sub(a, float16_to_float32(b, true, s), s);
3111 }
3112 
3113 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3114 {
3115     return float64_sub(a, float32_to_float64(b, s), s);
3116 }
3117 
3118 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3119 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3120 GEN_VEXT_VV_ENV(vfwsub_wv_h)
3121 GEN_VEXT_VV_ENV(vfwsub_wv_w)
3122 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3123 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3124 GEN_VEXT_VF(vfwsub_wf_h)
3125 GEN_VEXT_VF(vfwsub_wf_w)
3126 
3127 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3128 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3129 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3130 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3131 GEN_VEXT_VV_ENV(vfmul_vv_h)
3132 GEN_VEXT_VV_ENV(vfmul_vv_w)
3133 GEN_VEXT_VV_ENV(vfmul_vv_d)
3134 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3135 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3136 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3137 GEN_VEXT_VF(vfmul_vf_h)
3138 GEN_VEXT_VF(vfmul_vf_w)
3139 GEN_VEXT_VF(vfmul_vf_d)
3140 
3141 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3142 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3143 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3144 GEN_VEXT_VV_ENV(vfdiv_vv_h)
3145 GEN_VEXT_VV_ENV(vfdiv_vv_w)
3146 GEN_VEXT_VV_ENV(vfdiv_vv_d)
3147 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3148 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3149 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3150 GEN_VEXT_VF(vfdiv_vf_h)
3151 GEN_VEXT_VF(vfdiv_vf_w)
3152 GEN_VEXT_VF(vfdiv_vf_d)
3153 
3154 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3155 {
3156     return float16_div(b, a, s);
3157 }
3158 
3159 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3160 {
3161     return float32_div(b, a, s);
3162 }
3163 
3164 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3165 {
3166     return float64_div(b, a, s);
3167 }
3168 
3169 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3170 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3171 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3172 GEN_VEXT_VF(vfrdiv_vf_h)
3173 GEN_VEXT_VF(vfrdiv_vf_w)
3174 GEN_VEXT_VF(vfrdiv_vf_d)
3175 
3176 /* Vector Widening Floating-Point Multiply */
3177 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3178 {
3179     return float32_mul(float16_to_float32(a, true, s),
3180             float16_to_float32(b, true, s), s);
3181 }
3182 
3183 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3184 {
3185     return float64_mul(float32_to_float64(a, s),
3186             float32_to_float64(b, s), s);
3188 }
3189 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3190 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3191 GEN_VEXT_VV_ENV(vfwmul_vv_h)
3192 GEN_VEXT_VV_ENV(vfwmul_vv_w)
3193 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3194 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3195 GEN_VEXT_VF(vfwmul_vf_h)
3196 GEN_VEXT_VF(vfwmul_vf_w)
3197 
3198 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
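/*
 * OPFVV3/OPFVF3 pass the operands as OP(vs2, vs1 or rs1, vd).  The
 * vfmacc/vfmsac family multiplies the two source operands and accumulates
 * into vd (a * b +/- d), while the vfmadd/vfmsub family multiplies vd by
 * the vs1/rs1 operand and adds or subtracts vs2 (d * b +/- a); the nm*
 * variants additionally negate via the flags passed to float*_muladd().
 */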
3199 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3200 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3201         CPURISCVState *env)                                        \
3202 {                                                                  \
3203     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3204     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3205     TD d = *((TD *)vd + HD(i));                                    \
3206     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3207 }
3208 
3209 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3210 {
3211     return float16_muladd(a, b, d, 0, s);
3212 }
3213 
3214 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3215 {
3216     return float32_muladd(a, b, d, 0, s);
3217 }
3218 
3219 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3220 {
3221     return float64_muladd(a, b, d, 0, s);
3222 }
3223 
3224 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3225 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3226 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3227 GEN_VEXT_VV_ENV(vfmacc_vv_h)
3228 GEN_VEXT_VV_ENV(vfmacc_vv_w)
3229 GEN_VEXT_VV_ENV(vfmacc_vv_d)
3230 
3231 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3232 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3233         CPURISCVState *env)                                       \
3234 {                                                                 \
3235     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3236     TD d = *((TD *)vd + HD(i));                                   \
3237     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3238 }
3239 
3240 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3241 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3242 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3243 GEN_VEXT_VF(vfmacc_vf_h)
3244 GEN_VEXT_VF(vfmacc_vf_w)
3245 GEN_VEXT_VF(vfmacc_vf_d)
3246 
3247 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3248 {
3249     return float16_muladd(a, b, d,
3250             float_muladd_negate_c | float_muladd_negate_product, s);
3251 }
3252 
3253 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3254 {
3255     return float32_muladd(a, b, d,
3256             float_muladd_negate_c | float_muladd_negate_product, s);
3257 }
3258 
3259 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3260 {
3261     return float64_muladd(a, b, d,
3262             float_muladd_negate_c | float_muladd_negate_product, s);
3263 }
3264 
3265 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3266 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3267 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3268 GEN_VEXT_VV_ENV(vfnmacc_vv_h)
3269 GEN_VEXT_VV_ENV(vfnmacc_vv_w)
3270 GEN_VEXT_VV_ENV(vfnmacc_vv_d)
3271 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3272 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3273 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3274 GEN_VEXT_VF(vfnmacc_vf_h)
3275 GEN_VEXT_VF(vfnmacc_vf_w)
3276 GEN_VEXT_VF(vfnmacc_vf_d)
3277 
3278 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3279 {
3280     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3281 }
3282 
3283 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3284 {
3285     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3286 }
3287 
3288 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3289 {
3290     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3291 }
3292 
3293 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3294 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3295 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3296 GEN_VEXT_VV_ENV(vfmsac_vv_h)
3297 GEN_VEXT_VV_ENV(vfmsac_vv_w)
3298 GEN_VEXT_VV_ENV(vfmsac_vv_d)
3299 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3300 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3301 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3302 GEN_VEXT_VF(vfmsac_vf_h)
3303 GEN_VEXT_VF(vfmsac_vf_w)
3304 GEN_VEXT_VF(vfmsac_vf_d)
3305 
3306 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3307 {
3308     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3309 }
3310 
3311 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3312 {
3313     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3314 }
3315 
3316 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3317 {
3318     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3319 }
3320 
3321 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3322 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3323 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3324 GEN_VEXT_VV_ENV(vfnmsac_vv_h)
3325 GEN_VEXT_VV_ENV(vfnmsac_vv_w)
3326 GEN_VEXT_VV_ENV(vfnmsac_vv_d)
3327 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3328 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3329 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3330 GEN_VEXT_VF(vfnmsac_vf_h)
3331 GEN_VEXT_VF(vfnmsac_vf_w)
3332 GEN_VEXT_VF(vfnmsac_vf_d)
3333 
3334 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3335 {
3336     return float16_muladd(d, b, a, 0, s);
3337 }
3338 
3339 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3340 {
3341     return float32_muladd(d, b, a, 0, s);
3342 }
3343 
3344 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3345 {
3346     return float64_muladd(d, b, a, 0, s);
3347 }
3348 
3349 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3350 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3351 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3352 GEN_VEXT_VV_ENV(vfmadd_vv_h)
3353 GEN_VEXT_VV_ENV(vfmadd_vv_w)
3354 GEN_VEXT_VV_ENV(vfmadd_vv_d)
3355 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3356 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3357 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3358 GEN_VEXT_VF(vfmadd_vf_h)
3359 GEN_VEXT_VF(vfmadd_vf_w)
3360 GEN_VEXT_VF(vfmadd_vf_d)
3361 
3362 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3363 {
3364     return float16_muladd(d, b, a,
3365             float_muladd_negate_c | float_muladd_negate_product, s);
3366 }
3367 
3368 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3369 {
3370     return float32_muladd(d, b, a,
3371             float_muladd_negate_c | float_muladd_negate_product, s);
3372 }
3373 
3374 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3375 {
3376     return float64_muladd(d, b, a,
3377             float_muladd_negate_c | float_muladd_negate_product, s);
3378 }
3379 
3380 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3381 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3382 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3383 GEN_VEXT_VV_ENV(vfnmadd_vv_h)
3384 GEN_VEXT_VV_ENV(vfnmadd_vv_w)
3385 GEN_VEXT_VV_ENV(vfnmadd_vv_d)
3386 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3387 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3388 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3389 GEN_VEXT_VF(vfnmadd_vf_h)
3390 GEN_VEXT_VF(vfnmadd_vf_w)
3391 GEN_VEXT_VF(vfnmadd_vf_d)
3392 
3393 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3394 {
3395     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3396 }
3397 
3398 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3399 {
3400     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3401 }
3402 
3403 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3404 {
3405     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3406 }
3407 
3408 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3409 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3410 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3411 GEN_VEXT_VV_ENV(vfmsub_vv_h)
3412 GEN_VEXT_VV_ENV(vfmsub_vv_w)
3413 GEN_VEXT_VV_ENV(vfmsub_vv_d)
3414 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3415 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3416 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3417 GEN_VEXT_VF(vfmsub_vf_h)
3418 GEN_VEXT_VF(vfmsub_vf_w)
3419 GEN_VEXT_VF(vfmsub_vf_d)
3420 
3421 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3422 {
3423     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3424 }
3425 
3426 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3427 {
3428     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3429 }
3430 
3431 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3432 {
3433     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3434 }
3435 
3436 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3437 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3438 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3439 GEN_VEXT_VV_ENV(vfnmsub_vv_h)
3440 GEN_VEXT_VV_ENV(vfnmsub_vv_w)
3441 GEN_VEXT_VV_ENV(vfnmsub_vv_d)
3442 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3443 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3444 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3445 GEN_VEXT_VF(vfnmsub_vf_h)
3446 GEN_VEXT_VF(vfnmsub_vf_w)
3447 GEN_VEXT_VF(vfnmsub_vf_d)
3448 
3449 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3450 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3451 {
3452     return float32_muladd(float16_to_float32(a, true, s),
3453                         float16_to_float32(b, true, s), d, 0, s);
3454 }
3455 
3456 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3457 {
3458     return float64_muladd(float32_to_float64(a, s),
3459                         float32_to_float64(b, s), d, 0, s);
3460 }
3461 
3462 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3463 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3464 GEN_VEXT_VV_ENV(vfwmacc_vv_h)
3465 GEN_VEXT_VV_ENV(vfwmacc_vv_w)
3466 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3467 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3468 GEN_VEXT_VF(vfwmacc_vf_h)
3469 GEN_VEXT_VF(vfwmacc_vf_w)
3470 
3471 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3472 {
3473     return float32_muladd(float16_to_float32(a, true, s),
3474                         float16_to_float32(b, true, s), d,
3475                         float_muladd_negate_c | float_muladd_negate_product, s);
3476 }
3477 
3478 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3479 {
3480     return float64_muladd(float32_to_float64(a, s),
3481                         float32_to_float64(b, s), d,
3482                         float_muladd_negate_c | float_muladd_negate_product, s);
3483 }
3484 
3485 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3486 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3487 GEN_VEXT_VV_ENV(vfwnmacc_vv_h)
3488 GEN_VEXT_VV_ENV(vfwnmacc_vv_w)
3489 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3490 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3491 GEN_VEXT_VF(vfwnmacc_vf_h)
3492 GEN_VEXT_VF(vfwnmacc_vf_w)
3493 
3494 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3495 {
3496     return float32_muladd(float16_to_float32(a, true, s),
3497                         float16_to_float32(b, true, s), d,
3498                         float_muladd_negate_c, s);
3499 }
3500 
3501 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3502 {
3503     return float64_muladd(float32_to_float64(a, s),
3504                         float32_to_float64(b, s), d,
3505                         float_muladd_negate_c, s);
3506 }
3507 
3508 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3509 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3510 GEN_VEXT_VV_ENV(vfwmsac_vv_h)
3511 GEN_VEXT_VV_ENV(vfwmsac_vv_w)
3512 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3513 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3514 GEN_VEXT_VF(vfwmsac_vf_h)
3515 GEN_VEXT_VF(vfwmsac_vf_w)
3516 
3517 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3518 {
3519     return float32_muladd(float16_to_float32(a, true, s),
3520                         float16_to_float32(b, true, s), d,
3521                         float_muladd_negate_product, s);
3522 }
3523 
3524 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3525 {
3526     return float64_muladd(float32_to_float64(a, s),
3527                         float32_to_float64(b, s), d,
3528                         float_muladd_negate_product, s);
3529 }
3530 
3531 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3532 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3533 GEN_VEXT_VV_ENV(vfwnmsac_vv_h)
3534 GEN_VEXT_VV_ENV(vfwnmsac_vv_w)
3535 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3536 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3537 GEN_VEXT_VF(vfwnmsac_vf_h)
3538 GEN_VEXT_VF(vfwnmsac_vf_w)
3539 
3540 /* Vector Floating-Point Square-Root Instruction */
3541 /* (TD, T2, TX2) */
3542 #define OP_UU_H uint16_t, uint16_t, uint16_t
3543 #define OP_UU_W uint32_t, uint32_t, uint32_t
3544 #define OP_UU_D uint64_t, uint64_t, uint64_t
3545 
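/*
 * OPFVV1/GEN_VEXT_V_ENV are the unary counterparts of the macros above:
 * one source operand, the same masking and vstart handling, plus an early
 * return when vl == 0.
 */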
3546 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3547 static void do_##NAME(void *vd, void *vs2, int i,      \
3548         CPURISCVState *env)                            \
3549 {                                                      \
3550     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3551     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3552 }
3553 
3554 #define GEN_VEXT_V_ENV(NAME)                           \
3555 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3556         CPURISCVState *env, uint32_t desc)             \
3557 {                                                      \
3558     uint32_t vm = vext_vm(desc);                       \
3559     uint32_t vl = env->vl;                             \
3560     uint32_t i;                                        \
3561                                                        \
3562     if (vl == 0) {                                     \
3563         return;                                        \
3564     }                                                  \
3565     for (i = env->vstart; i < vl; i++) {               \
3566         if (!vm && !vext_elem_mask(v0, i)) {           \
3567             continue;                                  \
3568         }                                              \
3569         do_##NAME(vd, vs2, i, env);                    \
3570     }                                                  \
3571     env->vstart = 0;                                   \
3572 }
3573 
3574 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3575 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3576 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3577 GEN_VEXT_V_ENV(vfsqrt_v_h)
3578 GEN_VEXT_V_ENV(vfsqrt_v_w)
3579 GEN_VEXT_V_ENV(vfsqrt_v_d)
3580 
3581 /*
3582  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3583  *
3584  * Adapted from riscv-v-spec recip.c:
3585  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3586  */
3587 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3588 {
3589     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3590     uint64_t exp = extract64(f, frac_size, exp_size);
3591     uint64_t frac = extract64(f, 0, frac_size);
3592 
3593     const uint8_t lookup_table[] = {
3594         52, 51, 50, 48, 47, 46, 44, 43,
3595         42, 41, 40, 39, 38, 36, 35, 34,
3596         33, 32, 31, 30, 30, 29, 28, 27,
3597         26, 25, 24, 23, 23, 22, 21, 20,
3598         19, 19, 18, 17, 16, 16, 15, 14,
3599         14, 13, 12, 12, 11, 10, 10, 9,
3600         9, 8, 7, 7, 6, 6, 5, 4,
3601         4, 3, 3, 2, 2, 1, 1, 0,
3602         127, 125, 123, 121, 119, 118, 116, 114,
3603         113, 111, 109, 108, 106, 105, 103, 102,
3604         100, 99, 97, 96, 95, 93, 92, 91,
3605         90, 88, 87, 86, 85, 84, 83, 82,
3606         80, 79, 78, 77, 76, 75, 74, 73,
3607         72, 71, 70, 70, 69, 68, 67, 66,
3608         65, 64, 63, 63, 62, 61, 60, 59,
3609         59, 58, 57, 56, 56, 55, 54, 53
3610     };
3611     const int precision = 7;
3612 
3613     if (exp == 0 && frac != 0) { /* subnormal */
3614         /* Normalize the subnormal. */
3615         while (extract64(frac, frac_size - 1, 1) == 0) {
3616             exp--;
3617             frac <<= 1;
3618         }
3619 
3620         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3621     }
3622 
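    /*
     * The 7-bit estimate is indexed by the low bit of the (adjusted)
     * exponent and the top 6 fraction bits; the output exponent is
     * (3 * bias - 1 - exp) / 2, with bias = 2^(exp_size - 1) - 1.
     */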
3623     int idx = ((exp & 1) << (precision - 1)) |
3624                 (frac >> (frac_size - precision + 1));
3625     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3626                             (frac_size - precision);
3627     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3628 
3629     uint64_t val = 0;
3630     val = deposit64(val, 0, frac_size, out_frac);
3631     val = deposit64(val, frac_size, exp_size, out_exp);
3632     val = deposit64(val, frac_size + exp_size, 1, sign);
3633     return val;
3634 }
3635 
3636 static float16 frsqrt7_h(float16 f, float_status *s)
3637 {
3638     int exp_size = 5, frac_size = 10;
3639     bool sign = float16_is_neg(f);
3640 
3641     /*
3642      * frsqrt7(sNaN) = canonical NaN
3643      * frsqrt7(-inf) = canonical NaN
3644      * frsqrt7(-normal) = canonical NaN
3645      * frsqrt7(-subnormal) = canonical NaN
3646      */
3647     if (float16_is_signaling_nan(f, s) ||
3648             (float16_is_infinity(f) && sign) ||
3649             (float16_is_normal(f) && sign) ||
3650             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3651         s->float_exception_flags |= float_flag_invalid;
3652         return float16_default_nan(s);
3653     }
3654 
3655     /* frsqrt7(qNaN) = canonical NaN */
3656     if (float16_is_quiet_nan(f, s)) {
3657         return float16_default_nan(s);
3658     }
3659 
3660     /* frsqrt7(+-0) = +-inf */
3661     if (float16_is_zero(f)) {
3662         s->float_exception_flags |= float_flag_divbyzero;
3663         return float16_set_sign(float16_infinity, sign);
3664     }
3665 
3666     /* frsqrt7(+inf) = +0 */
3667     if (float16_is_infinity(f) && !sign) {
3668         return float16_set_sign(float16_zero, sign);
3669     }
3670 
3671     /* +normal, +subnormal */
3672     uint64_t val = frsqrt7(f, exp_size, frac_size);
3673     return make_float16(val);
3674 }
3675 
3676 static float32 frsqrt7_s(float32 f, float_status *s)
3677 {
3678     int exp_size = 8, frac_size = 23;
3679     bool sign = float32_is_neg(f);
3680 
3681     /*
3682      * frsqrt7(sNaN) = canonical NaN
3683      * frsqrt7(-inf) = canonical NaN
3684      * frsqrt7(-normal) = canonical NaN
3685      * frsqrt7(-subnormal) = canonical NaN
3686      */
3687     if (float32_is_signaling_nan(f, s) ||
3688             (float32_is_infinity(f) && sign) ||
3689             (float32_is_normal(f) && sign) ||
3690             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3691         s->float_exception_flags |= float_flag_invalid;
3692         return float32_default_nan(s);
3693     }
3694 
3695     /* frsqrt7(qNaN) = canonical NaN */
3696     if (float32_is_quiet_nan(f, s)) {
3697         return float32_default_nan(s);
3698     }
3699 
3700     /* frsqrt7(+-0) = +-inf */
3701     if (float32_is_zero(f)) {
3702         s->float_exception_flags |= float_flag_divbyzero;
3703         return float32_set_sign(float32_infinity, sign);
3704     }
3705 
3706     /* frsqrt7(+inf) = +0 */
3707     if (float32_is_infinity(f) && !sign) {
3708         return float32_set_sign(float32_zero, sign);
3709     }
3710 
3711     /* +normal, +subnormal */
3712     uint64_t val = frsqrt7(f, exp_size, frac_size);
3713     return make_float32(val);
3714 }
3715 
3716 static float64 frsqrt7_d(float64 f, float_status *s)
3717 {
3718     int exp_size = 11, frac_size = 52;
3719     bool sign = float64_is_neg(f);
3720 
3721     /*
3722      * frsqrt7(sNaN) = canonical NaN
3723      * frsqrt7(-inf) = canonical NaN
3724      * frsqrt7(-normal) = canonical NaN
3725      * frsqrt7(-subnormal) = canonical NaN
3726      */
3727     if (float64_is_signaling_nan(f, s) ||
3728             (float64_is_infinity(f) && sign) ||
3729             (float64_is_normal(f) && sign) ||
3730             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3731         s->float_exception_flags |= float_flag_invalid;
3732         return float64_default_nan(s);
3733     }
3734 
3735     /* frsqrt7(qNaN) = canonical NaN */
3736     if (float64_is_quiet_nan(f, s)) {
3737         return float64_default_nan(s);
3738     }
3739 
3740     /* frsqrt7(+-0) = +-inf */
3741     if (float64_is_zero(f)) {
3742         s->float_exception_flags |= float_flag_divbyzero;
3743         return float64_set_sign(float64_infinity, sign);
3744     }
3745 
3746     /* frsqrt7(+inf) = +0 */
3747     if (float64_is_infinity(f) && !sign) {
3748         return float64_set_sign(float64_zero, sign);
3749     }
3750 
3751     /* +normal, +subnormal */
3752     uint64_t val = frsqrt7(f, exp_size, frac_size);
3753     return make_float64(val);
3754 }
3755 
3756 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3757 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3758 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3759 GEN_VEXT_V_ENV(vfrsqrt7_v_h)
3760 GEN_VEXT_V_ENV(vfrsqrt7_v_w)
3761 GEN_VEXT_V_ENV(vfrsqrt7_v_d)
3762 
3763 /*
3764  * Vector Floating-Point Reciprocal Estimate Instruction
3765  *
3766  * Adapted from riscv-v-spec recip.c:
3767  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3768  */
3769 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3770                       float_status *s)
3771 {
3772     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3773     uint64_t exp = extract64(f, frac_size, exp_size);
3774     uint64_t frac = extract64(f, 0, frac_size);
3775 
3776     const uint8_t lookup_table[] = {
3777         127, 125, 123, 121, 119, 117, 116, 114,
3778         112, 110, 109, 107, 105, 104, 102, 100,
3779         99, 97, 96, 94, 93, 91, 90, 88,
3780         87, 85, 84, 83, 81, 80, 79, 77,
3781         76, 75, 74, 72, 71, 70, 69, 68,
3782         66, 65, 64, 63, 62, 61, 60, 59,
3783         58, 57, 56, 55, 54, 53, 52, 51,
3784         50, 49, 48, 47, 46, 45, 44, 43,
3785         42, 41, 40, 40, 39, 38, 37, 36,
3786         35, 35, 34, 33, 32, 31, 31, 30,
3787         29, 28, 28, 27, 26, 25, 25, 24,
3788         23, 23, 22, 21, 21, 20, 19, 19,
3789         18, 17, 17, 16, 15, 15, 14, 14,
3790         13, 12, 12, 11, 11, 10, 9, 9,
3791         8, 8, 7, 7, 6, 5, 5, 4,
3792         4, 3, 3, 2, 2, 1, 1, 0
3793     };
3794     const int precision = 7;
3795 
3796     if (exp == 0 && frac != 0) { /* subnormal */
3797         /* Normalize the subnormal. */
3798         while (extract64(frac, frac_size - 1, 1) == 0) {
3799             exp--;
3800             frac <<= 1;
3801         }
3802 
3803         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3804 
3805         if (exp != 0 && exp != UINT64_MAX) {
3806             /*
3807              * Overflow to inf or max value of same sign,
3808              * depending on sign and rounding mode.
3809              */
3810             s->float_exception_flags |= (float_flag_inexact |
3811                                          float_flag_overflow);
3812 
3813             if ((s->float_rounding_mode == float_round_to_zero) ||
3814                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3815                 ((s->float_rounding_mode == float_round_up) && sign)) {
3816                 /* Return the largest-magnitude finite value of the same sign. */
3817                 return (sign << (exp_size + frac_size)) |
3818                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3819             } else {
3820                 /* Return +-inf. */
3821                 return (sign << (exp_size + frac_size)) |
3822                     MAKE_64BIT_MASK(frac_size, exp_size);
3823             }
3824         }
3825     }
3826 
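    /*
     * Look up the 7-bit reciprocal estimate with the top 7 fraction bits;
     * the output exponent is 2 * bias - 1 - exp, with
     * bias = 2^(exp_size - 1) - 1.  The subnormal-output case is fixed up
     * just below.
     */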
3827     int idx = frac >> (frac_size - precision);
3828     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3829                             (frac_size - precision);
3830     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3831 
3832     if (out_exp == 0 || out_exp == UINT64_MAX) {
3833         /*
3834          * The result is subnormal, but don't raise the underflow exception,
3835          * because there's no additional loss of precision.
3836          */
3837         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3838         if (out_exp == UINT64_MAX) {
3839             out_frac >>= 1;
3840             out_exp = 0;
3841         }
3842     }
3843 
3844     uint64_t val = 0;
3845     val = deposit64(val, 0, frac_size, out_frac);
3846     val = deposit64(val, frac_size, exp_size, out_exp);
3847     val = deposit64(val, frac_size + exp_size, 1, sign);
3848     return val;
3849 }
3850 
3851 static float16 frec7_h(float16 f, float_status *s)
3852 {
3853     int exp_size = 5, frac_size = 10;
3854     bool sign = float16_is_neg(f);
3855 
3856     /* frec7(+-inf) = +-0 */
3857     if (float16_is_infinity(f)) {
3858         return float16_set_sign(float16_zero, sign);
3859     }
3860 
3861     /* frec7(+-0) = +-inf */
3862     if (float16_is_zero(f)) {
3863         s->float_exception_flags |= float_flag_divbyzero;
3864         return float16_set_sign(float16_infinity, sign);
3865     }
3866 
3867     /* frec7(sNaN) = canonical NaN */
3868     if (float16_is_signaling_nan(f, s)) {
3869         s->float_exception_flags |= float_flag_invalid;
3870         return float16_default_nan(s);
3871     }
3872 
3873     /* frec7(qNaN) = canonical NaN */
3874     if (float16_is_quiet_nan(f, s)) {
3875         return float16_default_nan(s);
3876     }
3877 
3878     /* +-normal, +-subnormal */
3879     uint64_t val = frec7(f, exp_size, frac_size, s);
3880     return make_float16(val);
3881 }
3882 
3883 static float32 frec7_s(float32 f, float_status *s)
3884 {
3885     int exp_size = 8, frac_size = 23;
3886     bool sign = float32_is_neg(f);
3887 
3888     /* frec7(+-inf) = +-0 */
3889     if (float32_is_infinity(f)) {
3890         return float32_set_sign(float32_zero, sign);
3891     }
3892 
3893     /* frec7(+-0) = +-inf */
3894     if (float32_is_zero(f)) {
3895         s->float_exception_flags |= float_flag_divbyzero;
3896         return float32_set_sign(float32_infinity, sign);
3897     }
3898 
3899     /* frec7(sNaN) = canonical NaN */
3900     if (float32_is_signaling_nan(f, s)) {
3901         s->float_exception_flags |= float_flag_invalid;
3902         return float32_default_nan(s);
3903     }
3904 
3905     /* frec7(qNaN) = canonical NaN */
3906     if (float32_is_quiet_nan(f, s)) {
3907         return float32_default_nan(s);
3908     }
3909 
3910     /* +-normal, +-subnormal */
3911     uint64_t val = frec7(f, exp_size, frac_size, s);
3912     return make_float32(val);
3913 }
3914 
3915 static float64 frec7_d(float64 f, float_status *s)
3916 {
3917     int exp_size = 11, frac_size = 52;
3918     bool sign = float64_is_neg(f);
3919 
3920     /* frec7(+-inf) = +-0 */
3921     if (float64_is_infinity(f)) {
3922         return float64_set_sign(float64_zero, sign);
3923     }
3924 
3925     /* frec7(+-0) = +-inf */
3926     if (float64_is_zero(f)) {
3927         s->float_exception_flags |= float_flag_divbyzero;
3928         return float64_set_sign(float64_infinity, sign);
3929     }
3930 
3931     /* frec7(sNaN) = canonical NaN */
3932     if (float64_is_signaling_nan(f, s)) {
3933         s->float_exception_flags |= float_flag_invalid;
3934         return float64_default_nan(s);
3935     }
3936 
3937     /* frec7(qNaN) = canonical NaN */
3938     if (float64_is_quiet_nan(f, s)) {
3939         return float64_default_nan(s);
3940     }
3941 
3942     /* +-normal, +-subnormal */
3943     uint64_t val = frec7(f, exp_size, frac_size, s);
3944     return make_float64(val);
3945 }
3946 
3947 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3948 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3949 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3950 GEN_VEXT_V_ENV(vfrec7_v_h)
3951 GEN_VEXT_V_ENV(vfrec7_v_w)
3952 GEN_VEXT_V_ENV(vfrec7_v_d)
3953 
3954 /* Vector Floating-Point MIN/MAX Instructions */
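/*
 * float*_minimum_number/maximum_number implement the IEEE 754-2019
 * minimumNumber/maximumNumber operations: if exactly one operand is a
 * NaN, the other (numeric) operand is returned, which matches the RVV
 * vfmin/vfmax semantics.
 */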
3955 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3956 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3957 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3958 GEN_VEXT_VV_ENV(vfmin_vv_h)
3959 GEN_VEXT_VV_ENV(vfmin_vv_w)
3960 GEN_VEXT_VV_ENV(vfmin_vv_d)
3961 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3962 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3963 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3964 GEN_VEXT_VF(vfmin_vf_h)
3965 GEN_VEXT_VF(vfmin_vf_w)
3966 GEN_VEXT_VF(vfmin_vf_d)
3967 
3968 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3969 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3970 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3971 GEN_VEXT_VV_ENV(vfmax_vv_h)
3972 GEN_VEXT_VV_ENV(vfmax_vv_w)
3973 GEN_VEXT_VV_ENV(vfmax_vv_d)
3974 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3975 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3976 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3977 GEN_VEXT_VF(vfmax_vf_h)
3978 GEN_VEXT_VF(vfmax_vf_w)
3979 GEN_VEXT_VF(vfmax_vf_d)
3980 
3981 /* Vector Floating-Point Sign-Injection Instructions */
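/*
 * In these helpers a holds the magnitude (exponent and fraction) bits and
 * b supplies the sign: the result sign is sign(b) for vfsgnj, the
 * inverted sign for vfsgnjn, and sign(a) ^ sign(b) for vfsgnjx.  No FP
 * exception flags are raised.
 */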
3982 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3983 {
3984     return deposit64(b, 0, 15, a);
3985 }
3986 
3987 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3988 {
3989     return deposit64(b, 0, 31, a);
3990 }
3991 
3992 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3993 {
3994     return deposit64(b, 0, 63, a);
3995 }
3996 
3997 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3998 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3999 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4000 GEN_VEXT_VV_ENV(vfsgnj_vv_h)
4001 GEN_VEXT_VV_ENV(vfsgnj_vv_w)
4002 GEN_VEXT_VV_ENV(vfsgnj_vv_d)
4003 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4004 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4005 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4006 GEN_VEXT_VF(vfsgnj_vf_h)
4007 GEN_VEXT_VF(vfsgnj_vf_w)
4008 GEN_VEXT_VF(vfsgnj_vf_d)
4009 
4010 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4011 {
4012     return deposit64(~b, 0, 15, a);
4013 }
4014 
4015 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4016 {
4017     return deposit64(~b, 0, 31, a);
4018 }
4019 
4020 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4021 {
4022     return deposit64(~b, 0, 63, a);
4023 }
4024 
4025 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4026 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4027 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4028 GEN_VEXT_VV_ENV(vfsgnjn_vv_h)
4029 GEN_VEXT_VV_ENV(vfsgnjn_vv_w)
4030 GEN_VEXT_VV_ENV(vfsgnjn_vv_d)
4031 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4032 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4033 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4034 GEN_VEXT_VF(vfsgnjn_vf_h)
4035 GEN_VEXT_VF(vfsgnjn_vf_w)
4036 GEN_VEXT_VF(vfsgnjn_vf_d)
4037 
4038 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4039 {
4040     return deposit64(b ^ a, 0, 15, a);
4041 }
4042 
4043 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4044 {
4045     return deposit64(b ^ a, 0, 31, a);
4046 }
4047 
4048 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4049 {
4050     return deposit64(b ^ a, 0, 63, a);
4051 }
4052 
4053 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4054 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4055 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4056 GEN_VEXT_VV_ENV(vfsgnjx_vv_h)
4057 GEN_VEXT_VV_ENV(vfsgnjx_vv_w)
4058 GEN_VEXT_VV_ENV(vfsgnjx_vv_d)
4059 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4060 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4061 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4062 GEN_VEXT_VF(vfsgnjx_vf_h)
4063 GEN_VEXT_VF(vfsgnjx_vf_w)
4064 GEN_VEXT_VF(vfsgnjx_vf_d)
4065 
4066 /* Vector Floating-Point Compare Instructions */
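/*
 * Compare results are written as a mask.  vmfeq/vmfne use the quiet
 * compare predicates, whereas vmflt/vmfle/vmfgt/vmfge below use the
 * signalling variants, which raise the invalid flag when either operand
 * is a NaN.
 */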
4067 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4068 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4069                   CPURISCVState *env, uint32_t desc)          \
4070 {                                                             \
4071     uint32_t vm = vext_vm(desc);                              \
4072     uint32_t vl = env->vl;                                    \
4073     uint32_t i;                                               \
4074                                                               \
4075     for (i = env->vstart; i < vl; i++) {                      \
4076         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4077         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4078         if (!vm && !vext_elem_mask(v0, i)) {                  \
4079             continue;                                         \
4080         }                                                     \
4081         vext_set_elem_mask(vd, i,                             \
4082                            DO_OP(s2, s1, &env->fp_status));   \
4083     }                                                         \
4084     env->vstart = 0;                                          \
4085 }
4086 
4087 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4088 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4089 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4090 
4091 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4092 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4093                   CPURISCVState *env, uint32_t desc)                \
4094 {                                                                   \
4095     uint32_t vm = vext_vm(desc);                                    \
4096     uint32_t vl = env->vl;                                          \
4097     uint32_t i;                                                     \
4098                                                                     \
4099     for (i = env->vstart; i < vl; i++) {                            \
4100         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4101         if (!vm && !vext_elem_mask(v0, i)) {                        \
4102             continue;                                               \
4103         }                                                           \
4104         vext_set_elem_mask(vd, i,                                   \
4105                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4106     }                                                               \
4107     env->vstart = 0;                                                \
4108 }
4109 
4110 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4111 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4112 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4113 
4114 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4115 {
4116     FloatRelation compare = float16_compare_quiet(a, b, s);
4117     return compare != float_relation_equal;
4118 }
4119 
4120 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4121 {
4122     FloatRelation compare = float32_compare_quiet(a, b, s);
4123     return compare != float_relation_equal;
4124 }
4125 
4126 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4127 {
4128     FloatRelation compare = float64_compare_quiet(a, b, s);
4129     return compare != float_relation_equal;
4130 }
4131 
4132 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4133 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4134 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4135 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4136 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4137 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4138 
4139 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4140 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4141 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4142 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4143 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4144 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4145 
4146 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4147 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4148 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4149 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4150 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4151 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4152 
4153 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4154 {
4155     FloatRelation compare = float16_compare(a, b, s);
4156     return compare == float_relation_greater;
4157 }
4158 
4159 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4160 {
4161     FloatRelation compare = float32_compare(a, b, s);
4162     return compare == float_relation_greater;
4163 }
4164 
4165 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4166 {
4167     FloatRelation compare = float64_compare(a, b, s);
4168     return compare == float_relation_greater;
4169 }
4170 
4171 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4172 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4173 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4174 
4175 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4176 {
4177     FloatRelation compare = float16_compare(a, b, s);
4178     return compare == float_relation_greater ||
4179            compare == float_relation_equal;
4180 }
4181 
4182 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4183 {
4184     FloatRelation compare = float32_compare(a, b, s);
4185     return compare == float_relation_greater ||
4186            compare == float_relation_equal;
4187 }
4188 
4189 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4190 {
4191     FloatRelation compare = float64_compare(a, b, s);
4192     return compare == float_relation_greater ||
4193            compare == float_relation_equal;
4194 }
4195 
4196 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4197 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4198 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4199 
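/*
 * Note on the predicates above: vmfeq/vmfne are built on the quiet
 * compare primitives (float16_eq_quiet, floatN_compare_quiet), which
 * only raise the invalid flag for signaling NaN inputs, while
 * vmflt/vmfle/vmfgt/vmfge use the signaling variants (floatN_lt,
 * floatN_le, floatN_compare), which raise invalid for any NaN operand.
 * A NaN operand makes the ordered predicates false (mask bit 0), whereas
 * vmfne treats unordered as not-equal and writes a 1.
 */
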
4200 /* Vector Floating-Point Classify Instruction */
4201 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4202 static void do_##NAME(void *vd, void *vs2, int i)      \
4203 {                                                      \
4204     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4205     *((TD *)vd + HD(i)) = OP(s2);                      \
4206 }
4207 
4208 #define GEN_VEXT_V(NAME)                               \
4209 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4210                   CPURISCVState *env, uint32_t desc)   \
4211 {                                                      \
4212     uint32_t vm = vext_vm(desc);                       \
4213     uint32_t vl = env->vl;                             \
4214     uint32_t i;                                        \
4215                                                        \
4216     for (i = env->vstart; i < vl; i++) {               \
4217         if (!vm && !vext_elem_mask(v0, i)) {           \
4218             continue;                                  \
4219         }                                              \
4220         do_##NAME(vd, vs2, i);                         \
4221     }                                                  \
4222     env->vstart = 0;                                   \
4223 }
4224 
4225 target_ulong fclass_h(uint64_t frs1)
4226 {
4227     float16 f = frs1;
4228     bool sign = float16_is_neg(f);
4229 
4230     if (float16_is_infinity(f)) {
4231         return sign ? 1 << 0 : 1 << 7;
4232     } else if (float16_is_zero(f)) {
4233         return sign ? 1 << 3 : 1 << 4;
4234     } else if (float16_is_zero_or_denormal(f)) {
4235         return sign ? 1 << 2 : 1 << 5;
4236     } else if (float16_is_any_nan(f)) {
4237         float_status s = { }; /* for snan_bit_is_one */
4238         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4239     } else {
4240         return sign ? 1 << 1 : 1 << 6;
4241     }
4242 }
4243 
4244 target_ulong fclass_s(uint64_t frs1)
4245 {
4246     float32 f = frs1;
4247     bool sign = float32_is_neg(f);
4248 
4249     if (float32_is_infinity(f)) {
4250         return sign ? 1 << 0 : 1 << 7;
4251     } else if (float32_is_zero(f)) {
4252         return sign ? 1 << 3 : 1 << 4;
4253     } else if (float32_is_zero_or_denormal(f)) {
4254         return sign ? 1 << 2 : 1 << 5;
4255     } else if (float32_is_any_nan(f)) {
4256         float_status s = { }; /* for snan_bit_is_one */
4257         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4258     } else {
4259         return sign ? 1 << 1 : 1 << 6;
4260     }
4261 }
4262 
4263 target_ulong fclass_d(uint64_t frs1)
4264 {
4265     float64 f = frs1;
4266     bool sign = float64_is_neg(f);
4267 
4268     if (float64_is_infinity(f)) {
4269         return sign ? 1 << 0 : 1 << 7;
4270     } else if (float64_is_zero(f)) {
4271         return sign ? 1 << 3 : 1 << 4;
4272     } else if (float64_is_zero_or_denormal(f)) {
4273         return sign ? 1 << 2 : 1 << 5;
4274     } else if (float64_is_any_nan(f)) {
4275         float_status s = { }; /* for snan_bit_is_one */
4276         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4277     } else {
4278         return sign ? 1 << 1 : 1 << 6;
4279     }
4280 }
4281 
4282 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4283 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4284 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4285 GEN_VEXT_V(vfclass_v_h)
4286 GEN_VEXT_V(vfclass_v_w)
4287 GEN_VEXT_V(vfclass_v_d)
4288 
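/*
 * The fclass result uses the scalar FCLASS bit encoding, one bit set
 * per element:
 *   bit 0: -infinity     bit 5: +subnormal
 *   bit 1: -normal       bit 6: +normal
 *   bit 2: -subnormal    bit 7: +infinity
 *   bit 3: -0            bit 8: signaling NaN
 *   bit 4: +0            bit 9: quiet NaN
 */
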
4289 /* Vector Floating-Point Merge Instruction */
4290 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4291 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4292                   CPURISCVState *env, uint32_t desc)          \
4293 {                                                             \
4294     uint32_t vm = vext_vm(desc);                              \
4295     uint32_t vl = env->vl;                                    \
4296     uint32_t i;                                               \
4297                                                               \
4298     for (i = env->vstart; i < vl; i++) {                      \
4299         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4300         *((ETYPE *)vd + H(i))                                 \
4301           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4302     }                                                         \
4303     env->vstart = 0;                                          \
4304 }
4305 
4306 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4307 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4308 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4309 
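/*
 * Example (vfmerge.vfm vd, vs2, rs1, v0.t), vl = 4:
 *   v0 mask bits:  1 0 0 1
 *   vs2:           a b c d
 *   vd:            f[rs1] b c f[rs1]
 * With vm = 1 every element receives the scalar f[rs1].
 */
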
4310 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4311 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4312 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4313 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4314 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4315 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h)
4316 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w)
4317 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d)
4318 
4319 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4320 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4321 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4322 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4323 GEN_VEXT_V_ENV(vfcvt_x_f_v_h)
4324 GEN_VEXT_V_ENV(vfcvt_x_f_v_w)
4325 GEN_VEXT_V_ENV(vfcvt_x_f_v_d)
4326 
4327 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4328 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4329 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4330 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4331 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h)
4332 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w)
4333 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d)
4334 
4335 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4336 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4337 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4338 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4339 GEN_VEXT_V_ENV(vfcvt_f_x_v_h)
4340 GEN_VEXT_V_ENV(vfcvt_f_x_v_w)
4341 GEN_VEXT_V_ENV(vfcvt_f_x_v_d)
4342 
4343 /* Widening Floating-Point/Integer Type-Convert Instructions */
4344 /* (TD, T2, TX2) */
4345 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4346 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4347 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4348 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4349 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4350 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4351 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h)
4352 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w)
4353 
4354 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4355 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4356 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4357 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h)
4358 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w)
4359 
4360 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4361 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4362 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4363 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4364 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b)
4365 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h)
4366 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w)
4367 
4368 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4369 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4370 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4371 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4372 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b)
4373 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h)
4374 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w)
4375 
4376 /*
4377  * vfwcvt.f.f.v vd, vs2, vm
4378  * Convert single-width float to double-width float.
4379  */
4380 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4381 {
4382     return float16_to_float32(a, true, s);
4383 }
4384 
4385 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4386 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4387 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h)
4388 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w)
4389 
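/*
 * The boolean passed to float16_to_float32() selects IEEE half precision
 * (rather than the ARM alternative-half format).  Widening binary16 to
 * binary32, and the integer-to-wider-float conversions above, are exact;
 * only vfwcvt.x.f.v/vfwcvt.xu.f.v can round, using the rounding mode
 * currently held in env->fp_status.
 */
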
4390 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4391 /* (TD, T2, TX2) */
4392 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4393 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4394 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4395 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer */
4396 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4397 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4398 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4399 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b)
4400 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h)
4401 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w)
4402 
4403 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4404 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4405 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4406 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4407 GEN_VEXT_V_ENV(vfncvt_x_f_w_b)
4408 GEN_VEXT_V_ENV(vfncvt_x_f_w_h)
4409 GEN_VEXT_V_ENV(vfncvt_x_f_w_w)
4410 
4411 /* vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float */
4412 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4413 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4414 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h)
4415 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w)
4416 
4417 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4418 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4419 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4420 GEN_VEXT_V_ENV(vfncvt_f_x_w_h)
4421 GEN_VEXT_V_ENV(vfncvt_f_x_w_w)
4422 
4423 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width. */
4424 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4425 {
4426     return float32_to_float16(a, true, s);
4427 }
4428 
4429 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4430 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4431 GEN_VEXT_V_ENV(vfncvt_f_f_w_h)
4432 GEN_VEXT_V_ENV(vfncvt_f_f_w_w)
4433 
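/*
 * The "_w" suffix in the helper names reflects that the source operand
 * has EEW = 2*SEW (see the NOP_UU_* tuples: vfncvt_f_f_w_h reads
 * uint32_t elements and writes uint16_t ones).  Out-of-range results for
 * the integer destinations are saturated by softfloat, which also raises
 * the invalid flag, matching the RISC-V conversion rules.
 */
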
4434 /*
4435  *** Vector Reduction Operations
4436  */
4437 /* Vector Single-Width Integer Reduction Instructions */
4438 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4439 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4440         void *vs2, CPURISCVState *env, uint32_t desc)     \
4441 {                                                         \
4442     uint32_t vm = vext_vm(desc);                          \
4443     uint32_t vl = env->vl;                                \
4444     uint32_t i;                                           \
4445     TD s1 =  *((TD *)vs1 + HD(0));                        \
4446                                                           \
4447     for (i = env->vstart; i < vl; i++) {                  \
4448         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4449         if (!vm && !vext_elem_mask(v0, i)) {              \
4450             continue;                                     \
4451         }                                                 \
4452         s1 = OP(s1, (TD)s2);                              \
4453     }                                                     \
4454     *((TD *)vd + HD(0)) = s1;                             \
4455     env->vstart = 0;                                      \
4456 }
4457 
4458 /* vd[0] = sum(vs1[0], vs2[*]) */
4459 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4460 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4461 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4462 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4463 
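/*
 * Example (vredsum.vs vd, vs2, vs1, v0.t), SEW = 32, vl = 4:
 *   vs1[0] = 10
 *   vs2    = {1, 2, 3, 4}
 *   v0     = 1 0 1 1      (element 1 inactive, so vs2[1] is not added)
 *   vd[0]  = 10 + 1 + 3 + 4 = 18
 * Only element 0 of vd is written by the helper.
 */
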
4464 /* vd[0] = maxu(vs1[0], vs2[*]) */
4465 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4466 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4467 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4468 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4469 
4470 /* vd[0] = max(vs1[0], vs2[*]) */
4471 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4472 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4473 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4474 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4475 
4476 /* vd[0] = minu(vs1[0], vs2[*]) */
4477 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4478 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4479 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4480 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4481 
4482 /* vd[0] = min(vs1[0], vs2[*]) */
4483 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4484 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4485 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4486 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4487 
4488 /* vd[0] = and(vs1[0], vs2[*]) */
4489 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4490 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4491 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4492 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4493 
4494 /* vd[0] = or(vs1[0], vs2[*]) */
4495 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4496 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4497 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4498 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4499 
4500 /* vd[0] = xor(vs1[0], vs2[*]) */
4501 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4502 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4503 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4504 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4505 
4506 /* Vector Widening Integer Reduction Instructions */
4507 /* signed sum reduction into double-width accumulator */
4508 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4509 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4510 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4511 
4512 /* Unsigned sum reduction into double-width accumulator */
4513 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4514 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4515 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4516 
4517 /* Vector Single-Width Floating-Point Reduction Instructions */
4518 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4519 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4520                   void *vs2, CPURISCVState *env,           \
4521                   uint32_t desc)                           \
4522 {                                                          \
4523     uint32_t vm = vext_vm(desc);                           \
4524     uint32_t vl = env->vl;                                 \
4525     uint32_t i;                                            \
4526     TD s1 =  *((TD *)vs1 + HD(0));                         \
4527                                                            \
4528     for (i = env->vstart; i < vl; i++) {                   \
4529         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4530         if (!vm && !vext_elem_mask(v0, i)) {               \
4531             continue;                                      \
4532         }                                                  \
4533         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4534     }                                                      \
4535     *((TD *)vd + HD(0)) = s1;                              \
4536     env->vstart = 0;                                       \
4537 }
4538 
4539 /* Unordered sum */
4540 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4541 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4542 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4543 
4544 /* Maximum value */
4545 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4546 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4547 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4548 
4549 /* Minimum value */
4550 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4551 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4552 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4553 
4554 /* Vector Widening Floating-Point Reduction Instructions */
4555 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4556 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4557                             void *vs2, CPURISCVState *env, uint32_t desc)
4558 {
4559     uint32_t vm = vext_vm(desc);
4560     uint32_t vl = env->vl;
4561     uint32_t i;
4562     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4563 
4564     for (i = env->vstart; i < vl; i++) {
4565         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4566         if (!vm && !vext_elem_mask(v0, i)) {
4567             continue;
4568         }
4569         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4570                          &env->fp_status);
4571     }
4572     *((uint32_t *)vd + H4(0)) = s1;
4573     env->vstart = 0;
4574 }
4575 
4576 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4577                             void *vs2, CPURISCVState *env, uint32_t desc)
4578 {
4579     uint32_t vm = vext_vm(desc);
4580     uint32_t vl = env->vl;
4581     uint32_t i;
4582     uint64_t s1 =  *((uint64_t *)vs1);
4583 
4584     for (i = env->vstart; i < vl; i++) {
4585         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4586         if (!vm && !vext_elem_mask(v0, i)) {
4587             continue;
4588         }
4589         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4590                          &env->fp_status);
4591     }
4592     *((uint64_t *)vd) = s1;
4593     env->vstart = 0;
4594 }
4595 
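/*
 * Each SEW-wide source element is promoted to 2*SEW before being added
 * (float16_to_float32/float32_to_float64, both exact), so rounding only
 * occurs in the additions themselves, one per active element, and the
 * accumulator vs1[0] and result vd[0] are 2*SEW wide.
 */
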
4596 /*
4597  *** Vector Mask Operations
4598  */
4599 /* Vector Mask-Register Logical Instructions */
4600 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4601 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4602                   void *vs2, CPURISCVState *env,          \
4603                   uint32_t desc)                          \
4604 {                                                         \
4605     uint32_t vl = env->vl;                                \
4606     uint32_t i;                                           \
4607     int a, b;                                             \
4608                                                           \
4609     for (i = env->vstart; i < vl; i++) {                  \
4610         a = vext_elem_mask(vs1, i);                       \
4611         b = vext_elem_mask(vs2, i);                       \
4612         vext_set_elem_mask(vd, i, OP(b, a));              \
4613     }                                                     \
4614     env->vstart = 0;                                      \
4615 }
4616 
4617 #define DO_NAND(N, M)  (!(N & M))
4618 #define DO_ANDNOT(N, M)  (N & !M)
4619 #define DO_NOR(N, M)  (!(N | M))
4620 #define DO_ORNOT(N, M)  (N | !M)
4621 #define DO_XNOR(N, M)  (!(N ^ M))
4622 
4623 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4624 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4625 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4626 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4627 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4628 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4629 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4630 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4631 
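/*
 * vext_elem_mask() returns 0 or 1, so logical '!' doubles as a one-bit
 * complement.  With a = vs1 bit and b = vs2 bit, OP(b, a) follows the
 * spec operand order, e.g. vmandn.mm computes vs2 & ~vs1 and vmorn.mm
 * computes vs2 | ~vs1:
 *   vs2 bit:  0 0 1 1
 *   vs1 bit:  0 1 0 1
 *   vmandn:   0 0 1 0
 *   vmorn:    1 0 1 1
 */
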
4632 /* Vector count population in mask vcpop */
4633 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4634                              uint32_t desc)
4635 {
4636     target_ulong cnt = 0;
4637     uint32_t vm = vext_vm(desc);
4638     uint32_t vl = env->vl;
4639     int i;
4640 
4641     for (i = env->vstart; i < vl; i++) {
4642         if (vm || vext_elem_mask(v0, i)) {
4643             if (vext_elem_mask(vs2, i)) {
4644                 cnt++;
4645             }
4646         }
4647     }
4648     env->vstart = 0;
4649     return cnt;
4650 }
4651 
4652 /* vfirst find-first-set mask bit */
4653 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4654                               uint32_t desc)
4655 {
4656     uint32_t vm = vext_vm(desc);
4657     uint32_t vl = env->vl;
4658     int i;
4659 
4660     for (i = env->vstart; i < vl; i++) {
4661         if (vm || vext_elem_mask(v0, i)) {
4662             if (vext_elem_mask(vs2, i)) {
4663                 return i;
4664             }
4665         }
4666     }
4667     env->vstart = 0;
4668     return -1LL;
4669 }
4670 
4671 enum set_mask_type {
4672     ONLY_FIRST = 1,
4673     INCLUDE_FIRST,
4674     BEFORE_FIRST,
4675 };
4676 
4677 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4678                    uint32_t desc, enum set_mask_type type)
4679 {
4680     uint32_t vm = vext_vm(desc);
4681     uint32_t vl = env->vl;
4682     int i;
4683     bool first_mask_bit = false;
4684 
4685     for (i = env->vstart; i < vl; i++) {
4686         if (!vm && !vext_elem_mask(v0, i)) {
4687             continue;
4688         }
4689         /* write a zero to all following active elements */
4690         if (first_mask_bit) {
4691             vext_set_elem_mask(vd, i, 0);
4692             continue;
4693         }
4694         if (vext_elem_mask(vs2, i)) {
4695             first_mask_bit = true;
4696             if (type == BEFORE_FIRST) {
4697                 vext_set_elem_mask(vd, i, 0);
4698             } else {
4699                 vext_set_elem_mask(vd, i, 1);
4700             }
4701         } else {
4702             if (type == ONLY_FIRST) {
4703                 vext_set_elem_mask(vd, i, 0);
4704             } else {
4705                 vext_set_elem_mask(vd, i, 1);
4706             }
4707         }
4708     }
4709     env->vstart = 0;
4710 }
4711 
4712 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4713                      uint32_t desc)
4714 {
4715     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4716 }
4717 
4718 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4719                      uint32_t desc)
4720 {
4721     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4722 }
4723 
4724 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4725                      uint32_t desc)
4726 {
4727     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4728 }
4729 
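/*
 * Example (unmasked, vl = 8), where vs2 holds the source mask:
 *   vs2:    0 0 1 0 1 1 0 0
 *   vmsbf:  1 1 0 0 0 0 0 0    (set Before the First set bit)
 *   vmsif:  1 1 1 0 0 0 0 0    (set Including the First set bit)
 *   vmsof:  0 0 1 0 0 0 0 0    (set Only the First set bit)
 */
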
4730 /* Vector Iota Instruction */
4731 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4732 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4733                   uint32_t desc)                                          \
4734 {                                                                         \
4735     uint32_t vm = vext_vm(desc);                                          \
4736     uint32_t vl = env->vl;                                                \
4737     uint32_t sum = 0;                                                     \
4738     int i;                                                                \
4739                                                                           \
4740     for (i = env->vstart; i < vl; i++) {                                  \
4741         if (!vm && !vext_elem_mask(v0, i)) {                              \
4742             continue;                                                     \
4743         }                                                                 \
4744         *((ETYPE *)vd + H(i)) = sum;                                      \
4745         if (vext_elem_mask(vs2, i)) {                                     \
4746             sum++;                                                        \
4747         }                                                                 \
4748     }                                                                     \
4749     env->vstart = 0;                                                      \
4750 }
4751 
4752 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4753 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4754 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4755 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4756 
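/*
 * Example (viota.m vd, vs2, unmasked), vl = 8:
 *   vs2 bits:  1 0 0 1 1 0 1 0
 *   vd:        0 1 1 1 2 3 3 4
 * i.e. vd[i] is the number of set bits in vs2[0..i-1].  When a mask is
 * supplied, inactive elements are neither written nor counted.
 */
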
4757 /* Vector Element Index Instruction */
4758 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4759 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4760 {                                                                         \
4761     uint32_t vm = vext_vm(desc);                                          \
4762     uint32_t vl = env->vl;                                                \
4763     int i;                                                                \
4764                                                                           \
4765     for (i = env->vstart; i < vl; i++) {                                  \
4766         if (!vm && !vext_elem_mask(v0, i)) {                              \
4767             continue;                                                     \
4768         }                                                                 \
4769         *((ETYPE *)vd + H(i)) = i;                                        \
4770     }                                                                     \
4771     env->vstart = 0;                                                      \
4772 }
4773 
4774 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4775 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4776 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4777 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4778 
4779 /*
4780  *** Vector Permutation Instructions
4781  */
4782 
4783 /* Vector Slide Instructions */
4784 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4785 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4786                   CPURISCVState *env, uint32_t desc)                      \
4787 {                                                                         \
4788     uint32_t vm = vext_vm(desc);                                          \
4789     uint32_t vl = env->vl;                                                \
4790     target_ulong offset = s1, i_min, i;                                   \
4791                                                                           \
4792     i_min = MAX(env->vstart, offset);                                     \
4793     for (i = i_min; i < vl; i++) {                                        \
4794         if (!vm && !vext_elem_mask(v0, i)) {                              \
4795             continue;                                                     \
4796         }                                                                 \
4797         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4798     }                                                                     \
4799 }
4800 
4801 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4802 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4803 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4804 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4805 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4806 
4807 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4808 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4809                   CPURISCVState *env, uint32_t desc)                      \
4810 {                                                                         \
4811     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4812     uint32_t vm = vext_vm(desc);                                          \
4813     uint32_t vl = env->vl;                                                \
4814     target_ulong i_max, i;                                                \
4815                                                                           \
4816     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4817     for (i = env->vstart; i < i_max; ++i) {                               \
4818         if (vm || vext_elem_mask(v0, i)) {                                \
4819             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4820         }                                                                 \
4821     }                                                                     \
4822                                                                           \
4823     for (i = i_max; i < vl; ++i) {                                        \
4824         if (vm || vext_elem_mask(v0, i)) {                                \
4825             *((ETYPE *)vd + H(i)) = 0;                                    \
4826         }                                                                 \
4827     }                                                                     \
4828                                                                           \
4829     env->vstart = 0;                                                      \
4830 }
4831 
4832 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4833 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4834 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4835 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4836 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4837 
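/*
 * Example (OFFSET = 2, VLMAX = vl = 6, unmasked),
 * vs2 = {a, b, c, d, e, f}:
 *   vslideup.vx:    vd = {vd[0], vd[1], a, b, c, d}  (vd[0..1] untouched)
 *   vslidedown.vx:  vd = {c, d, e, f, 0, 0}  (source index >= VLMAX reads 0)
 */
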
4838 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
4839 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4840                      void *vs2, CPURISCVState *env, uint32_t desc)          \
4841 {                                                                           \
4842     typedef uint##BITWIDTH##_t ETYPE;                                       \
4843     uint32_t vm = vext_vm(desc);                                            \
4844     uint32_t vl = env->vl;                                                  \
4845     uint32_t i;                                                             \
4846                                                                             \
4847     for (i = env->vstart; i < vl; i++) {                                    \
4848         if (!vm && !vext_elem_mask(v0, i)) {                                \
4849             continue;                                                       \
4850         }                                                                   \
4851         if (i == 0) {                                                       \
4852             *((ETYPE *)vd + H(i)) = s1;                                     \
4853         } else {                                                            \
4854             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4855         }                                                                   \
4856     }                                                                       \
4857     env->vstart = 0;                                                        \
4858 }
4859 
4860 GEN_VEXT_VSLIDE1UP(8,  H1)
4861 GEN_VEXT_VSLIDE1UP(16, H2)
4862 GEN_VEXT_VSLIDE1UP(32, H4)
4863 GEN_VEXT_VSLIDE1UP(64, H8)
4864 
4865 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4866 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4867                   CPURISCVState *env, uint32_t desc)              \
4868 {                                                                 \
4869     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4870 }
4871 
4872 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4873 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4874 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4875 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4876 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4877 
4878 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4879 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4880                        void *vs2, CPURISCVState *env, uint32_t desc)          \
4881 {                                                                             \
4882     typedef uint##BITWIDTH##_t ETYPE;                                         \
4883     uint32_t vm = vext_vm(desc);                                              \
4884     uint32_t vl = env->vl;                                                    \
4885     uint32_t i;                                                               \
4886                                                                               \
4887     for (i = env->vstart; i < vl; i++) {                                      \
4888         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4889             continue;                                                         \
4890         }                                                                     \
4891         if (i == vl - 1) {                                                    \
4892             *((ETYPE *)vd + H(i)) = s1;                                       \
4893         } else {                                                              \
4894             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4895         }                                                                     \
4896     }                                                                         \
4897     env->vstart = 0;                                                          \
4898 }
4899 
4900 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4901 GEN_VEXT_VSLIDE1DOWN(16, H2)
4902 GEN_VEXT_VSLIDE1DOWN(32, H4)
4903 GEN_VEXT_VSLIDE1DOWN(64, H8)
4904 
4905 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4906 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4907                   CPURISCVState *env, uint32_t desc)              \
4908 {                                                                 \
4909     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4910 }
4911 
4912 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4913 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4914 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4915 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4916 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4917 
4918 /* Vector Floating-Point Slide Instructions */
4919 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4920 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4921                   CPURISCVState *env, uint32_t desc)          \
4922 {                                                             \
4923     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4924 }
4925 
4926 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4927 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4928 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4929 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4930 
4931 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4932 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4933                   CPURISCVState *env, uint32_t desc)          \
4934 {                                                             \
4935     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4936 }
4937 
4938 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4939 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4940 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4941 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4942 
4943 /* Vector Register Gather Instruction */
4944 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4945 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4946                   CPURISCVState *env, uint32_t desc)                      \
4947 {                                                                         \
4948     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4949     uint32_t vm = vext_vm(desc);                                          \
4950     uint32_t vl = env->vl;                                                \
4951     uint64_t index;                                                       \
4952     uint32_t i;                                                           \
4953                                                                           \
4954     for (i = env->vstart; i < vl; i++) {                                  \
4955         if (!vm && !vext_elem_mask(v0, i)) {                              \
4956             continue;                                                     \
4957         }                                                                 \
4958         index = *((TS1 *)vs1 + HS1(i));                                   \
4959         if (index >= vlmax) {                                             \
4960             *((TS2 *)vd + HS2(i)) = 0;                                    \
4961         } else {                                                          \
4962             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
4963         }                                                                 \
4964     }                                                                     \
4965     env->vstart = 0;                                                      \
4966 }
4967 
4968 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
4969 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
4970 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
4971 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
4972 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
4973 
4974 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
4975 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
4976 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
4977 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
4978 
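/*
 * Example (vrgather.vv vd, vs2, vs1, unmasked), VLMAX = vl = 4:
 *   vs2 = {a, b, c, d}
 *   vs1 = {3, 0, 9, 1}
 *   vd  = {d, a, 0, b}     (index 9 >= VLMAX selects 0)
 * vrgatherei16 behaves the same but always reads 16-bit indices from
 * vs1, independent of SEW.
 */
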
4979 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
4980 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4981                   CPURISCVState *env, uint32_t desc)                      \
4982 {                                                                         \
4983     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4984     uint32_t vm = vext_vm(desc);                                          \
4985     uint32_t vl = env->vl;                                                \
4986     uint64_t index = s1;                                                  \
4987     uint32_t i;                                                           \
4988                                                                           \
4989     for (i = env->vstart; i < vl; i++) {                                  \
4990         if (!vm && !vext_elem_mask(v0, i)) {                              \
4991             continue;                                                     \
4992         }                                                                 \
4993         if (index >= vlmax) {                                             \
4994             *((ETYPE *)vd + H(i)) = 0;                                    \
4995         } else {                                                          \
4996             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
4997         }                                                                 \
4998     }                                                                     \
4999     env->vstart = 0;                                                      \
5000 }
5001 
5002 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5003 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5004 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5005 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5006 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5007 
5008 /* Vector Compress Instruction */
5009 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5010 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5011                   CPURISCVState *env, uint32_t desc)                      \
5012 {                                                                         \
5013     uint32_t vl = env->vl;                                                \
5014     uint32_t num = 0, i;                                                  \
5015                                                                           \
5016     for (i = env->vstart; i < vl; i++) {                                  \
5017         if (!vext_elem_mask(vs1, i)) {                                    \
5018             continue;                                                     \
5019         }                                                                 \
5020         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5021         num++;                                                            \
5022     }                                                                     \
5023     env->vstart = 0;                                                      \
5024 }
5025 
5026 /* Compress into vd elements of vs2 where vs1 is enabled */
5027 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5028 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5029 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5030 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5031 
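/*
 * Example (vcompress.vm vd, vs2, vs1), vl = 6:
 *   vs1 bits:  1 0 1 1 0 1
 *   vs2:       a b c d e f
 *   vd:        a c d f          (elements 4..5 of vd are left untouched)
 */
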
5032 /* Vector Whole Register Move */
5033 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5034 {
5035     /* EEW = SEW */
5036     uint32_t maxsz = simd_maxsz(desc);
5037     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5038     uint32_t startb = env->vstart * sewb;
5039     uint32_t i = startb;
5040 
5041     memcpy((uint8_t *)vd + H1(i),
5042            (uint8_t *)vs2 + H1(i),
5043            maxsz - startb);
5044 
5045     env->vstart = 0;
5046 }
5047 
5048 /* Vector Integer Extension */
5049 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5050 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5051                   CPURISCVState *env, uint32_t desc)             \
5052 {                                                                \
5053     uint32_t vl = env->vl;                                       \
5054     uint32_t vm = vext_vm(desc);                                 \
5055     uint32_t i;                                                  \
5056                                                                  \
5057     for (i = env->vstart; i < vl; i++) {                         \
5058         if (!vm && !vext_elem_mask(v0, i)) {                     \
5059             continue;                                            \
5060         }                                                        \
5061         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5062     }                                                            \
5063     env->vstart = 0;                                             \
5064 }
5065 
5066 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5067 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5068 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5069 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5070 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5071 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5072 
5073 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5074 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5075 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5076 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5077 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5078 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5079
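/*
 * Example (vf2, SEW = 16): each destination element is the source
 * element widened in place, e.g. for a source byte of 0x80:
 *   vzext.vf2:  vd[i] = 0x0080
 *   vsext.vf2:  vd[i] = 0xff80
 * The vf4/vf8 variants read EEW = SEW/4 and SEW/8 sources respectively.
 */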