xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 5c19fc15)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
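
/*
 * For example, with VLEN = 128, a vtype of SEW = 32 and LMUL = 2 gives
 * VLMAX = VLEN / SEW * LMUL = 8, so a requested AVL of 10 is clamped to
 * vl = 8 while an AVL of 5 is returned unchanged.
 */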
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that needs a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
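
/*
 * For example, on a big-endian host H1(0) == 7 and H1(7) == 0: byte
 * element 0 lives in the least significant byte of its 64-bit chunk,
 * which a big-endian host stores last.  H8 is the identity because
 * 64-bit elements need no fixup.
 */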
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
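
/*
 * For example, vlmul = 0b111 (LMUL = 1/2) sign-extends to lmul = -1,
 * while vlmul = 0b011 (LMUL = 8) stays 3, matching the table above.
 */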
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vta_all_1s(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
133 }
134 
135 /*
136  * Get the maximum number of elements that can be operated on.
137  *
138  * log2_esz: log2 of element size in bytes.
139  */
140 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
141 {
142     /*
143      * As simd_desc supports at most 2048 bytes and the max vlen is 1024 bits,
144      * vlen in bytes (vlenb) is encoded as maxsz.
145      */
146     uint32_t vlenb = simd_maxsz(desc);
147 
148     /* Return VLMAX */
149     int scale = vext_lmul(desc) - log2_esz;
150     return scale < 0 ? vlenb >> -scale : vlenb << scale;
151 }
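
/*
 * For example, with vlenb = 16 (VLEN = 128), LMUL = 2 (lmul = 1) and
 * 16-bit elements (log2_esz = 1), scale = 0 and VLMAX = 16 elements.
 */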
152 
153 /*
154  * Get the total number of elements, including prestart, body and tail elements.
155  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
156  * are held in the same vector register.
157  */
158 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
159                                             uint32_t esz)
160 {
161     uint32_t vlenb = simd_maxsz(desc);
162     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
163     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
164                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
165     return (vlenb << emul) / esz;
166 }
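
/*
 * For example, with vlenb = 16, SEW = 32 and LMUL = 2, an access with
 * esz = 4 has emul = 1 and 8 total elements, i.e. the two registers of
 * the group measured in 32-bit elements.
 */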
167 
168 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
169 {
170     return (addr & env->cur_pmmask) | env->cur_pmbase;
171 }
172 
173 /*
174  * This function checks watchpoints before the real load operation.
175  *
176  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
177  * In user mode, there is no watchpoint support now.
178  *
179  * It will trigger an exception if there is no mapping in the TLB
180  * and the page table walk can't fill the TLB entry. Then the guest
181  * software can return here after processing the exception, or never return.
182  */
183 static void probe_pages(CPURISCVState *env, target_ulong addr,
184                         target_ulong len, uintptr_t ra,
185                         MMUAccessType access_type)
186 {
187     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
188     target_ulong curlen = MIN(pagelen, len);
189 
190     probe_access(env, adjust_addr(env, addr), curlen, access_type,
191                  cpu_mmu_index(env, false), ra);
192     if (len > curlen) {
193         addr += curlen;
194         curlen = len - curlen;
195         probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                      cpu_mmu_index(env, false), ra);
197     }
198 }
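
/*
 * The fault-only-first helper below passes at most NF * 8 bytes here
 * (NF <= 8, ESZ <= 8), so the access spans at most two pages and the
 * two probe_access() calls above are sufficient.
 */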
199 
200 /* set agnostic elements to 1s */
201 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
202                               uint32_t tot)
203 {
204     if (is_agnostic == 0) {
205         /* policy undisturbed */
206         return;
207     }
208     if (tot - cnt == 0) {
209         return;
210     }
211     memset(base + cnt, -1, tot - cnt);
212 }
213 
214 static inline void vext_set_elem_mask(void *v0, int index,
215                                       uint8_t value)
216 {
217     int idx = index / 64;
218     int pos = index % 64;
219     uint64_t old = ((uint64_t *)v0)[idx];
220     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
221 }
222 
223 /*
224  * Earlier designs (pre-0.9) had a varying number of bits
225  * per mask value (MLEN). In the 0.9 design, MLEN=1.
226  * (Section 4.5)
227  */
228 static inline int vext_elem_mask(void *v0, int index)
229 {
230     int idx = index / 64;
231     int pos = index % 64;
232     return (((uint64_t *)v0)[idx] >> pos) & 1;
233 }
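
/*
 * For example, the mask bit for element 70 is bit 6 of the second
 * 64-bit word of v0 (idx = 1, pos = 6).
 */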
234 
235 /* elements operations for load and store */
236 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
237                                uint32_t idx, void *vd, uintptr_t retaddr);
238 
239 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
240 static void NAME(CPURISCVState *env, abi_ptr addr,         \
241                  uint32_t idx, void *vd, uintptr_t retaddr)\
242 {                                                          \
243     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
244     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
245 }
246 
247 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
248 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
249 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
250 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
251 
252 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
253 static void NAME(CPURISCVState *env, abi_ptr addr,         \
254                  uint32_t idx, void *vd, uintptr_t retaddr)\
255 {                                                          \
256     ETYPE data = *((ETYPE *)vd + H(idx));                  \
257     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
258 }
259 
260 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
261 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
262 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
263 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
264 
265 /*
266  *** stride: access vector element from strided memory
267  */
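/*
 * For example, a three-field segment load (NF = 3) of 32-bit elements
 * reads field k of element i from base + stride * i + k * 4 and writes
 * it to element slot i + k * max_elems of the destination group.
 */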
268 static void
269 vext_ldst_stride(void *vd, void *v0, target_ulong base,
270                  target_ulong stride, CPURISCVState *env,
271                  uint32_t desc, uint32_t vm,
272                  vext_ldst_elem_fn *ldst_elem,
273                  uint32_t log2_esz, uintptr_t ra)
274 {
275     uint32_t i, k;
276     uint32_t nf = vext_nf(desc);
277     uint32_t max_elems = vext_max_elems(desc, log2_esz);
278     uint32_t esz = 1 << log2_esz;
279     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
280     uint32_t vta = vext_vta(desc);
281 
282     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
283         if (!vm && !vext_elem_mask(v0, i)) {
284             continue;
285         }
286 
287         k = 0;
288         while (k < nf) {
289             target_ulong addr = base + stride * i + (k << log2_esz);
290             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
291             k++;
292         }
293     }
294     env->vstart = 0;
295     /* set tail elements to 1s */
296     for (k = 0; k < nf; ++k) {
297         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
298                           (k * max_elems + max_elems) * esz);
299     }
300     if (nf * max_elems % total_elems != 0) {
301         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
302         uint32_t registers_used =
303             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
304         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
305                           registers_used * vlenb);
306     }
307 }
308 
309 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
310 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
311                   target_ulong stride, CPURISCVState *env,              \
312                   uint32_t desc)                                        \
313 {                                                                       \
314     uint32_t vm = vext_vm(desc);                                        \
315     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
316                      ctzl(sizeof(ETYPE)), GETPC());                     \
317 }
318 
319 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
320 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
321 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
322 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
323 
324 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
325 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
326                   target_ulong stride, CPURISCVState *env,              \
327                   uint32_t desc)                                        \
328 {                                                                       \
329     uint32_t vm = vext_vm(desc);                                        \
330     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
331                      ctzl(sizeof(ETYPE)), GETPC());                     \
332 }
333 
334 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
335 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
336 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
337 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
338 
339 /*
340  *** unit-stride: access elements stored contiguously in memory
341  */
342 
343 /* unmasked unit-stride load and store operation */
344 static void
345 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
346              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
347              uintptr_t ra)
348 {
349     uint32_t i, k;
350     uint32_t nf = vext_nf(desc);
351     uint32_t max_elems = vext_max_elems(desc, log2_esz);
352     uint32_t esz = 1 << log2_esz;
353     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
354     uint32_t vta = vext_vta(desc);
355 
356     /* load bytes from guest memory */
357     for (i = env->vstart; i < evl; i++, env->vstart++) {
358         k = 0;
359         while (k < nf) {
360             target_ulong addr = base + ((i * nf + k) << log2_esz);
361             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
362             k++;
363         }
364     }
365     env->vstart = 0;
366     /* set tail elements to 1s */
367     for (k = 0; k < nf; ++k) {
368         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
369                           (k * max_elems + max_elems) * esz);
370     }
371     if (nf * max_elems % total_elems != 0) {
372         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
373         uint32_t registers_used =
374             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
375         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
376                           registers_used * vlenb);
377     }
378 }
379 
380 /*
381  * Masked unit-stride load and store operations are handled as a special case
382  * of the strided operations, with stride = NF * sizeof(ETYPE).
383  */
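/*
 * For example, a masked vle32.v (NF = 1) below becomes a strided access
 * with stride = 4, i.e. consecutive 32-bit elements.
 */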
384 
385 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
386 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
387                          CPURISCVState *env, uint32_t desc)             \
388 {                                                                       \
389     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
390     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
391                      ctzl(sizeof(ETYPE)), GETPC());                     \
392 }                                                                       \
393                                                                         \
394 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
395                   CPURISCVState *env, uint32_t desc)                    \
396 {                                                                       \
397     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
398                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
399 }
400 
401 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
402 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
403 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
404 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
405 
406 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
407 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
408                          CPURISCVState *env, uint32_t desc)              \
409 {                                                                        \
410     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
411     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
412                      ctzl(sizeof(ETYPE)), GETPC());                      \
413 }                                                                        \
414                                                                          \
415 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
416                   CPURISCVState *env, uint32_t desc)                     \
417 {                                                                        \
418     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
419                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
420 }
421 
422 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
423 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
424 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
425 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
426 
427 /*
428  *** unit stride mask load and store, EEW = 1
429  */
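/*
 * vlm.v/vsm.v transfer ceil(vl / 8) bytes of the mask register; for
 * example vl = 17 gives evl = (17 + 7) >> 3 = 3 bytes.
 */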
430 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
431                     CPURISCVState *env, uint32_t desc)
432 {
433     /* evl = ceil(vl/8) */
434     uint8_t evl = (env->vl + 7) >> 3;
435     vext_ldst_us(vd, base, env, desc, lde_b,
436                  0, evl, GETPC());
437 }
438 
439 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, ste_b,
445                  0, evl, GETPC());
446 }
447 
448 /*
449  *** index: access vector element from indexed memory
450  */
451 typedef target_ulong vext_get_index_addr(target_ulong base,
452         uint32_t idx, void *vs2);
453 
454 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
455 static target_ulong NAME(target_ulong base,            \
456                          uint32_t idx, void *vs2)      \
457 {                                                      \
458     return (base + *((ETYPE *)vs2 + H(idx)));          \
459 }
460 
461 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
462 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
463 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
464 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
465 
466 static inline void
467 vext_ldst_index(void *vd, void *v0, target_ulong base,
468                 void *vs2, CPURISCVState *env, uint32_t desc,
469                 vext_get_index_addr get_index_addr,
470                 vext_ldst_elem_fn *ldst_elem,
471                 uint32_t log2_esz, uintptr_t ra)
472 {
473     uint32_t i, k;
474     uint32_t nf = vext_nf(desc);
475     uint32_t vm = vext_vm(desc);
476     uint32_t max_elems = vext_max_elems(desc, log2_esz);
477     uint32_t esz = 1 << log2_esz;
478     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
479     uint32_t vta = vext_vta(desc);
480 
481     /* load bytes from guest memory */
482     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
483         if (!vm && !vext_elem_mask(v0, i)) {
484             continue;
485         }
486 
487         k = 0;
488         while (k < nf) {
489             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
490             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
491             k++;
492         }
493     }
494     env->vstart = 0;
495     /* set tail elements to 1s */
496     for (k = 0; k < nf; ++k) {
497         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
498                           (k * max_elems + max_elems) * esz);
499     }
500     if (nf * max_elems % total_elems != 0) {
501         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
502         uint32_t registers_used =
503             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
504         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
505                           registers_used * vlenb);
506     }
507 }
508 
509 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
510 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
511                   void *vs2, CPURISCVState *env, uint32_t desc)            \
512 {                                                                          \
513     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
514                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
515 }
516 
517 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
518 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
519 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
520 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
521 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
522 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
523 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
524 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
525 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
526 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
527 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
528 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
529 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
530 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
531 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
532 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
533 
534 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
535 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
536                   void *vs2, CPURISCVState *env, uint32_t desc)  \
537 {                                                                \
538     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
539                     STORE_FN, ctzl(sizeof(ETYPE)),               \
540                     GETPC());                                    \
541 }
542 
543 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
544 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
545 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
546 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
547 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
548 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
549 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
550 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
551 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
552 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
553 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
554 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
555 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
556 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
557 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
558 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
559 
560 /*
561  *** unit-stride fault-only-first load instructions
562  */
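/*
 * vext_ldff() works in two passes: it first probes the pages of every
 * active element and, if an element other than element 0 would fault,
 * truncates vl to that element's index; it then performs the loads for
 * the elements that remain.
 */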
563 static inline void
564 vext_ldff(void *vd, void *v0, target_ulong base,
565           CPURISCVState *env, uint32_t desc,
566           vext_ldst_elem_fn *ldst_elem,
567           uint32_t log2_esz, uintptr_t ra)
568 {
569     void *host;
570     uint32_t i, k, vl = 0;
571     uint32_t nf = vext_nf(desc);
572     uint32_t vm = vext_vm(desc);
573     uint32_t max_elems = vext_max_elems(desc, log2_esz);
574     uint32_t esz = 1 << log2_esz;
575     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
576     uint32_t vta = vext_vta(desc);
577     target_ulong addr, offset, remain;
578 
579     /* probe every access */
580     for (i = env->vstart; i < env->vl; i++) {
581         if (!vm && !vext_elem_mask(v0, i)) {
582             continue;
583         }
584         addr = adjust_addr(env, base + i * (nf << log2_esz));
585         if (i == 0) {
586             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
587         } else {
588             /* if it triggers an exception, no need to check watchpoint */
589             remain = nf << log2_esz;
590             while (remain > 0) {
591                 offset = -(addr | TARGET_PAGE_MASK);
592                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
593                                          cpu_mmu_index(env, false));
594                 if (host) {
595 #ifdef CONFIG_USER_ONLY
596                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
597                         vl = i;
598                         goto ProbeSuccess;
599                     }
600 #else
601                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
602 #endif
603                 } else {
604                     vl = i;
605                     goto ProbeSuccess;
606                 }
607                 if (remain <= offset) {
608                     break;
609                 }
610                 remain -= offset;
611                 addr = adjust_addr(env, addr + offset);
612             }
613         }
614     }
615 ProbeSuccess:
616     /* load bytes from guest memory */
617     if (vl != 0) {
618         env->vl = vl;
619     }
620     for (i = env->vstart; i < env->vl; i++) {
621         k = 0;
622         if (!vm && !vext_elem_mask(v0, i)) {
623             continue;
624         }
625         while (k < nf) {
626             target_ulong addr = base + ((i * nf + k) << log2_esz);
627             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
628             k++;
629         }
630     }
631     env->vstart = 0;
632     /* set tail elements to 1s */
633     for (k = 0; k < nf; ++k) {
634         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
635                           (k * max_elems + max_elems) * esz);
636     }
637     if (nf * max_elems % total_elems != 0) {
638         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
639         uint32_t registers_used =
640             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
641         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
642                           registers_used * vlenb);
643     }
644 }
645 
646 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
647 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
648                   CPURISCVState *env, uint32_t desc)      \
649 {                                                         \
650     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
651               ctzl(sizeof(ETYPE)), GETPC());              \
652 }
653 
654 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
655 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
656 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
657 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
658 
659 #define DO_SWAP(N, M) (M)
660 #define DO_AND(N, M)  (N & M)
661 #define DO_XOR(N, M)  (N ^ M)
662 #define DO_OR(N, M)   (N | M)
663 #define DO_ADD(N, M)  (N + M)
664 
665 /* Signed min/max */
666 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
667 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
668 
669 /* Unsigned min/max */
670 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
671 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
672 
673 /*
674  *** load and store whole register instructions
675  */
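/*
 * For example, vl2re32.v on a VLEN = 128 core has max_elems = 4 per
 * register; restarting with vstart = 5 resumes at element 1 of the
 * second register (k = 1, off = 1).
 */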
676 static void
677 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
678                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
679 {
680     uint32_t i, k, off, pos;
681     uint32_t nf = vext_nf(desc);
682     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
683     uint32_t max_elems = vlenb >> log2_esz;
684 
685     k = env->vstart / max_elems;
686     off = env->vstart % max_elems;
687 
688     if (off) {
689         /* load/store the rest of the segment pointed to by vstart */
690         for (pos = off; pos < max_elems; pos++, env->vstart++) {
691             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
692             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
693         }
694         k++;
695     }
696 
697     /* load/store elements for the rest of the segments */
698     for (; k < nf; k++) {
699         for (i = 0; i < max_elems; i++, env->vstart++) {
700             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
701             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
702         }
703     }
704 
705     env->vstart = 0;
706 }
707 
708 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
709 void HELPER(NAME)(void *vd, target_ulong base,       \
710                   CPURISCVState *env, uint32_t desc) \
711 {                                                    \
712     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
713                     ctzl(sizeof(ETYPE)), GETPC());   \
714 }
715 
716 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
717 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
718 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
719 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
720 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
721 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
722 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
723 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
724 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
725 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
726 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
727 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
728 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
729 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
730 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
731 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
732 
733 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
734 void HELPER(NAME)(void *vd, target_ulong base,       \
735                   CPURISCVState *env, uint32_t desc) \
736 {                                                    \
737     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
738                     ctzl(sizeof(ETYPE)), GETPC());   \
739 }
740 
741 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
742 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
743 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
744 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
745 
746 /*
747  *** Vector Integer Arithmetic Instructions
748  */
749 
750 /* expand macro args before macro */
751 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
752 
753 /* (TD, T1, T2, TX1, TX2) */
754 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
755 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
756 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
757 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
758 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
759 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
760 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
761 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
762 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
763 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
764 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
765 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
766 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
767 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
768 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
769 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
770 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
771 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
772 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
773 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
774 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
775 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
776 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
777 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
778 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
779 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
780 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
781 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
782 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
783 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
784 
785 /* operation of two vector elements */
786 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
787 
788 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
789 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
790 {                                                               \
791     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
792     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
793     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
794 }
795 #define DO_SUB(N, M) (N - M)
796 #define DO_RSUB(N, M) (M - N)
797 
798 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
799 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
800 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
801 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
802 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
803 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
804 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
805 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
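
/*
 * For reference, RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 * above expands to:
 *
 *   static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *   {
 *       int8_t s1 = *((int8_t *)vs1 + H1(i));
 *       int8_t s2 = *((int8_t *)vs2 + H1(i));
 *       *((int8_t *)vd + H1(i)) = s2 + s1;
 *   }
 */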
806 
807 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
808                        CPURISCVState *env, uint32_t desc,
809                        opivv2_fn *fn, uint32_t esz)
810 {
811     uint32_t vm = vext_vm(desc);
812     uint32_t vl = env->vl;
813     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
814     uint32_t vta = vext_vta(desc);
815     uint32_t i;
816 
817     for (i = env->vstart; i < vl; i++) {
818         if (!vm && !vext_elem_mask(v0, i)) {
819             continue;
820         }
821         fn(vd, vs1, vs2, i);
822     }
823     env->vstart = 0;
824     /* set tail elements to 1s */
825     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
826 }
827 
828 /* generate the helpers for OPIVV */
829 #define GEN_VEXT_VV(NAME, ESZ)                            \
830 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
831                   void *vs2, CPURISCVState *env,          \
832                   uint32_t desc)                          \
833 {                                                         \
834     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
835                do_##NAME, ESZ);                           \
836 }
837 
838 GEN_VEXT_VV(vadd_vv_b, 1)
839 GEN_VEXT_VV(vadd_vv_h, 2)
840 GEN_VEXT_VV(vadd_vv_w, 4)
841 GEN_VEXT_VV(vadd_vv_d, 8)
842 GEN_VEXT_VV(vsub_vv_b, 1)
843 GEN_VEXT_VV(vsub_vv_h, 2)
844 GEN_VEXT_VV(vsub_vv_w, 4)
845 GEN_VEXT_VV(vsub_vv_d, 8)
846 
847 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
848 
849 /*
850  * (T1)s1 gives the real operand type.
851  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
852  */
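/*
 * For example, for vwadd_vx_b (WOP_SSS_B) T1 is int8_t and TX1 is
 * int16_t, so (TX1)(T1)s1 truncates the scalar to 8 bits and then
 * sign-extends it to the 16-bit operand width.
 */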
853 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
854 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
855 {                                                                   \
856     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
857     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
858 }
859 
860 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
861 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
862 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
863 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
864 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
865 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
866 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
867 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
868 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
869 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
870 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
871 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
872 
873 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
874                        CPURISCVState *env, uint32_t desc,
875                        opivx2_fn fn, uint32_t esz)
876 {
877     uint32_t vm = vext_vm(desc);
878     uint32_t vl = env->vl;
879     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
880     uint32_t vta = vext_vta(desc);
881     uint32_t i;
882 
883     for (i = env->vstart; i < vl; i++) {
884         if (!vm && !vext_elem_mask(v0, i)) {
885             continue;
886         }
887         fn(vd, s1, vs2, i);
888     }
889     env->vstart = 0;
890     /* set tail elements to 1s */
891     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
892 }
893 
894 /* generate the helpers for OPIVX */
895 #define GEN_VEXT_VX(NAME, ESZ)                            \
896 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
897                   void *vs2, CPURISCVState *env,          \
898                   uint32_t desc)                          \
899 {                                                         \
900     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
901                do_##NAME, ESZ);                           \
902 }
903 
904 GEN_VEXT_VX(vadd_vx_b, 1)
905 GEN_VEXT_VX(vadd_vx_h, 2)
906 GEN_VEXT_VX(vadd_vx_w, 4)
907 GEN_VEXT_VX(vadd_vx_d, 8)
908 GEN_VEXT_VX(vsub_vx_b, 1)
909 GEN_VEXT_VX(vsub_vx_h, 2)
910 GEN_VEXT_VX(vsub_vx_w, 4)
911 GEN_VEXT_VX(vsub_vx_d, 8)
912 GEN_VEXT_VX(vrsub_vx_b, 1)
913 GEN_VEXT_VX(vrsub_vx_h, 2)
914 GEN_VEXT_VX(vrsub_vx_w, 4)
915 GEN_VEXT_VX(vrsub_vx_d, 8)
916 
917 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
918 {
919     intptr_t oprsz = simd_oprsz(desc);
920     intptr_t i;
921 
922     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
923         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
924     }
925 }
926 
927 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
928 {
929     intptr_t oprsz = simd_oprsz(desc);
930     intptr_t i;
931 
932     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
933         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
934     }
935 }
936 
937 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
938 {
939     intptr_t oprsz = simd_oprsz(desc);
940     intptr_t i;
941 
942     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
943         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
944     }
945 }
946 
947 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
948 {
949     intptr_t oprsz = simd_oprsz(desc);
950     intptr_t i;
951 
952     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
953         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
954     }
955 }
956 
957 /* Vector Widening Integer Add/Subtract */
958 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
959 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
960 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
961 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
962 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
963 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
964 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
965 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
966 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
967 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
968 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
969 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
970 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
971 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
972 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
973 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
974 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
975 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
976 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
977 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
978 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
979 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
980 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
981 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
982 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
983 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
984 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
985 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
986 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
987 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
988 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
989 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
990 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
991 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
992 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
993 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
994 GEN_VEXT_VV(vwaddu_vv_b, 2)
995 GEN_VEXT_VV(vwaddu_vv_h, 4)
996 GEN_VEXT_VV(vwaddu_vv_w, 8)
997 GEN_VEXT_VV(vwsubu_vv_b, 2)
998 GEN_VEXT_VV(vwsubu_vv_h, 4)
999 GEN_VEXT_VV(vwsubu_vv_w, 8)
1000 GEN_VEXT_VV(vwadd_vv_b, 2)
1001 GEN_VEXT_VV(vwadd_vv_h, 4)
1002 GEN_VEXT_VV(vwadd_vv_w, 8)
1003 GEN_VEXT_VV(vwsub_vv_b, 2)
1004 GEN_VEXT_VV(vwsub_vv_h, 4)
1005 GEN_VEXT_VV(vwsub_vv_w, 8)
1006 GEN_VEXT_VV(vwaddu_wv_b, 2)
1007 GEN_VEXT_VV(vwaddu_wv_h, 4)
1008 GEN_VEXT_VV(vwaddu_wv_w, 8)
1009 GEN_VEXT_VV(vwsubu_wv_b, 2)
1010 GEN_VEXT_VV(vwsubu_wv_h, 4)
1011 GEN_VEXT_VV(vwsubu_wv_w, 8)
1012 GEN_VEXT_VV(vwadd_wv_b, 2)
1013 GEN_VEXT_VV(vwadd_wv_h, 4)
1014 GEN_VEXT_VV(vwadd_wv_w, 8)
1015 GEN_VEXT_VV(vwsub_wv_b, 2)
1016 GEN_VEXT_VV(vwsub_wv_h, 4)
1017 GEN_VEXT_VV(vwsub_wv_w, 8)
1018 
1019 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1020 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1021 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1022 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1023 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1024 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1025 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1026 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1027 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1028 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1029 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1030 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1031 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1032 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1033 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1034 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1035 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1036 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1037 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1038 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1039 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1040 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1041 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1042 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1043 GEN_VEXT_VX(vwaddu_vx_b, 2)
1044 GEN_VEXT_VX(vwaddu_vx_h, 4)
1045 GEN_VEXT_VX(vwaddu_vx_w, 8)
1046 GEN_VEXT_VX(vwsubu_vx_b, 2)
1047 GEN_VEXT_VX(vwsubu_vx_h, 4)
1048 GEN_VEXT_VX(vwsubu_vx_w, 8)
1049 GEN_VEXT_VX(vwadd_vx_b, 2)
1050 GEN_VEXT_VX(vwadd_vx_h, 4)
1051 GEN_VEXT_VX(vwadd_vx_w, 8)
1052 GEN_VEXT_VX(vwsub_vx_b, 2)
1053 GEN_VEXT_VX(vwsub_vx_h, 4)
1054 GEN_VEXT_VX(vwsub_vx_w, 8)
1055 GEN_VEXT_VX(vwaddu_wx_b, 2)
1056 GEN_VEXT_VX(vwaddu_wx_h, 4)
1057 GEN_VEXT_VX(vwaddu_wx_w, 8)
1058 GEN_VEXT_VX(vwsubu_wx_b, 2)
1059 GEN_VEXT_VX(vwsubu_wx_h, 4)
1060 GEN_VEXT_VX(vwsubu_wx_w, 8)
1061 GEN_VEXT_VX(vwadd_wx_b, 2)
1062 GEN_VEXT_VX(vwadd_wx_h, 4)
1063 GEN_VEXT_VX(vwadd_wx_w, 8)
1064 GEN_VEXT_VX(vwsub_wx_b, 2)
1065 GEN_VEXT_VX(vwsub_wx_h, 4)
1066 GEN_VEXT_VX(vwsub_wx_w, 8)
1067 
1068 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1069 #define DO_VADC(N, M, C) (N + M + C)
1070 #define DO_VSBC(N, M, C) (N - M - C)
1071 
1072 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1073 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1074                   CPURISCVState *env, uint32_t desc)          \
1075 {                                                             \
1076     uint32_t vl = env->vl;                                    \
1077     uint32_t esz = sizeof(ETYPE);                             \
1078     uint32_t total_elems =                                    \
1079         vext_get_total_elems(env, desc, esz);                 \
1080     uint32_t vta = vext_vta(desc);                            \
1081     uint32_t i;                                               \
1082                                                               \
1083     for (i = env->vstart; i < vl; i++) {                      \
1084         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1085         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1086         ETYPE carry = vext_elem_mask(v0, i);                  \
1087                                                               \
1088         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1089     }                                                         \
1090     env->vstart = 0;                                          \
1091     /* set tail elements to 1s */                             \
1092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1093 }
1094 
1095 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1096 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1097 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1098 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1099 
1100 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1101 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1102 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1103 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1104 
1105 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1106 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1107                   CPURISCVState *env, uint32_t desc)                     \
1108 {                                                                        \
1109     uint32_t vl = env->vl;                                               \
1110     uint32_t esz = sizeof(ETYPE);                                        \
1111     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1112     uint32_t vta = vext_vta(desc);                                       \
1113     uint32_t i;                                                          \
1114                                                                          \
1115     for (i = env->vstart; i < vl; i++) {                                 \
1116         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1117         ETYPE carry = vext_elem_mask(v0, i);                             \
1118                                                                          \
1119         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1120     }                                                                    \
1121     env->vstart = 0;                                          \
1122     /* set tail elements to 1s */                                        \
1123     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1124 }
1125 
1126 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1127 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1128 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1129 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1130 
1131 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1132 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1133 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1134 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1135 
1136 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1137                           (__typeof(N))(N + M) < N)
1138 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
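
/*
 * DO_MADC computes the carry-out of the unsigned sum N + M + C via the
 * wraparound test: with no carry-in the truncated sum is below N exactly
 * when a carry occurred; with a carry-in the result can also equal N,
 * hence the <= comparison.  DO_MSBC is the borrow-out of N - M - C:
 * a borrow occurs iff N < M + C.
 */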
1139 
1140 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1141 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1142                   CPURISCVState *env, uint32_t desc)          \
1143 {                                                             \
1144     uint32_t vl = env->vl;                                    \
1145     uint32_t vm = vext_vm(desc);                              \
1146     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1147     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1148     uint32_t i;                                               \
1149                                                               \
1150     for (i = env->vstart; i < vl; i++) {                      \
1151         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1152         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1153         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1154         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1155     }                                                         \
1156     env->vstart = 0;                                          \
1157     /* mask destination register is always tail-agnostic */   \
1158     /* set tail elements to 1s */                             \
1159     if (vta_all_1s) {                                         \
1160         for (; i < total_elems; i++) {                        \
1161             vext_set_elem_mask(vd, i, 1);                     \
1162         }                                                     \
1163     }                                                         \
1164 }
1165 
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1167 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1168 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1169 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1170 
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1172 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1173 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1174 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1175 
1176 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1177 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1178                   void *vs2, CPURISCVState *env, uint32_t desc) \
1179 {                                                               \
1180     uint32_t vl = env->vl;                                      \
1181     uint32_t vm = vext_vm(desc);                                \
1182     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1183     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1184     uint32_t i;                                                 \
1185                                                                 \
1186     for (i = env->vstart; i < vl; i++) {                        \
1187         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1188         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1189         vext_set_elem_mask(vd, i,                               \
1190                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1191     }                                                           \
1192     env->vstart = 0;                                            \
1193     /* mask destination register is always tail-agnostic */     \
1194     /* set tail elements to 1s */                               \
1195     if (vta_all_1s) {                                           \
1196         for (; i < total_elems; i++) {                          \
1197             vext_set_elem_mask(vd, i, 1);                       \
1198         }                                                       \
1199     }                                                           \
1200 }
1201 
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1204 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1205 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1206 
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1209 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1210 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1211 
1212 /* Vector Bitwise Logical Instructions */
1213 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1215 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1216 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1217 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1219 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1220 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1221 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1223 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1224 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1225 GEN_VEXT_VV(vand_vv_b, 1)
1226 GEN_VEXT_VV(vand_vv_h, 2)
1227 GEN_VEXT_VV(vand_vv_w, 4)
1228 GEN_VEXT_VV(vand_vv_d, 8)
1229 GEN_VEXT_VV(vor_vv_b, 1)
1230 GEN_VEXT_VV(vor_vv_h, 2)
1231 GEN_VEXT_VV(vor_vv_w, 4)
1232 GEN_VEXT_VV(vor_vv_d, 8)
1233 GEN_VEXT_VV(vxor_vv_b, 1)
1234 GEN_VEXT_VV(vxor_vv_h, 2)
1235 GEN_VEXT_VV(vxor_vv_w, 4)
1236 GEN_VEXT_VV(vxor_vv_d, 8)
1237 
1238 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1240 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1241 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1242 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1244 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1245 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1246 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1248 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1249 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1250 GEN_VEXT_VX(vand_vx_b, 1)
1251 GEN_VEXT_VX(vand_vx_h, 2)
1252 GEN_VEXT_VX(vand_vx_w, 4)
1253 GEN_VEXT_VX(vand_vx_d, 8)
1254 GEN_VEXT_VX(vor_vx_b, 1)
1255 GEN_VEXT_VX(vor_vx_h, 2)
1256 GEN_VEXT_VX(vor_vx_w, 4)
1257 GEN_VEXT_VX(vor_vx_d, 8)
1258 GEN_VEXT_VX(vxor_vx_b, 1)
1259 GEN_VEXT_VX(vxor_vx_h, 2)
1260 GEN_VEXT_VX(vxor_vx_w, 4)
1261 GEN_VEXT_VX(vxor_vx_d, 8)
1262 
1263 /* Vector Single-Width Bit Shift Instructions */
1264 #define DO_SLL(N, M)  (N << (M))
1265 #define DO_SRL(N, M)  (N >> (M))
1266 
1267 /* generate the helpers for shift instructions with two vector operands */
1268 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1269 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1270                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1271 {                                                                         \
1272     uint32_t vm = vext_vm(desc);                                          \
1273     uint32_t vl = env->vl;                                                \
1274     uint32_t i;                                                           \
1275                                                                           \
1276     for (i = env->vstart; i < vl; i++) {                                  \
1277         if (!vm && !vext_elem_mask(v0, i)) {                              \
1278             continue;                                                     \
1279         }                                                                 \
1280         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1281         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1282         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1283     }                                                                     \
1284     env->vstart = 0;                                                      \
1285 }
1286 
1287 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1288 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1289 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1290 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1291 
1292 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1293 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1294 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1295 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1296 
1297 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1298 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1299 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1300 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1301 
1302 /* generate the helpers for shift instructions with one vector and one scalar */
1303 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1304 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1305         void *vs2, CPURISCVState *env, uint32_t desc)       \
1306 {                                                           \
1307     uint32_t vm = vext_vm(desc);                            \
1308     uint32_t vl = env->vl;                                  \
1309     uint32_t i;                                             \
1310                                                             \
1311     for (i = env->vstart; i < vl; i++) {                    \
1312         if (!vm && !vext_elem_mask(v0, i)) {                \
1313             continue;                                       \
1314         }                                                   \
1315         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1316         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1317     }                                                       \
1318     env->vstart = 0;                                        \
1319 }
1320 
1321 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1322 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1323 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1324 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1325 
1326 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1327 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1328 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1329 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1330 
1331 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1332 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1333 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1334 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1335 
1336 /* Vector Narrowing Integer Right Shift Instructions */
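/*
 * The narrowing shifts reuse GEN_VEXT_SHIFT_VV/VX with a 2*SEW-wide source
 * type (TS2) and a SEW-wide destination type: the shift is performed at
 * 2*SEW, the shift amount is masked to log2(2*SEW) bits, and the result is
 * truncated to SEW on the store.
 */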
1337 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1338 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1339 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1340 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1341 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1342 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1343 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1346 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1347 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1348 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1349 
1350 /* Vector Integer Comparison Instructions */
1351 #define DO_MSEQ(N, M) (N == M)
1352 #define DO_MSNE(N, M) (N != M)
1353 #define DO_MSLT(N, M) (N < M)
1354 #define DO_MSLE(N, M) (N <= M)
1355 #define DO_MSGT(N, M) (N > M)
1356 
1357 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1358 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1359                   CPURISCVState *env, uint32_t desc)          \
1360 {                                                             \
1361     uint32_t vm = vext_vm(desc);                              \
1362     uint32_t vl = env->vl;                                    \
1363     uint32_t i;                                               \
1364                                                               \
1365     for (i = env->vstart; i < vl; i++) {                      \
1366         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1367         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1368         if (!vm && !vext_elem_mask(v0, i)) {                  \
1369             continue;                                         \
1370         }                                                     \
1371         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1372     }                                                         \
1373     env->vstart = 0;                                          \
1374 }
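/*
 * Comparisons write one mask bit per element via vext_set_elem_mask().
 * Elements masked off by v0 are skipped, so their destination bits are
 * left unchanged.  Note the operand order, DO_OP(s2, s1): e.g. vmslt_vv
 * sets the bit when vs2[i] < vs1[i].
 */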
1375 
1376 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1377 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1378 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1379 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1380 
1381 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1382 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1383 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1384 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1385 
1386 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1387 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1388 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1389 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1390 
1391 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1392 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1393 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1394 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1395 
1396 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1397 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1398 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1399 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1400 
1401 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1402 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1403 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1404 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1405 
1406 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1407 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1408                   CPURISCVState *env, uint32_t desc)                \
1409 {                                                                   \
1410     uint32_t vm = vext_vm(desc);                                    \
1411     uint32_t vl = env->vl;                                          \
1412     uint32_t i;                                                     \
1413                                                                     \
1414     for (i = env->vstart; i < vl; i++) {                            \
1415         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1416         if (!vm && !vext_elem_mask(v0, i)) {                        \
1417             continue;                                               \
1418         }                                                           \
1419         vext_set_elem_mask(vd, i,                                   \
1420                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1421     }                                                               \
1422     env->vstart = 0;                                                \
1423 }
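/*
 * The scalar operand is sign-extended through target_long and then
 * converted to the element type, matching the rule that rs1 is
 * sign-extended to SEW bits; for the unsigned comparisons the resulting
 * bit pattern is simply reinterpreted as unsigned.
 */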
1424 
1425 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1426 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1427 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1428 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1429 
1430 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1431 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1432 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1433 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1434 
1435 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1436 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1437 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1438 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1439 
1440 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1441 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1442 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1443 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1444 
1445 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1446 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1447 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1448 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1449 
1450 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1451 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1452 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1453 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1454 
1455 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1456 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1457 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1458 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1459 
1460 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1461 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1462 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1463 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1464 
1465 /* Vector Integer Min/Max Instructions */
1466 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1467 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1468 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1469 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1470 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1471 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1472 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1473 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1474 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1475 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1476 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1477 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1478 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1479 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1480 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1481 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1482 GEN_VEXT_VV(vminu_vv_b, 1)
1483 GEN_VEXT_VV(vminu_vv_h, 2)
1484 GEN_VEXT_VV(vminu_vv_w, 4)
1485 GEN_VEXT_VV(vminu_vv_d, 8)
1486 GEN_VEXT_VV(vmin_vv_b, 1)
1487 GEN_VEXT_VV(vmin_vv_h, 2)
1488 GEN_VEXT_VV(vmin_vv_w, 4)
1489 GEN_VEXT_VV(vmin_vv_d, 8)
1490 GEN_VEXT_VV(vmaxu_vv_b, 1)
1491 GEN_VEXT_VV(vmaxu_vv_h, 2)
1492 GEN_VEXT_VV(vmaxu_vv_w, 4)
1493 GEN_VEXT_VV(vmaxu_vv_d, 8)
1494 GEN_VEXT_VV(vmax_vv_b, 1)
1495 GEN_VEXT_VV(vmax_vv_h, 2)
1496 GEN_VEXT_VV(vmax_vv_w, 4)
1497 GEN_VEXT_VV(vmax_vv_d, 8)
1498 
1499 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1500 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1501 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1502 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1503 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1504 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1505 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1506 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1507 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1508 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1509 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1510 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1511 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1512 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1513 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1514 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1515 GEN_VEXT_VX(vminu_vx_b, 1)
1516 GEN_VEXT_VX(vminu_vx_h, 2)
1517 GEN_VEXT_VX(vminu_vx_w, 4)
1518 GEN_VEXT_VX(vminu_vx_d, 8)
1519 GEN_VEXT_VX(vmin_vx_b, 1)
1520 GEN_VEXT_VX(vmin_vx_h, 2)
1521 GEN_VEXT_VX(vmin_vx_w, 4)
1522 GEN_VEXT_VX(vmin_vx_d, 8)
1523 GEN_VEXT_VX(vmaxu_vx_b, 1)
1524 GEN_VEXT_VX(vmaxu_vx_h, 2)
1525 GEN_VEXT_VX(vmaxu_vx_w, 4)
1526 GEN_VEXT_VX(vmaxu_vx_d, 8)
1527 GEN_VEXT_VX(vmax_vx_b, 1)
1528 GEN_VEXT_VX(vmax_vx_h, 2)
1529 GEN_VEXT_VX(vmax_vx_w, 4)
1530 GEN_VEXT_VX(vmax_vx_d, 8)
1531 
1532 /* Vector Single-Width Integer Multiply Instructions */
1533 #define DO_MUL(N, M) (N * M)
1534 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1535 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1536 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1537 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1538 GEN_VEXT_VV(vmul_vv_b, 1)
1539 GEN_VEXT_VV(vmul_vv_h, 2)
1540 GEN_VEXT_VV(vmul_vv_w, 4)
1541 GEN_VEXT_VV(vmul_vv_d, 8)
1542 
1543 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1544 {
1545     return (int16_t)s2 * (int16_t)s1 >> 8;
1546 }
1547 
1548 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1549 {
1550     return (int32_t)s2 * (int32_t)s1 >> 16;
1551 }
1552 
1553 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1554 {
1555     return (int64_t)s2 * (int64_t)s1 >> 32;
1556 }
1557 
1558 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1559 {
1560     uint64_t hi_64, lo_64;
1561 
1562     muls64(&lo_64, &hi_64, s1, s2);
1563     return hi_64;
1564 }
1565 
1566 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1567 {
1568     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1569 }
1570 
1571 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1572 {
1573     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1574 }
1575 
1576 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1577 {
1578     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1579 }
1580 
1581 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1582 {
1583     uint64_t hi_64, lo_64;
1584 
1585     mulu64(&lo_64, &hi_64, s2, s1);
1586     return hi_64;
1587 }
1588 
1589 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1590 {
1591     return (int16_t)s2 * (uint16_t)s1 >> 8;
1592 }
1593 
1594 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1595 {
1596     return (int32_t)s2 * (uint32_t)s1 >> 16;
1597 }
1598 
1599 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1600 {
1601     return (int64_t)s2 * (uint64_t)s1 >> 32;
1602 }
1603 
1604 /*
1605  * Signed * unsigned high-part multiply, derived from the unsigned
1606  * product:
1607  *
1608  * Let  A = signed operand (s2),
1609  *      B = unsigned operand (s1),
1610  *      P = mulu64(A, B), the 128-bit product of the unsigned
1611  *          reinterpretation of A with B.
1612  *
1613  * If A < 0, the unsigned reinterpretation of A is A + 2 ** 64, so
1614  *      P  = (A + 2 ** 64) * B = A * B + 2 ** 64 * B
1615  * and the signed product is
1616  *      SP = A * B = P - 2 ** 64 * B,
1617  * i.e. only the upper 64 bits differ:  HI(SP) = HI(P) - B.
1618  * If A >= 0, SP = P.
1619  *
1620  * Hence:  HI_P -= (A < 0 ? B : 0)
1621  */
1622 
1623 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1624 {
1625     uint64_t hi_64, lo_64;
1626 
1627     mulu64(&lo_64, &hi_64, s2, s1);
1628 
1629     hi_64 -= s2 < 0 ? s1 : 0;
1630     return hi_64;
1631 }
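/*
 * 8-bit illustration of the identity above: s2 = -1 (0xff unsigned),
 * s1 = 2.  The unsigned product is 0xff * 2 = 0x01fe, so its high byte
 * is 0x01; the signed product is -2 = 0xfffe, whose high byte is
 * 0xff = -1 = 0x01 - s1.
 */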
1632 
1633 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1634 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1635 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1636 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1637 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1638 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1639 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1640 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1641 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1642 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1643 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1644 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1645 GEN_VEXT_VV(vmulh_vv_b, 1)
1646 GEN_VEXT_VV(vmulh_vv_h, 2)
1647 GEN_VEXT_VV(vmulh_vv_w, 4)
1648 GEN_VEXT_VV(vmulh_vv_d, 8)
1649 GEN_VEXT_VV(vmulhu_vv_b, 1)
1650 GEN_VEXT_VV(vmulhu_vv_h, 2)
1651 GEN_VEXT_VV(vmulhu_vv_w, 4)
1652 GEN_VEXT_VV(vmulhu_vv_d, 8)
1653 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1654 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1655 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1656 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1657 
1658 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1659 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1660 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1661 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1662 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1663 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1664 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1665 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1666 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1667 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1668 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1669 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1670 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1671 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1672 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1673 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1674 GEN_VEXT_VX(vmul_vx_b, 1)
1675 GEN_VEXT_VX(vmul_vx_h, 2)
1676 GEN_VEXT_VX(vmul_vx_w, 4)
1677 GEN_VEXT_VX(vmul_vx_d, 8)
1678 GEN_VEXT_VX(vmulh_vx_b, 1)
1679 GEN_VEXT_VX(vmulh_vx_h, 2)
1680 GEN_VEXT_VX(vmulh_vx_w, 4)
1681 GEN_VEXT_VX(vmulh_vx_d, 8)
1682 GEN_VEXT_VX(vmulhu_vx_b, 1)
1683 GEN_VEXT_VX(vmulhu_vx_h, 2)
1684 GEN_VEXT_VX(vmulhu_vx_w, 4)
1685 GEN_VEXT_VX(vmulhu_vx_d, 8)
1686 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1687 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1688 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1689 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1690 
1691 /* Vector Integer Divide Instructions */
1692 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1693 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1694 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1695         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1696 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1697         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
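/*
 * These implement the RISC-V division semantics: dividing by zero yields
 * all ones for div/divu and the dividend for rem/remu, and signed overflow
 * (INT_MIN / -1) yields INT_MIN for div and 0 for rem.  The (N == -N) test
 * is true only for N == 0 or N == INT_MIN; paired with M == -1 it selects
 * the overflow case, and is harmless for N == 0 since 0 / -1 and 0 % -1
 * give the same results as the fall-through.
 */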
1698 
1699 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1700 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1701 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1702 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1703 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1704 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1705 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1706 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1707 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1708 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1709 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1710 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1711 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1712 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1713 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1714 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1715 GEN_VEXT_VV(vdivu_vv_b, 1)
1716 GEN_VEXT_VV(vdivu_vv_h, 2)
1717 GEN_VEXT_VV(vdivu_vv_w, 4)
1718 GEN_VEXT_VV(vdivu_vv_d, 8)
1719 GEN_VEXT_VV(vdiv_vv_b, 1)
1720 GEN_VEXT_VV(vdiv_vv_h, 2)
1721 GEN_VEXT_VV(vdiv_vv_w, 4)
1722 GEN_VEXT_VV(vdiv_vv_d, 8)
1723 GEN_VEXT_VV(vremu_vv_b, 1)
1724 GEN_VEXT_VV(vremu_vv_h, 2)
1725 GEN_VEXT_VV(vremu_vv_w, 4)
1726 GEN_VEXT_VV(vremu_vv_d, 8)
1727 GEN_VEXT_VV(vrem_vv_b, 1)
1728 GEN_VEXT_VV(vrem_vv_h, 2)
1729 GEN_VEXT_VV(vrem_vv_w, 4)
1730 GEN_VEXT_VV(vrem_vv_d, 8)
1731 
1732 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1733 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1734 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1735 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1736 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1737 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1738 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1739 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1740 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1741 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1742 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1743 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1744 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1745 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1746 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1747 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1748 GEN_VEXT_VX(vdivu_vx_b, 1)
1749 GEN_VEXT_VX(vdivu_vx_h, 2)
1750 GEN_VEXT_VX(vdivu_vx_w, 4)
1751 GEN_VEXT_VX(vdivu_vx_d, 8)
1752 GEN_VEXT_VX(vdiv_vx_b, 1)
1753 GEN_VEXT_VX(vdiv_vx_h, 2)
1754 GEN_VEXT_VX(vdiv_vx_w, 4)
1755 GEN_VEXT_VX(vdiv_vx_d, 8)
1756 GEN_VEXT_VX(vremu_vx_b, 1)
1757 GEN_VEXT_VX(vremu_vx_h, 2)
1758 GEN_VEXT_VX(vremu_vx_w, 4)
1759 GEN_VEXT_VX(vremu_vx_d, 8)
1760 GEN_VEXT_VX(vrem_vx_b, 1)
1761 GEN_VEXT_VX(vrem_vx_h, 2)
1762 GEN_VEXT_VX(vrem_vx_w, 4)
1763 GEN_VEXT_VX(vrem_vx_d, 8)
1764 
1765 /* Vector Widening Integer Multiply Instructions */
1766 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1767 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1768 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1769 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1770 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1771 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1772 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1773 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1774 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1775 GEN_VEXT_VV(vwmul_vv_b, 2)
1776 GEN_VEXT_VV(vwmul_vv_h, 4)
1777 GEN_VEXT_VV(vwmul_vv_w, 8)
1778 GEN_VEXT_VV(vwmulu_vv_b, 2)
1779 GEN_VEXT_VV(vwmulu_vv_h, 4)
1780 GEN_VEXT_VV(vwmulu_vv_w, 8)
1781 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1782 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1783 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1784 
1785 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1786 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1787 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1788 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1789 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1790 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1791 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1792 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1793 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1794 GEN_VEXT_VX(vwmul_vx_b, 2)
1795 GEN_VEXT_VX(vwmul_vx_h, 4)
1796 GEN_VEXT_VX(vwmul_vx_w, 8)
1797 GEN_VEXT_VX(vwmulu_vx_b, 2)
1798 GEN_VEXT_VX(vwmulu_vx_h, 4)
1799 GEN_VEXT_VX(vwmulu_vx_w, 8)
1800 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1801 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1802 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1803 
1804 /* Vector Single-Width Integer Multiply-Add Instructions */
1805 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1806 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1807 {                                                                  \
1808     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1809     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1810     TD d = *((TD *)vd + HD(i));                                    \
1811     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1812 }
1813 
1814 #define DO_MACC(N, M, D) (M * N + D)
1815 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1816 #define DO_MADD(N, M, D) (M * D + N)
1817 #define DO_NMSUB(N, M, D) (-(M * D) + N)
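/*
 * In these macros N is the vs2 element, M is the vs1 element (or the
 * scalar rs1) and D is the current vd element, so:
 *   DO_MACC/DO_NMSAC:  vd[i] = +/-(vs1[i] * vs2[i]) + vd[i]
 *   DO_MADD/DO_NMSUB:  vd[i] = +/-(vs1[i] * vd[i])  + vs2[i]
 * i.e. vmacc/vnmsac overwrite the addend while vmadd/vnmsub overwrite a
 * multiplicand.
 */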
1818 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1819 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1820 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1821 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1822 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1823 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1824 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1825 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1826 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1827 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1828 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1829 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1830 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1831 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1832 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1833 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1834 GEN_VEXT_VV(vmacc_vv_b, 1)
1835 GEN_VEXT_VV(vmacc_vv_h, 2)
1836 GEN_VEXT_VV(vmacc_vv_w, 4)
1837 GEN_VEXT_VV(vmacc_vv_d, 8)
1838 GEN_VEXT_VV(vnmsac_vv_b, 1)
1839 GEN_VEXT_VV(vnmsac_vv_h, 2)
1840 GEN_VEXT_VV(vnmsac_vv_w, 4)
1841 GEN_VEXT_VV(vnmsac_vv_d, 8)
1842 GEN_VEXT_VV(vmadd_vv_b, 1)
1843 GEN_VEXT_VV(vmadd_vv_h, 2)
1844 GEN_VEXT_VV(vmadd_vv_w, 4)
1845 GEN_VEXT_VV(vmadd_vv_d, 8)
1846 GEN_VEXT_VV(vnmsub_vv_b, 1)
1847 GEN_VEXT_VV(vnmsub_vv_h, 2)
1848 GEN_VEXT_VV(vnmsub_vv_w, 4)
1849 GEN_VEXT_VV(vnmsub_vv_d, 8)
1850 
1851 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1852 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1853 {                                                                   \
1854     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1855     TD d = *((TD *)vd + HD(i));                                     \
1856     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1857 }
1858 
1859 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1860 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1861 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1862 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1863 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1864 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1865 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1866 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1867 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1868 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1869 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1870 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1871 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1872 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1873 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1874 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1875 GEN_VEXT_VX(vmacc_vx_b, 1)
1876 GEN_VEXT_VX(vmacc_vx_h, 2)
1877 GEN_VEXT_VX(vmacc_vx_w, 4)
1878 GEN_VEXT_VX(vmacc_vx_d, 8)
1879 GEN_VEXT_VX(vnmsac_vx_b, 1)
1880 GEN_VEXT_VX(vnmsac_vx_h, 2)
1881 GEN_VEXT_VX(vnmsac_vx_w, 4)
1882 GEN_VEXT_VX(vnmsac_vx_d, 8)
1883 GEN_VEXT_VX(vmadd_vx_b, 1)
1884 GEN_VEXT_VX(vmadd_vx_h, 2)
1885 GEN_VEXT_VX(vmadd_vx_w, 4)
1886 GEN_VEXT_VX(vmadd_vx_d, 8)
1887 GEN_VEXT_VX(vnmsub_vx_b, 1)
1888 GEN_VEXT_VX(vnmsub_vx_h, 2)
1889 GEN_VEXT_VX(vnmsub_vx_w, 4)
1890 GEN_VEXT_VX(vnmsub_vx_d, 8)
1891 
1892 /* Vector Widening Integer Multiply-Add Instructions */
1893 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1894 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1895 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1896 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1897 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1898 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1899 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1900 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1901 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1902 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1903 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1904 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1905 GEN_VEXT_VV(vwmacc_vv_b, 2)
1906 GEN_VEXT_VV(vwmacc_vv_h, 4)
1907 GEN_VEXT_VV(vwmacc_vv_w, 8)
1908 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1909 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1910 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1911 
1912 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1913 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1914 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1915 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1916 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1917 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1918 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1919 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1920 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1921 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1922 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1923 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1924 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1925 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1926 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1927 GEN_VEXT_VX(vwmacc_vx_b, 2)
1928 GEN_VEXT_VX(vwmacc_vx_h, 4)
1929 GEN_VEXT_VX(vwmacc_vx_w, 8)
1930 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1931 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1932 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1933 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1934 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1935 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1936 
1937 /* Vector Integer Merge and Move Instructions */
1938 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1939 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1940                   uint32_t desc)                                     \
1941 {                                                                    \
1942     uint32_t vl = env->vl;                                           \
1943     uint32_t i;                                                      \
1944                                                                      \
1945     for (i = env->vstart; i < vl; i++) {                             \
1946         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1947         *((ETYPE *)vd + H(i)) = s1;                                  \
1948     }                                                                \
1949     env->vstart = 0;                                                 \
1950 }
1951 
1952 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1953 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1954 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1955 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1956 
1957 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1958 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1959                   uint32_t desc)                                     \
1960 {                                                                    \
1961     uint32_t vl = env->vl;                                           \
1962     uint32_t i;                                                      \
1963                                                                      \
1964     for (i = env->vstart; i < vl; i++) {                             \
1965         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1966     }                                                                \
1967     env->vstart = 0;                                                 \
1968 }
1969 
1970 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1971 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1972 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1973 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1974 
1975 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1976 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1977                   CPURISCVState *env, uint32_t desc)                 \
1978 {                                                                    \
1979     uint32_t vl = env->vl;                                           \
1980     uint32_t i;                                                      \
1981                                                                      \
1982     for (i = env->vstart; i < vl; i++) {                             \
1983         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1984         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1985     }                                                                \
1986     env->vstart = 0;                                                 \
1987 }
1988 
1989 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1990 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1991 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1992 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1993 
1994 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1995 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1996                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1997 {                                                                    \
1998     uint32_t vl = env->vl;                                           \
1999     uint32_t i;                                                      \
2000                                                                      \
2001     for (i = env->vstart; i < vl; i++) {                             \
2002         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2003         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2004                    (ETYPE)(target_long)s1);                          \
2005         *((ETYPE *)vd + H(i)) = d;                                   \
2006     }                                                                \
2007     env->vstart = 0;                                                 \
2008 }
2009 
2010 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2011 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2012 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2013 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2014 
2015 /*
2016  *** Vector Fixed-Point Arithmetic Instructions
2017  */
2018 
2019 /* Vector Single-Width Saturating Add and Subtract */
2020 
2021 /*
2022  * Fixed-point instructions take a rounding mode and may saturate,
2023  * so define the common fixed-point helper macros here.
2024  */
2025 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2026                           CPURISCVState *env, int vxrm);
2027 
2028 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2029 static inline void                                                  \
2030 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2031           CPURISCVState *env, int vxrm)                             \
2032 {                                                                   \
2033     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2034     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2035     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2036 }
2037 
2038 static inline void
2039 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2040              CPURISCVState *env,
2041              uint32_t vl, uint32_t vm, int vxrm,
2042              opivv2_rm_fn *fn)
2043 {
2044     for (uint32_t i = env->vstart; i < vl; i++) {
2045         if (!vm && !vext_elem_mask(v0, i)) {
2046             continue;
2047         }
2048         fn(vd, vs1, vs2, i, env, vxrm);
2049     }
2050     env->vstart = 0;
2051 }
2052 
2053 static inline void
2054 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2055              CPURISCVState *env,
2056              uint32_t desc,
2057              opivv2_rm_fn *fn)
2058 {
2059     uint32_t vm = vext_vm(desc);
2060     uint32_t vl = env->vl;
2061 
2062     switch (env->vxrm) {
2063     case 0: /* rnu */
2064         vext_vv_rm_1(vd, v0, vs1, vs2,
2065                      env, vl, vm, 0, fn);
2066         break;
2067     case 1: /* rne */
2068         vext_vv_rm_1(vd, v0, vs1, vs2,
2069                      env, vl, vm, 1, fn);
2070         break;
2071     case 2: /* rdn */
2072         vext_vv_rm_1(vd, v0, vs1, vs2,
2073                      env, vl, vm, 2, fn);
2074         break;
2075     default: /* rod */
2076         vext_vv_rm_1(vd, v0, vs1, vs2,
2077                      env, vl, vm, 3, fn);
2078         break;
2079     }
2080 }
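/*
 * The rounding mode is read from env->vxrm once and dispatched through a
 * switch with literal values, which lets the compiler specialize the
 * inlined vext_vv_rm_1() loop for each rounding mode instead of reloading
 * env->vxrm for every element.
 */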
2081 
2082 /* generate helpers for fixed point instructions with OPIVV format */
2083 #define GEN_VEXT_VV_RM(NAME)                                    \
2084 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2085                   CPURISCVState *env, uint32_t desc)            \
2086 {                                                               \
2087     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2088                  do_##NAME);                                    \
2089 }
2090 
2091 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2092 {
2093     uint8_t res = a + b;
2094     if (res < a) {
2095         res = UINT8_MAX;
2096         env->vxsat = 0x1;
2097     }
2098     return res;
2099 }
2100 
2101 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2102                                uint16_t b)
2103 {
2104     uint16_t res = a + b;
2105     if (res < a) {
2106         res = UINT16_MAX;
2107         env->vxsat = 0x1;
2108     }
2109     return res;
2110 }
2111 
2112 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2113                                uint32_t b)
2114 {
2115     uint32_t res = a + b;
2116     if (res < a) {
2117         res = UINT32_MAX;
2118         env->vxsat = 0x1;
2119     }
2120     return res;
2121 }
2122 
2123 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2124                                uint64_t b)
2125 {
2126     uint64_t res = a + b;
2127     if (res < a) {
2128         res = UINT64_MAX;
2129         env->vxsat = 0x1;
2130     }
2131     return res;
2132 }
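/*
 * Unsigned saturating add: wrap-around is detected by the sum being
 * smaller than an operand.  For example, saddu8(200, 100) wraps to 44,
 * which is < 200, so the result saturates to UINT8_MAX and vxsat is set.
 */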
2133 
2134 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2135 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2136 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2137 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2138 GEN_VEXT_VV_RM(vsaddu_vv_b)
2139 GEN_VEXT_VV_RM(vsaddu_vv_h)
2140 GEN_VEXT_VV_RM(vsaddu_vv_w)
2141 GEN_VEXT_VV_RM(vsaddu_vv_d)
2142 
2143 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2144                           CPURISCVState *env, int vxrm);
2145 
2146 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2147 static inline void                                                  \
2148 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2149           CPURISCVState *env, int vxrm)                             \
2150 {                                                                   \
2151     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2152     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2153 }
2154 
2155 static inline void
2156 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2157              CPURISCVState *env,
2158              uint32_t vl, uint32_t vm, int vxrm,
2159              opivx2_rm_fn *fn)
2160 {
2161     for (uint32_t i = env->vstart; i < vl; i++) {
2162         if (!vm && !vext_elem_mask(v0, i)) {
2163             continue;
2164         }
2165         fn(vd, s1, vs2, i, env, vxrm);
2166     }
2167     env->vstart = 0;
2168 }
2169 
2170 static inline void
2171 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2172              CPURISCVState *env,
2173              uint32_t desc,
2174              opivx2_rm_fn *fn)
2175 {
2176     uint32_t vm = vext_vm(desc);
2177     uint32_t vl = env->vl;
2178 
2179     switch (env->vxrm) {
2180     case 0: /* rnu */
2181         vext_vx_rm_1(vd, v0, s1, vs2,
2182                      env, vl, vm, 0, fn);
2183         break;
2184     case 1: /* rne */
2185         vext_vx_rm_1(vd, v0, s1, vs2,
2186                      env, vl, vm, 1, fn);
2187         break;
2188     case 2: /* rdn */
2189         vext_vx_rm_1(vd, v0, s1, vs2,
2190                      env, vl, vm, 2, fn);
2191         break;
2192     default: /* rod */
2193         vext_vx_rm_1(vd, v0, s1, vs2,
2194                      env, vl, vm, 3, fn);
2195         break;
2196     }
2197 }
2198 
2199 /* generate helpers for fixed point instructions with OPIVX format */
2200 #define GEN_VEXT_VX_RM(NAME)                              \
2201 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2202         void *vs2, CPURISCVState *env, uint32_t desc)     \
2203 {                                                         \
2204     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2205                  do_##NAME);                              \
2206 }
2207 
2208 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2209 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2210 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2211 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2212 GEN_VEXT_VX_RM(vsaddu_vx_b)
2213 GEN_VEXT_VX_RM(vsaddu_vx_h)
2214 GEN_VEXT_VX_RM(vsaddu_vx_w)
2215 GEN_VEXT_VX_RM(vsaddu_vx_d)
2216 
2217 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2218 {
2219     int8_t res = a + b;
2220     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2221         res = a > 0 ? INT8_MAX : INT8_MIN;
2222         env->vxsat = 0x1;
2223     }
2224     return res;
2225 }
2226 
2227 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2228 {
2229     int16_t res = a + b;
2230     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2231         res = a > 0 ? INT16_MAX : INT16_MIN;
2232         env->vxsat = 0x1;
2233     }
2234     return res;
2235 }
2236 
2237 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2238 {
2239     int32_t res = a + b;
2240     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2241         res = a > 0 ? INT32_MAX : INT32_MIN;
2242         env->vxsat = 0x1;
2243     }
2244     return res;
2245 }
2246 
2247 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2248 {
2249     int64_t res = a + b;
2250     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2251         res = a > 0 ? INT64_MAX : INT64_MIN;
2252         env->vxsat = 0x1;
2253     }
2254     return res;
2255 }
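/*
 * (res ^ a) & (res ^ b) has the sign bit set exactly when a and b have
 * the same sign and res has the opposite sign, i.e. on signed overflow.
 * For example, sadd8(100, 100) wraps to -56; both XOR terms have bit 7
 * set, so the result saturates to INT8_MAX since a > 0.
 */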
2256 
2257 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2258 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2259 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2260 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2261 GEN_VEXT_VV_RM(vsadd_vv_b)
2262 GEN_VEXT_VV_RM(vsadd_vv_h)
2263 GEN_VEXT_VV_RM(vsadd_vv_w)
2264 GEN_VEXT_VV_RM(vsadd_vv_d)
2265 
2266 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2267 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2268 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2269 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2270 GEN_VEXT_VX_RM(vsadd_vx_b)
2271 GEN_VEXT_VX_RM(vsadd_vx_h)
2272 GEN_VEXT_VX_RM(vsadd_vx_w)
2273 GEN_VEXT_VX_RM(vsadd_vx_d)
2274 
2275 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2276 {
2277     uint8_t res = a - b;
2278     if (res > a) {
2279         res = 0;
2280         env->vxsat = 0x1;
2281     }
2282     return res;
2283 }
2284 
2285 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2286                                uint16_t b)
2287 {
2288     uint16_t res = a - b;
2289     if (res > a) {
2290         res = 0;
2291         env->vxsat = 0x1;
2292     }
2293     return res;
2294 }
2295 
2296 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2297                                uint32_t b)
2298 {
2299     uint32_t res = a - b;
2300     if (res > a) {
2301         res = 0;
2302         env->vxsat = 0x1;
2303     }
2304     return res;
2305 }
2306 
2307 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2308                                uint64_t b)
2309 {
2310     uint64_t res = a - b;
2311     if (res > a) {
2312         res = 0;
2313         env->vxsat = 0x1;
2314     }
2315     return res;
2316 }
2317 
2318 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2319 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2320 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2321 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2322 GEN_VEXT_VV_RM(vssubu_vv_b)
2323 GEN_VEXT_VV_RM(vssubu_vv_h)
2324 GEN_VEXT_VV_RM(vssubu_vv_w)
2325 GEN_VEXT_VV_RM(vssubu_vv_d)
2326 
2327 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2328 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2329 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2330 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2331 GEN_VEXT_VX_RM(vssubu_vx_b)
2332 GEN_VEXT_VX_RM(vssubu_vx_h)
2333 GEN_VEXT_VX_RM(vssubu_vx_w)
2334 GEN_VEXT_VX_RM(vssubu_vx_d)
2335 
2336 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2337 {
2338     int8_t res = a - b;
2339     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2340         res = a >= 0 ? INT8_MAX : INT8_MIN;
2341         env->vxsat = 0x1;
2342     }
2343     return res;
2344 }
2345 
2346 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2347 {
2348     int16_t res = a - b;
2349     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2350         res = a >= 0 ? INT16_MAX : INT16_MIN;
2351         env->vxsat = 0x1;
2352     }
2353     return res;
2354 }
2355 
2356 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2357 {
2358     int32_t res = a - b;
2359     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2360         res = a >= 0 ? INT32_MAX : INT32_MIN;
2361         env->vxsat = 0x1;
2362     }
2363     return res;
2364 }
2365 
2366 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2367 {
2368     int64_t res = a - b;
2369     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2370         res = a >= 0 ? INT64_MAX : INT64_MIN;
2371         env->vxsat = 0x1;
2372     }
2373     return res;
2374 }
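/*
 * For subtraction the test is (res ^ a) & (a ^ b): overflow can only
 * happen when a and b have different signs, and then res must keep the
 * sign of a.  For example, ssub8(-100, 100) wraps to +56, so the result
 * saturates to INT8_MIN and vxsat is set.
 */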
2375 
2376 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2377 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2378 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2379 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2380 GEN_VEXT_VV_RM(vssub_vv_b)
2381 GEN_VEXT_VV_RM(vssub_vv_h)
2382 GEN_VEXT_VV_RM(vssub_vv_w)
2383 GEN_VEXT_VV_RM(vssub_vv_d)
2384 
2385 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2386 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2387 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2388 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2389 GEN_VEXT_VX_RM(vssub_vx_b)
2390 GEN_VEXT_VX_RM(vssub_vx_h)
2391 GEN_VEXT_VX_RM(vssub_vx_w)
2392 GEN_VEXT_VX_RM(vssub_vx_d)
2393 
2394 /* Vector Single-Width Averaging Add and Subtract */
2395 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2396 {
2397     uint8_t d = extract64(v, shift, 1);
2398     uint8_t d1;
2399     uint64_t D1, D2;
2400 
2401     if (shift == 0 || shift > 64) {
2402         return 0;
2403     }
2404 
2405     d1 = extract64(v, shift - 1, 1);
2406     D1 = extract64(v, 0, shift);
2407     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2408         return d1;
2409     } else if (vxrm == 1) { /* round-to-nearest-even */
2410         if (shift > 1) {
2411             D2 = extract64(v, 0, shift - 1);
2412             return d1 & ((D2 != 0) | d);
2413         } else {
2414             return d1 & d;
2415         }
2416     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2417         return !d & (D1 != 0);
2418     }
2419     return 0; /* round-down (truncate) */
2420 }
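/*
 * Worked example: v = 0b1011, shift = 2, so the truncated result is
 * v >> 2 = 2 and the discarded bits are 0b11 (exact value 2.75):
 *   rnu: d1 (bit 1) = 1                   -> add 1 -> 3
 *   rne: d1 = 1 and lower bits non-zero   -> add 1 -> 3
 *   rdn: truncate                         -> add 0 -> 2
 *   rod: bit 2 of v is 0, discarded != 0  -> add 1 -> 3
 */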
2421 
2422 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2423 {
2424     int64_t res = (int64_t)a + b;
2425     uint8_t round = get_round(vxrm, res, 1);
2426 
2427     return (res >> 1) + round;
2428 }
2429 
2430 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2431 {
2432     int64_t res = a + b;
2433     uint8_t round = get_round(vxrm, res, 1);
2434     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2435 
2436     /* With signed overflow, bit 64 is inverse of bit 63. */
2437     return ((res >> 1) ^ over) + round;
2438 }
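/*
 * res holds the low 64 bits of the 65-bit sum; `over' is non-zero exactly
 * when the 64-bit signed add overflowed, and in that case the true bit 64
 * is the complement of bit 63 of res, so (res >> 1) ^ over is the exact
 * sum shifted right by one.  For example, with a = b = INT64_MAX, res
 * wraps to -2, over = INT64_MIN, and (res >> 1) ^ over = INT64_MAX, the
 * exact average (round is 0 here in every rounding mode).
 */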
2439 
2440 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2441 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2442 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2443 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2444 GEN_VEXT_VV_RM(vaadd_vv_b)
2445 GEN_VEXT_VV_RM(vaadd_vv_h)
2446 GEN_VEXT_VV_RM(vaadd_vv_w)
2447 GEN_VEXT_VV_RM(vaadd_vv_d)
2448 
2449 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2450 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2451 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2452 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2453 GEN_VEXT_VX_RM(vaadd_vx_b)
2454 GEN_VEXT_VX_RM(vaadd_vx_h)
2455 GEN_VEXT_VX_RM(vaadd_vx_w)
2456 GEN_VEXT_VX_RM(vaadd_vx_d)
2457 
2458 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2459                                uint32_t a, uint32_t b)
2460 {
2461     uint64_t res = (uint64_t)a + b;
2462     uint8_t round = get_round(vxrm, res, 1);
2463 
2464     return (res >> 1) + round;
2465 }
2466 
2467 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2468                                uint64_t a, uint64_t b)
2469 {
2470     uint64_t res = a + b;
2471     uint8_t round = get_round(vxrm, res, 1);
2472     uint64_t over = (uint64_t)(res < a) << 63;
2473 
2474     return ((res >> 1) | over) + round;
2475 }
2476 
2477 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2478 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2479 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2480 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2481 GEN_VEXT_VV_RM(vaaddu_vv_b)
2482 GEN_VEXT_VV_RM(vaaddu_vv_h)
2483 GEN_VEXT_VV_RM(vaaddu_vv_w)
2484 GEN_VEXT_VV_RM(vaaddu_vv_d)
2485 
2486 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2487 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2488 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2489 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2490 GEN_VEXT_VX_RM(vaaddu_vx_b)
2491 GEN_VEXT_VX_RM(vaaddu_vx_h)
2492 GEN_VEXT_VX_RM(vaaddu_vx_w)
2493 GEN_VEXT_VX_RM(vaaddu_vx_d)
2494 
2495 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2496 {
2497     int64_t res = (int64_t)a - b;
2498     uint8_t round = get_round(vxrm, res, 1);
2499 
2500     return (res >> 1) + round;
2501 }
2502 
2503 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2504 {
2505     int64_t res = (int64_t)a - b;
2506     uint8_t round = get_round(vxrm, res, 1);
2507     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2508 
2509     /* With signed overflow, bit 64 is inverse of bit 63. */
2510     return ((res >> 1) ^ over) + round;
2511 }
2512 
2513 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2514 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2515 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2516 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2517 GEN_VEXT_VV_RM(vasub_vv_b)
2518 GEN_VEXT_VV_RM(vasub_vv_h)
2519 GEN_VEXT_VV_RM(vasub_vv_w)
2520 GEN_VEXT_VV_RM(vasub_vv_d)
2521 
2522 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2523 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2524 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2525 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2526 GEN_VEXT_VX_RM(vasub_vx_b)
2527 GEN_VEXT_VX_RM(vasub_vx_h)
2528 GEN_VEXT_VX_RM(vasub_vx_w)
2529 GEN_VEXT_VX_RM(vasub_vx_d)
2530 
2531 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2532                                uint32_t a, uint32_t b)
2533 {
2534     int64_t res = (int64_t)a - b;
2535     uint8_t round = get_round(vxrm, res, 1);
2536 
2537     return (res >> 1) + round;
2538 }
2539 
2540 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2541                                uint64_t a, uint64_t b)
2542 {
2543     uint64_t res = (uint64_t)a - b;
2544     uint8_t round = get_round(vxrm, res, 1);
2545     uint64_t over = (uint64_t)(res > a) << 63;
2546 
2547     return ((res >> 1) | over) + round;
2548 }
2549 
2550 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2551 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2552 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2553 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2554 GEN_VEXT_VV_RM(vasubu_vv_b)
2555 GEN_VEXT_VV_RM(vasubu_vv_h)
2556 GEN_VEXT_VV_RM(vasubu_vv_w)
2557 GEN_VEXT_VV_RM(vasubu_vv_d)
2558 
2559 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2560 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2561 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2562 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2563 GEN_VEXT_VX_RM(vasubu_vx_b)
2564 GEN_VEXT_VX_RM(vasubu_vx_h)
2565 GEN_VEXT_VX_RM(vasubu_vx_w)
2566 GEN_VEXT_VX_RM(vasubu_vx_d)
2567 
2568 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2569 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2570 {
2571     uint8_t round;
2572     int16_t res;
2573 
2574     res = (int16_t)a * (int16_t)b;
2575     round = get_round(vxrm, res, 7);
2576     res   = (res >> 7) + round;
2577 
2578     if (res > INT8_MAX) {
2579         env->vxsat = 0x1;
2580         return INT8_MAX;
2581     } else if (res < INT8_MIN) {
2582         env->vxsat = 0x1;
2583         return INT8_MIN;
2584     } else {
2585         return res;
2586     }
2587 }
2588 
2589 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2590 {
2591     uint8_t round;
2592     int32_t res;
2593 
2594     res = (int32_t)a * (int32_t)b;
2595     round = get_round(vxrm, res, 15);
2596     res   = (res >> 15) + round;
2597 
2598     if (res > INT16_MAX) {
2599         env->vxsat = 0x1;
2600         return INT16_MAX;
2601     } else if (res < INT16_MIN) {
2602         env->vxsat = 0x1;
2603         return INT16_MIN;
2604     } else {
2605         return res;
2606     }
2607 }
2608 
2609 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2610 {
2611     uint8_t round;
2612     int64_t res;
2613 
2614     res = (int64_t)a * (int64_t)b;
2615     round = get_round(vxrm, res, 31);
2616     res   = (res >> 31) + round;
2617 
2618     if (res > INT32_MAX) {
2619         env->vxsat = 0x1;
2620         return INT32_MAX;
2621     } else if (res < INT32_MIN) {
2622         env->vxsat = 0x1;
2623         return INT32_MIN;
2624     } else {
2625         return res;
2626     }
2627 }
2628 
2629 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2630 {
2631     uint8_t round;
2632     uint64_t hi_64, lo_64;
2633     int64_t res;
2634 
2635     if (a == INT64_MIN && b == INT64_MIN) {
2636         env->vxsat = 1;
2637         return INT64_MAX;
2638     }
2639 
2640     muls64(&lo_64, &hi_64, a, b);
2641     round = get_round(vxrm, lo_64, 63);
2642     /*
2643      * The shift below cannot overflow: apart from the INT64_MIN *
2644      * INT64_MIN case handled above, the product has at least 2 sign bits.
2645      */
2646     res = (hi_64 << 1) | (lo_64 >> 63);
2647     if (round) {
2648         if (res == INT64_MAX) {
2649             env->vxsat = 1;
2650         } else {
2651             res += 1;
2652         }
2653     }
2654     return res;
2655 }
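/*
 * vsmul returns (a * b) >> (SEW - 1), rounded and saturated; e.g.
 * vsmul8(-128, -128) gives 16384 >> 7 = 128 > INT8_MAX, so it saturates
 * to 127 and sets vxsat.  The 64-bit variant has no wider type to work
 * in, so it special-cases INT64_MIN * INT64_MIN up front, assembles the
 * shifted product from the muls64() halves, and saturates only if the
 * rounding increment would overflow INT64_MAX.
 */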
2656 
2657 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2658 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2659 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2660 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2661 GEN_VEXT_VV_RM(vsmul_vv_b)
2662 GEN_VEXT_VV_RM(vsmul_vv_h)
2663 GEN_VEXT_VV_RM(vsmul_vv_w)
2664 GEN_VEXT_VV_RM(vsmul_vv_d)
2665 
2666 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2667 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2668 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2669 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2670 GEN_VEXT_VX_RM(vsmul_vx_b)
2671 GEN_VEXT_VX_RM(vsmul_vx_h)
2672 GEN_VEXT_VX_RM(vsmul_vx_w)
2673 GEN_VEXT_VX_RM(vsmul_vx_d)
2674 
2675 /* Vector Single-Width Scaling Shift Instructions */
2676 static inline uint8_t
2677 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2678 {
2679     uint8_t round, shift = b & 0x7;
2680     uint8_t res;
2681 
2682     round = get_round(vxrm, a, shift);
2683     res   = (a >> shift)  + round;
2684     return res;
2685 }
2686 static inline uint16_t
2687 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2688 {
2689     uint8_t round, shift = b & 0xf;
2690     uint16_t res;
2691 
2692     round = get_round(vxrm, a, shift);
2693     res   = (a >> shift)  + round;
2694     return res;
2695 }
2696 static inline uint32_t
2697 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2698 {
2699     uint8_t round, shift = b & 0x1f;
2700     uint32_t res;
2701 
2702     round = get_round(vxrm, a, shift);
2703     res   = (a >> shift)  + round;
2704     return res;
2705 }
2706 static inline uint64_t
2707 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2708 {
2709     uint8_t round, shift = b & 0x3f;
2710     uint64_t res;
2711 
2712     round = get_round(vxrm, a, shift);
2713     res   = (a >> shift)  + round;
2714     return res;
2715 }
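
/*
 * Example (vxrm = 0, round-to-nearest-up): vssrl8 with a = 31 and a shift
 * of 3 yields (31 >> 3) + 1 = 4, the rounding increment coming from bit 2
 * of a (31 / 8 = 3.875 rounds up to 4).
 */
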
2716 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2717 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2718 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2719 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2720 GEN_VEXT_VV_RM(vssrl_vv_b)
2721 GEN_VEXT_VV_RM(vssrl_vv_h)
2722 GEN_VEXT_VV_RM(vssrl_vv_w)
2723 GEN_VEXT_VV_RM(vssrl_vv_d)
2724 
2725 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2726 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2727 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2728 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2729 GEN_VEXT_VX_RM(vssrl_vx_b)
2730 GEN_VEXT_VX_RM(vssrl_vx_h)
2731 GEN_VEXT_VX_RM(vssrl_vx_w)
2732 GEN_VEXT_VX_RM(vssrl_vx_d)
2733 
2734 static inline int8_t
2735 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2736 {
2737     uint8_t round, shift = b & 0x7;
2738     int8_t res;
2739 
2740     round = get_round(vxrm, a, shift);
2741     res   = (a >> shift)  + round;
2742     return res;
2743 }
2744 static inline int16_t
2745 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2746 {
2747     uint8_t round, shift = b & 0xf;
2748     int16_t res;
2749 
2750     round = get_round(vxrm, a, shift);
2751     res   = (a >> shift)  + round;
2752     return res;
2753 }
2754 static inline int32_t
2755 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2756 {
2757     uint8_t round, shift = b & 0x1f;
2758     int32_t res;
2759 
2760     round = get_round(vxrm, a, shift);
2761     res   = (a >> shift)  + round;
2762     return res;
2763 }
2764 static inline int64_t
2765 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2766 {
2767     uint8_t round, shift = b & 0x3f;
2768     int64_t res;
2769 
2770     round = get_round(vxrm, a, shift);
2771     res   = (a >> shift)  + round;
2772     return res;
2773 }
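
/*
 * Example (vxrm = 0, round-to-nearest-up): vssra8 with a = -5 and a shift
 * of 1 yields (-5 >> 1) + 1 = -2 (-2.5 rounded up to -2).  This relies on
 * an arithmetic (sign-extending) right shift of the signed operand.
 */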
2774 
2775 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2776 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2777 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2778 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2779 GEN_VEXT_VV_RM(vssra_vv_b)
2780 GEN_VEXT_VV_RM(vssra_vv_h)
2781 GEN_VEXT_VV_RM(vssra_vv_w)
2782 GEN_VEXT_VV_RM(vssra_vv_d)
2783 
2784 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2785 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2786 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2787 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2788 GEN_VEXT_VX_RM(vssra_vx_b)
2789 GEN_VEXT_VX_RM(vssra_vx_h)
2790 GEN_VEXT_VX_RM(vssra_vx_w)
2791 GEN_VEXT_VX_RM(vssra_vx_d)
2792 
2793 /* Vector Narrowing Fixed-Point Clip Instructions */
2794 static inline int8_t
2795 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2796 {
2797     uint8_t round, shift = b & 0xf;
2798     int16_t res;
2799 
2800     round = get_round(vxrm, a, shift);
2801     res   = (a >> shift)  + round;
2802     if (res > INT8_MAX) {
2803         env->vxsat = 0x1;
2804         return INT8_MAX;
2805     } else if (res < INT8_MIN) {
2806         env->vxsat = 0x1;
2807         return INT8_MIN;
2808     } else {
2809         return res;
2810     }
2811 }
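
/*
 * Example (vxrm = 0): vnclip8 narrowing a = 1000 with a shift of 2 gives
 * 250, which does not fit in int8_t, so the result clips to INT8_MAX and
 * vxsat is set.
 */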
2812 
2813 static inline int16_t
2814 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2815 {
2816     uint8_t round, shift = b & 0x1f;
2817     int32_t res;
2818 
2819     round = get_round(vxrm, a, shift);
2820     res   = (a >> shift)  + round;
2821     if (res > INT16_MAX) {
2822         env->vxsat = 0x1;
2823         return INT16_MAX;
2824     } else if (res < INT16_MIN) {
2825         env->vxsat = 0x1;
2826         return INT16_MIN;
2827     } else {
2828         return res;
2829     }
2830 }
2831 
2832 static inline int32_t
2833 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2834 {
2835     uint8_t round, shift = b & 0x3f;
2836     int64_t res;
2837 
2838     round = get_round(vxrm, a, shift);
2839     res   = (a >> shift)  + round;
2840     if (res > INT32_MAX) {
2841         env->vxsat = 0x1;
2842         return INT32_MAX;
2843     } else if (res < INT32_MIN) {
2844         env->vxsat = 0x1;
2845         return INT32_MIN;
2846     } else {
2847         return res;
2848     }
2849 }
2850 
2851 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2852 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2853 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2854 GEN_VEXT_VV_RM(vnclip_wv_b)
2855 GEN_VEXT_VV_RM(vnclip_wv_h)
2856 GEN_VEXT_VV_RM(vnclip_wv_w)
2857 
2858 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2859 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2860 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2861 GEN_VEXT_VX_RM(vnclip_wx_b)
2862 GEN_VEXT_VX_RM(vnclip_wx_h)
2863 GEN_VEXT_VX_RM(vnclip_wx_w)
2864 
2865 static inline uint8_t
2866 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2867 {
2868     uint8_t round, shift = b & 0xf;
2869     uint16_t res;
2870 
2871     round = get_round(vxrm, a, shift);
2872     res   = (a >> shift)  + round;
2873     if (res > UINT8_MAX) {
2874         env->vxsat = 0x1;
2875         return UINT8_MAX;
2876     } else {
2877         return res;
2878     }
2879 }
2880 
2881 static inline uint16_t
2882 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2883 {
2884     uint8_t round, shift = b & 0x1f;
2885     uint32_t res;
2886 
2887     round = get_round(vxrm, a, shift);
2888     res   = (a >> shift)  + round;
2889     if (res > UINT16_MAX) {
2890         env->vxsat = 0x1;
2891         return UINT16_MAX;
2892     } else {
2893         return res;
2894     }
2895 }
2896 
2897 static inline uint32_t
2898 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2899 {
2900     uint8_t round, shift = b & 0x3f;
2901     uint64_t res;
2902 
2903     round = get_round(vxrm, a, shift);
2904     res   = (a >> shift)  + round;
2905     if (res > UINT32_MAX) {
2906         env->vxsat = 0x1;
2907         return UINT32_MAX;
2908     } else {
2909         return res;
2910     }
2911 }
2912 
2913 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2914 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2915 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2916 GEN_VEXT_VV_RM(vnclipu_wv_b)
2917 GEN_VEXT_VV_RM(vnclipu_wv_h)
2918 GEN_VEXT_VV_RM(vnclipu_wv_w)
2919 
2920 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2921 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2922 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2923 GEN_VEXT_VX_RM(vnclipu_wx_b)
2924 GEN_VEXT_VX_RM(vnclipu_wx_h)
2925 GEN_VEXT_VX_RM(vnclipu_wx_w)
2926 
2927 /*
2928  *** Vector Floating-Point Arithmetic Instructions
2929  */
2930 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2931 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2932 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2933                       CPURISCVState *env)                      \
2934 {                                                              \
2935     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2936     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2937     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2938 }
2939 
2940 #define GEN_VEXT_VV_ENV(NAME)                             \
2941 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2942                   void *vs2, CPURISCVState *env,          \
2943                   uint32_t desc)                          \
2944 {                                                         \
2945     uint32_t vm = vext_vm(desc);                          \
2946     uint32_t vl = env->vl;                                \
2947     uint32_t i;                                           \
2948                                                           \
2949     for (i = env->vstart; i < vl; i++) {                  \
2950         if (!vm && !vext_elem_mask(v0, i)) {              \
2951             continue;                                     \
2952         }                                                 \
2953         do_##NAME(vd, vs1, vs2, i, env);                  \
2954     }                                                     \
2955     env->vstart = 0;                                      \
2956 }
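
/*
 * Each RVVCALL(OPFVV2, NAME, ...) line below instantiates OPFVV2 to define
 * the per-element do_NAME(), and the matching GEN_VEXT_VV_ENV(NAME) emits
 * the HELPER(NAME) loop that applies it to the elements from vstart up to
 * vl, skipping elements masked off by v0 when vm is clear.
 */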
2957 
2958 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2959 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2960 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2961 GEN_VEXT_VV_ENV(vfadd_vv_h)
2962 GEN_VEXT_VV_ENV(vfadd_vv_w)
2963 GEN_VEXT_VV_ENV(vfadd_vv_d)
2964 
2965 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2966 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2967                       CPURISCVState *env)                      \
2968 {                                                              \
2969     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2970     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2971 }
2972 
2973 #define GEN_VEXT_VF(NAME)                                 \
2974 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2975                   void *vs2, CPURISCVState *env,          \
2976                   uint32_t desc)                          \
2977 {                                                         \
2978     uint32_t vm = vext_vm(desc);                          \
2979     uint32_t vl = env->vl;                                \
2980     uint32_t i;                                           \
2981                                                           \
2982     for (i = env->vstart; i < vl; i++) {                  \
2983         if (!vm && !vext_elem_mask(v0, i)) {              \
2984             continue;                                     \
2985         }                                                 \
2986         do_##NAME(vd, s1, vs2, i, env);                   \
2987     }                                                     \
2988     env->vstart = 0;                                      \
2989 }
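
/*
 * For the _vf forms the scalar operand is passed in s1 as a uint64_t;
 * OPFVF2 narrows it to the element type via the (TX1)(T1)s1 cast before
 * applying OP against each vs2 element.
 */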
2990 
2991 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
2992 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
2993 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
2994 GEN_VEXT_VF(vfadd_vf_h)
2995 GEN_VEXT_VF(vfadd_vf_w)
2996 GEN_VEXT_VF(vfadd_vf_d)
2997 
2998 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
2999 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3000 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3001 GEN_VEXT_VV_ENV(vfsub_vv_h)
3002 GEN_VEXT_VV_ENV(vfsub_vv_w)
3003 GEN_VEXT_VV_ENV(vfsub_vv_d)
3004 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3005 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3006 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3007 GEN_VEXT_VF(vfsub_vf_h)
3008 GEN_VEXT_VF(vfsub_vf_w)
3009 GEN_VEXT_VF(vfsub_vf_d)
3010 
3011 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3012 {
3013     return float16_sub(b, a, s);
3014 }
3015 
3016 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3017 {
3018     return float32_sub(b, a, s);
3019 }
3020 
3021 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3022 {
3023     return float64_sub(b, a, s);
3024 }
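
/*
 * OPFVF2 calls OP(s2, s1), so these _rsub wrappers swap their arguments to
 * give vfrsub.vf its reversed operand order: vd[i] = f[rs1] - vs2[i].
 */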
3025 
3026 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3027 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3028 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3029 GEN_VEXT_VF(vfrsub_vf_h)
3030 GEN_VEXT_VF(vfrsub_vf_w)
3031 GEN_VEXT_VF(vfrsub_vf_d)
3032 
3033 /* Vector Widening Floating-Point Add/Subtract Instructions */
3034 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3035 {
3036     return float32_add(float16_to_float32(a, true, s),
3037             float16_to_float32(b, true, s), s);
3038 }
3039 
3040 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3041 {
3042     return float64_add(float32_to_float64(a, s),
3043             float32_to_float64(b, s), s);
3044 
3045 }
3046 
3047 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3048 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3049 GEN_VEXT_VV_ENV(vfwadd_vv_h)
3050 GEN_VEXT_VV_ENV(vfwadd_vv_w)
3051 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3052 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3053 GEN_VEXT_VF(vfwadd_vf_h)
3054 GEN_VEXT_VF(vfwadd_vf_w)
3055 
3056 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3057 {
3058     return float32_sub(float16_to_float32(a, true, s),
3059             float16_to_float32(b, true, s), s);
3060 }
3061 
3062 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3063 {
3064     return float64_sub(float32_to_float64(a, s),
3065             float32_to_float64(b, s), s);
3066 
3067 }
3068 
3069 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3070 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3071 GEN_VEXT_VV_ENV(vfwsub_vv_h)
3072 GEN_VEXT_VV_ENV(vfwsub_vv_w)
3073 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3074 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3075 GEN_VEXT_VF(vfwsub_vf_h)
3076 GEN_VEXT_VF(vfwsub_vf_w)
3077 
3078 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3079 {
3080     return float32_add(a, float16_to_float32(b, true, s), s);
3081 }
3082 
3083 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3084 {
3085     return float64_add(a, float32_to_float64(b, s), s);
3086 }
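
/*
 * In the .wv/.wf forms vs2 is already 2*SEW wide (the WOP_WUUU_* type
 * lists), so only the vs1/rs1 operand is converted up before the
 * wide-precision add or subtract.
 */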
3087 
3088 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3089 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3090 GEN_VEXT_VV_ENV(vfwadd_wv_h)
3091 GEN_VEXT_VV_ENV(vfwadd_wv_w)
3092 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3093 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3094 GEN_VEXT_VF(vfwadd_wf_h)
3095 GEN_VEXT_VF(vfwadd_wf_w)
3096 
3097 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3098 {
3099     return float32_sub(a, float16_to_float32(b, true, s), s);
3100 }
3101 
3102 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3103 {
3104     return float64_sub(a, float32_to_float64(b, s), s);
3105 }
3106 
3107 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3108 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3109 GEN_VEXT_VV_ENV(vfwsub_wv_h)
3110 GEN_VEXT_VV_ENV(vfwsub_wv_w)
3111 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3112 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3113 GEN_VEXT_VF(vfwsub_wf_h)
3114 GEN_VEXT_VF(vfwsub_wf_w)
3115 
3116 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3117 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3118 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3119 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3120 GEN_VEXT_VV_ENV(vfmul_vv_h)
3121 GEN_VEXT_VV_ENV(vfmul_vv_w)
3122 GEN_VEXT_VV_ENV(vfmul_vv_d)
3123 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3124 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3125 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3126 GEN_VEXT_VF(vfmul_vf_h)
3127 GEN_VEXT_VF(vfmul_vf_w)
3128 GEN_VEXT_VF(vfmul_vf_d)
3129 
3130 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3131 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3132 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3133 GEN_VEXT_VV_ENV(vfdiv_vv_h)
3134 GEN_VEXT_VV_ENV(vfdiv_vv_w)
3135 GEN_VEXT_VV_ENV(vfdiv_vv_d)
3136 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3137 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3138 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3139 GEN_VEXT_VF(vfdiv_vf_h)
3140 GEN_VEXT_VF(vfdiv_vf_w)
3141 GEN_VEXT_VF(vfdiv_vf_d)
3142 
3143 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3144 {
3145     return float16_div(b, a, s);
3146 }
3147 
3148 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3149 {
3150     return float32_div(b, a, s);
3151 }
3152 
3153 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3154 {
3155     return float64_div(b, a, s);
3156 }
3157 
3158 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3159 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3160 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3161 GEN_VEXT_VF(vfrdiv_vf_h)
3162 GEN_VEXT_VF(vfrdiv_vf_w)
3163 GEN_VEXT_VF(vfrdiv_vf_d)
3164 
3165 /* Vector Widening Floating-Point Multiply */
3166 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3167 {
3168     return float32_mul(float16_to_float32(a, true, s),
3169             float16_to_float32(b, true, s), s);
3170 }
3171 
3172 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3173 {
3174     return float64_mul(float32_to_float64(a, s),
3175             float32_to_float64(b, s), s);
3176 
3177 }
3178 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3179 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3180 GEN_VEXT_VV_ENV(vfwmul_vv_h)
3181 GEN_VEXT_VV_ENV(vfwmul_vv_w)
3182 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3183 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3184 GEN_VEXT_VF(vfwmul_vf_h)
3185 GEN_VEXT_VF(vfwmul_vf_w)
3186 
3187 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3188 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3189 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3190         CPURISCVState *env)                                        \
3191 {                                                                  \
3192     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3193     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3194     TD d = *((TD *)vd + HD(i));                                    \
3195     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3196 }
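
/*
 * OP is invoked as OP(s2, s1, d): the vs2 element, the vs1 element and the
 * current destination value.  The fmacc helpers below therefore compute
 * vs1 * vs2 + vd, matching vfmacc's accumulate-into-vd semantics.
 */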
3197 
3198 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3199 {
3200     return float16_muladd(a, b, d, 0, s);
3201 }
3202 
3203 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3204 {
3205     return float32_muladd(a, b, d, 0, s);
3206 }
3207 
3208 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3209 {
3210     return float64_muladd(a, b, d, 0, s);
3211 }
3212 
3213 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3214 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3215 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3216 GEN_VEXT_VV_ENV(vfmacc_vv_h)
3217 GEN_VEXT_VV_ENV(vfmacc_vv_w)
3218 GEN_VEXT_VV_ENV(vfmacc_vv_d)
3219 
3220 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3221 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3222         CPURISCVState *env)                                       \
3223 {                                                                 \
3224     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3225     TD d = *((TD *)vd + HD(i));                                   \
3226     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3227 }
3228 
3229 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3230 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3231 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3232 GEN_VEXT_VF(vfmacc_vf_h)
3233 GEN_VEXT_VF(vfmacc_vf_w)
3234 GEN_VEXT_VF(vfmacc_vf_d)
3235 
3236 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3237 {
3238     return float16_muladd(a, b, d,
3239             float_muladd_negate_c | float_muladd_negate_product, s);
3240 }
3241 
3242 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3243 {
3244     return float32_muladd(a, b, d,
3245             float_muladd_negate_c | float_muladd_negate_product, s);
3246 }
3247 
3248 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3249 {
3250     return float64_muladd(a, b, d,
3251             float_muladd_negate_c | float_muladd_negate_product, s);
3252 }
3253 
3254 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3255 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3256 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3257 GEN_VEXT_VV_ENV(vfnmacc_vv_h)
3258 GEN_VEXT_VV_ENV(vfnmacc_vv_w)
3259 GEN_VEXT_VV_ENV(vfnmacc_vv_d)
3260 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3261 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3262 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3263 GEN_VEXT_VF(vfnmacc_vf_h)
3264 GEN_VEXT_VF(vfnmacc_vf_w)
3265 GEN_VEXT_VF(vfnmacc_vf_d)
3266 
3267 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3268 {
3269     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3270 }
3271 
3272 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3273 {
3274     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3275 }
3276 
3277 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3278 {
3279     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3280 }
3281 
3282 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3283 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3284 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3285 GEN_VEXT_VV_ENV(vfmsac_vv_h)
3286 GEN_VEXT_VV_ENV(vfmsac_vv_w)
3287 GEN_VEXT_VV_ENV(vfmsac_vv_d)
3288 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3289 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3290 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3291 GEN_VEXT_VF(vfmsac_vf_h)
3292 GEN_VEXT_VF(vfmsac_vf_w)
3293 GEN_VEXT_VF(vfmsac_vf_d)
3294 
3295 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3296 {
3297     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3298 }
3299 
3300 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3301 {
3302     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3303 }
3304 
3305 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3306 {
3307     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3308 }
3309 
3310 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3311 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3312 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3313 GEN_VEXT_VV_ENV(vfnmsac_vv_h)
3314 GEN_VEXT_VV_ENV(vfnmsac_vv_w)
3315 GEN_VEXT_VV_ENV(vfnmsac_vv_d)
3316 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3317 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3318 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3319 GEN_VEXT_VF(vfnmsac_vf_h)
3320 GEN_VEXT_VF(vfnmsac_vf_w)
3321 GEN_VEXT_VF(vfnmsac_vf_d)
3322 
3323 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3324 {
3325     return float16_muladd(d, b, a, 0, s);
3326 }
3327 
3328 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3329 {
3330     return float32_muladd(d, b, a, 0, s);
3331 }
3332 
3333 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3334 {
3335     return float64_muladd(d, b, a, 0, s);
3336 }
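
/*
 * The fmadd/fnmadd/fmsub/fnmsub variants pass d (the old vd value) as the
 * multiplicand rather than the addend: vd[i] = +-(vd[i] * vs1[i]) +- vs2[i].
 */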
3337 
3338 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3339 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3340 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3341 GEN_VEXT_VV_ENV(vfmadd_vv_h)
3342 GEN_VEXT_VV_ENV(vfmadd_vv_w)
3343 GEN_VEXT_VV_ENV(vfmadd_vv_d)
3344 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3345 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3346 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3347 GEN_VEXT_VF(vfmadd_vf_h)
3348 GEN_VEXT_VF(vfmadd_vf_w)
3349 GEN_VEXT_VF(vfmadd_vf_d)
3350 
3351 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3352 {
3353     return float16_muladd(d, b, a,
3354             float_muladd_negate_c | float_muladd_negate_product, s);
3355 }
3356 
3357 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3358 {
3359     return float32_muladd(d, b, a,
3360             float_muladd_negate_c | float_muladd_negate_product, s);
3361 }
3362 
3363 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3364 {
3365     return float64_muladd(d, b, a,
3366             float_muladd_negate_c | float_muladd_negate_product, s);
3367 }
3368 
3369 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3370 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3371 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3372 GEN_VEXT_VV_ENV(vfnmadd_vv_h)
3373 GEN_VEXT_VV_ENV(vfnmadd_vv_w)
3374 GEN_VEXT_VV_ENV(vfnmadd_vv_d)
3375 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3376 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3377 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3378 GEN_VEXT_VF(vfnmadd_vf_h)
3379 GEN_VEXT_VF(vfnmadd_vf_w)
3380 GEN_VEXT_VF(vfnmadd_vf_d)
3381 
3382 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3383 {
3384     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3385 }
3386 
3387 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3388 {
3389     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3390 }
3391 
3392 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3393 {
3394     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3395 }
3396 
3397 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3398 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3399 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3400 GEN_VEXT_VV_ENV(vfmsub_vv_h)
3401 GEN_VEXT_VV_ENV(vfmsub_vv_w)
3402 GEN_VEXT_VV_ENV(vfmsub_vv_d)
3403 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3404 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3405 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3406 GEN_VEXT_VF(vfmsub_vf_h)
3407 GEN_VEXT_VF(vfmsub_vf_w)
3408 GEN_VEXT_VF(vfmsub_vf_d)
3409 
3410 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3411 {
3412     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3413 }
3414 
3415 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3416 {
3417     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3418 }
3419 
3420 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3421 {
3422     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3423 }
3424 
3425 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3426 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3427 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3428 GEN_VEXT_VV_ENV(vfnmsub_vv_h)
3429 GEN_VEXT_VV_ENV(vfnmsub_vv_w)
3430 GEN_VEXT_VV_ENV(vfnmsub_vv_d)
3431 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3432 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3433 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3434 GEN_VEXT_VF(vfnmsub_vf_h)
3435 GEN_VEXT_VF(vfnmsub_vf_w)
3436 GEN_VEXT_VF(vfnmsub_vf_d)
3437 
3438 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3439 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3440 {
3441     return float32_muladd(float16_to_float32(a, true, s),
3442                         float16_to_float32(b, true, s), d, 0, s);
3443 }
3444 
3445 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3446 {
3447     return float64_muladd(float32_to_float64(a, s),
3448                         float32_to_float64(b, s), d, 0, s);
3449 }
3450 
3451 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3452 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3453 GEN_VEXT_VV_ENV(vfwmacc_vv_h)
3454 GEN_VEXT_VV_ENV(vfwmacc_vv_w)
3455 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3456 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3457 GEN_VEXT_VF(vfwmacc_vf_h)
3458 GEN_VEXT_VF(vfwmacc_vf_w)
3459 
3460 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3461 {
3462     return float32_muladd(float16_to_float32(a, true, s),
3463                         float16_to_float32(b, true, s), d,
3464                         float_muladd_negate_c | float_muladd_negate_product, s);
3465 }
3466 
3467 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3468 {
3469     return float64_muladd(float32_to_float64(a, s),
3470                         float32_to_float64(b, s), d,
3471                         float_muladd_negate_c | float_muladd_negate_product, s);
3472 }
3473 
3474 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3475 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3476 GEN_VEXT_VV_ENV(vfwnmacc_vv_h)
3477 GEN_VEXT_VV_ENV(vfwnmacc_vv_w)
3478 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3479 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3480 GEN_VEXT_VF(vfwnmacc_vf_h)
3481 GEN_VEXT_VF(vfwnmacc_vf_w)
3482 
3483 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3484 {
3485     return float32_muladd(float16_to_float32(a, true, s),
3486                         float16_to_float32(b, true, s), d,
3487                         float_muladd_negate_c, s);
3488 }
3489 
3490 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3491 {
3492     return float64_muladd(float32_to_float64(a, s),
3493                         float32_to_float64(b, s), d,
3494                         float_muladd_negate_c, s);
3495 }
3496 
3497 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3498 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3499 GEN_VEXT_VV_ENV(vfwmsac_vv_h)
3500 GEN_VEXT_VV_ENV(vfwmsac_vv_w)
3501 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3502 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3503 GEN_VEXT_VF(vfwmsac_vf_h)
3504 GEN_VEXT_VF(vfwmsac_vf_w)
3505 
3506 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3507 {
3508     return float32_muladd(float16_to_float32(a, true, s),
3509                         float16_to_float32(b, true, s), d,
3510                         float_muladd_negate_product, s);
3511 }
3512 
3513 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3514 {
3515     return float64_muladd(float32_to_float64(a, s),
3516                         float32_to_float64(b, s), d,
3517                         float_muladd_negate_product, s);
3518 }
3519 
3520 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3521 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3522 GEN_VEXT_VV_ENV(vfwnmsac_vv_h)
3523 GEN_VEXT_VV_ENV(vfwnmsac_vv_w)
3524 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3525 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3526 GEN_VEXT_VF(vfwnmsac_vf_h)
3527 GEN_VEXT_VF(vfwnmsac_vf_w)
3528 
3529 /* Vector Floating-Point Square-Root Instruction */
3530 /* (TD, T2, TX2) */
3531 #define OP_UU_H uint16_t, uint16_t, uint16_t
3532 #define OP_UU_W uint32_t, uint32_t, uint32_t
3533 #define OP_UU_D uint64_t, uint64_t, uint64_t
3534 
3535 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3536 static void do_##NAME(void *vd, void *vs2, int i,      \
3537         CPURISCVState *env)                            \
3538 {                                                      \
3539     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3540     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3541 }
3542 
3543 #define GEN_VEXT_V_ENV(NAME)                           \
3544 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3545         CPURISCVState *env, uint32_t desc)             \
3546 {                                                      \
3547     uint32_t vm = vext_vm(desc);                       \
3548     uint32_t vl = env->vl;                             \
3549     uint32_t i;                                        \
3550                                                        \
3551     if (vl == 0) {                                     \
3552         return;                                        \
3553     }                                                  \
3554     for (i = env->vstart; i < vl; i++) {               \
3555         if (!vm && !vext_elem_mask(v0, i)) {           \
3556             continue;                                  \
3557         }                                              \
3558         do_##NAME(vd, vs2, i, env);                    \
3559     }                                                  \
3560     env->vstart = 0;                                   \
3561 }
3562 
3563 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3564 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3565 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3566 GEN_VEXT_V_ENV(vfsqrt_v_h)
3567 GEN_VEXT_V_ENV(vfsqrt_v_w)
3568 GEN_VEXT_V_ENV(vfsqrt_v_d)
3569 
3570 /*
3571  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3572  *
3573  * Adapted from riscv-v-spec recip.c:
3574  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3575  */
3576 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3577 {
3578     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3579     uint64_t exp = extract64(f, frac_size, exp_size);
3580     uint64_t frac = extract64(f, 0, frac_size);
3581 
3582     const uint8_t lookup_table[] = {
3583         52, 51, 50, 48, 47, 46, 44, 43,
3584         42, 41, 40, 39, 38, 36, 35, 34,
3585         33, 32, 31, 30, 30, 29, 28, 27,
3586         26, 25, 24, 23, 23, 22, 21, 20,
3587         19, 19, 18, 17, 16, 16, 15, 14,
3588         14, 13, 12, 12, 11, 10, 10, 9,
3589         9, 8, 7, 7, 6, 6, 5, 4,
3590         4, 3, 3, 2, 2, 1, 1, 0,
3591         127, 125, 123, 121, 119, 118, 116, 114,
3592         113, 111, 109, 108, 106, 105, 103, 102,
3593         100, 99, 97, 96, 95, 93, 92, 91,
3594         90, 88, 87, 86, 85, 84, 83, 82,
3595         80, 79, 78, 77, 76, 75, 74, 73,
3596         72, 71, 70, 70, 69, 68, 67, 66,
3597         65, 64, 63, 63, 62, 61, 60, 59,
3598         59, 58, 57, 56, 56, 55, 54, 53
3599     };
3600     const int precision = 7;
3601 
3602     if (exp == 0 && frac != 0) { /* subnormal */
3603         /* Normalize the subnormal. */
3604         while (extract64(frac, frac_size - 1, 1) == 0) {
3605             exp--;
3606             frac <<= 1;
3607         }
3608 
3609         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3610     }
3611 
3612     int idx = ((exp & 1) << (precision - 1)) |
3613                 (frac >> (frac_size - precision + 1));
3614     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3615                             (frac_size - precision);
3616     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3617 
3618     uint64_t val = 0;
3619     val = deposit64(val, 0, frac_size, out_frac);
3620     val = deposit64(val, frac_size, exp_size, out_exp);
3621     val = deposit64(val, frac_size + exp_size, 1, sign);
3622     return val;
3623 }
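
/*
 * Example: for the float32 value 1.0 (exp = 127, frac = 0) the table index
 * is 64 (odd exponent, top fraction bits zero), giving out_frac = 127 << 16
 * and out_exp = 126, i.e. an estimate of about 0.996 for 1/sqrt(1.0),
 * accurate to the guaranteed 7 bits.
 */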
3624 
3625 static float16 frsqrt7_h(float16 f, float_status *s)
3626 {
3627     int exp_size = 5, frac_size = 10;
3628     bool sign = float16_is_neg(f);
3629 
3630     /*
3631      * frsqrt7(sNaN) = canonical NaN
3632      * frsqrt7(-inf) = canonical NaN
3633      * frsqrt7(-normal) = canonical NaN
3634      * frsqrt7(-subnormal) = canonical NaN
3635      */
3636     if (float16_is_signaling_nan(f, s) ||
3637             (float16_is_infinity(f) && sign) ||
3638             (float16_is_normal(f) && sign) ||
3639             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3640         s->float_exception_flags |= float_flag_invalid;
3641         return float16_default_nan(s);
3642     }
3643 
3644     /* frsqrt7(qNaN) = canonical NaN */
3645     if (float16_is_quiet_nan(f, s)) {
3646         return float16_default_nan(s);
3647     }
3648 
3649     /* frsqrt7(+-0) = +-inf */
3650     if (float16_is_zero(f)) {
3651         s->float_exception_flags |= float_flag_divbyzero;
3652         return float16_set_sign(float16_infinity, sign);
3653     }
3654 
3655     /* frsqrt7(+inf) = +0 */
3656     if (float16_is_infinity(f) && !sign) {
3657         return float16_set_sign(float16_zero, sign);
3658     }
3659 
3660     /* +normal, +subnormal */
3661     uint64_t val = frsqrt7(f, exp_size, frac_size);
3662     return make_float16(val);
3663 }
3664 
3665 static float32 frsqrt7_s(float32 f, float_status *s)
3666 {
3667     int exp_size = 8, frac_size = 23;
3668     bool sign = float32_is_neg(f);
3669 
3670     /*
3671      * frsqrt7(sNaN) = canonical NaN
3672      * frsqrt7(-inf) = canonical NaN
3673      * frsqrt7(-normal) = canonical NaN
3674      * frsqrt7(-subnormal) = canonical NaN
3675      */
3676     if (float32_is_signaling_nan(f, s) ||
3677             (float32_is_infinity(f) && sign) ||
3678             (float32_is_normal(f) && sign) ||
3679             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3680         s->float_exception_flags |= float_flag_invalid;
3681         return float32_default_nan(s);
3682     }
3683 
3684     /* frsqrt7(qNaN) = canonical NaN */
3685     if (float32_is_quiet_nan(f, s)) {
3686         return float32_default_nan(s);
3687     }
3688 
3689     /* frsqrt7(+-0) = +-inf */
3690     if (float32_is_zero(f)) {
3691         s->float_exception_flags |= float_flag_divbyzero;
3692         return float32_set_sign(float32_infinity, sign);
3693     }
3694 
3695     /* frsqrt7(+inf) = +0 */
3696     if (float32_is_infinity(f) && !sign) {
3697         return float32_set_sign(float32_zero, sign);
3698     }
3699 
3700     /* +normal, +subnormal */
3701     uint64_t val = frsqrt7(f, exp_size, frac_size);
3702     return make_float32(val);
3703 }
3704 
3705 static float64 frsqrt7_d(float64 f, float_status *s)
3706 {
3707     int exp_size = 11, frac_size = 52;
3708     bool sign = float64_is_neg(f);
3709 
3710     /*
3711      * frsqrt7(sNaN) = canonical NaN
3712      * frsqrt7(-inf) = canonical NaN
3713      * frsqrt7(-normal) = canonical NaN
3714      * frsqrt7(-subnormal) = canonical NaN
3715      */
3716     if (float64_is_signaling_nan(f, s) ||
3717             (float64_is_infinity(f) && sign) ||
3718             (float64_is_normal(f) && sign) ||
3719             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3720         s->float_exception_flags |= float_flag_invalid;
3721         return float64_default_nan(s);
3722     }
3723 
3724     /* frsqrt7(qNaN) = canonical NaN */
3725     if (float64_is_quiet_nan(f, s)) {
3726         return float64_default_nan(s);
3727     }
3728 
3729     /* frsqrt7(+-0) = +-inf */
3730     if (float64_is_zero(f)) {
3731         s->float_exception_flags |= float_flag_divbyzero;
3732         return float64_set_sign(float64_infinity, sign);
3733     }
3734 
3735     /* frsqrt7(+inf) = +0 */
3736     if (float64_is_infinity(f) && !sign) {
3737         return float64_set_sign(float64_zero, sign);
3738     }
3739 
3740     /* +normal, +subnormal */
3741     uint64_t val = frsqrt7(f, exp_size, frac_size);
3742     return make_float64(val);
3743 }
3744 
3745 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3746 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3747 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3748 GEN_VEXT_V_ENV(vfrsqrt7_v_h)
3749 GEN_VEXT_V_ENV(vfrsqrt7_v_w)
3750 GEN_VEXT_V_ENV(vfrsqrt7_v_d)
3751 
3752 /*
3753  * Vector Floating-Point Reciprocal Estimate Instruction
3754  *
3755  * Adapted from riscv-v-spec recip.c:
3756  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3757  */
3758 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3759                       float_status *s)
3760 {
3761     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3762     uint64_t exp = extract64(f, frac_size, exp_size);
3763     uint64_t frac = extract64(f, 0, frac_size);
3764 
3765     const uint8_t lookup_table[] = {
3766         127, 125, 123, 121, 119, 117, 116, 114,
3767         112, 110, 109, 107, 105, 104, 102, 100,
3768         99, 97, 96, 94, 93, 91, 90, 88,
3769         87, 85, 84, 83, 81, 80, 79, 77,
3770         76, 75, 74, 72, 71, 70, 69, 68,
3771         66, 65, 64, 63, 62, 61, 60, 59,
3772         58, 57, 56, 55, 54, 53, 52, 51,
3773         50, 49, 48, 47, 46, 45, 44, 43,
3774         42, 41, 40, 40, 39, 38, 37, 36,
3775         35, 35, 34, 33, 32, 31, 31, 30,
3776         29, 28, 28, 27, 26, 25, 25, 24,
3777         23, 23, 22, 21, 21, 20, 19, 19,
3778         18, 17, 17, 16, 15, 15, 14, 14,
3779         13, 12, 12, 11, 11, 10, 9, 9,
3780         8, 8, 7, 7, 6, 5, 5, 4,
3781         4, 3, 3, 2, 2, 1, 1, 0
3782     };
3783     const int precision = 7;
3784 
3785     if (exp == 0 && frac != 0) { /* subnormal */
3786         /* Normalize the subnormal. */
3787         while (extract64(frac, frac_size - 1, 1) == 0) {
3788             exp--;
3789             frac <<= 1;
3790         }
3791 
3792         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3793 
3794         if (exp != 0 && exp != UINT64_MAX) {
3795             /*
3796              * Overflow to inf or max value of same sign,
3797              * depending on sign and rounding mode.
3798              */
3799             s->float_exception_flags |= (float_flag_inexact |
3800                                          float_flag_overflow);
3801 
3802             if ((s->float_rounding_mode == float_round_to_zero) ||
3803                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3804                 ((s->float_rounding_mode == float_round_up) && sign)) {
3805                 /* Return the largest-magnitude finite value of the same sign. */
3806                 return (sign << (exp_size + frac_size)) |
3807                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3808             } else {
3809                 /* Return +-inf. */
3810                 return (sign << (exp_size + frac_size)) |
3811                     MAKE_64BIT_MASK(frac_size, exp_size);
3812             }
3813         }
3814     }
3815 
3816     int idx = frac >> (frac_size - precision);
3817     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3818                             (frac_size - precision);
3819     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3820 
3821     if (out_exp == 0 || out_exp == UINT64_MAX) {
3822         /*
3823          * The result is subnormal, but don't raise the underflow exception,
3824          * because there's no additional loss of precision.
3825          */
3826         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3827         if (out_exp == UINT64_MAX) {
3828             out_frac >>= 1;
3829             out_exp = 0;
3830         }
3831     }
3832 
3833     uint64_t val = 0;
3834     val = deposit64(val, 0, frac_size, out_frac);
3835     val = deposit64(val, frac_size, exp_size, out_exp);
3836     val = deposit64(val, frac_size + exp_size, 1, sign);
3837     return val;
3838 }
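
/*
 * Example: for the float32 value 2.0 (exp = 128, frac = 0) the table index
 * is 0, giving out_frac = 127 << 16 and out_exp = 125, i.e. an estimate of
 * about 0.498 for 1/2.0, again accurate to 7 bits.
 */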
3839 
3840 static float16 frec7_h(float16 f, float_status *s)
3841 {
3842     int exp_size = 5, frac_size = 10;
3843     bool sign = float16_is_neg(f);
3844 
3845     /* frec7(+-inf) = +-0 */
3846     if (float16_is_infinity(f)) {
3847         return float16_set_sign(float16_zero, sign);
3848     }
3849 
3850     /* frec7(+-0) = +-inf */
3851     if (float16_is_zero(f)) {
3852         s->float_exception_flags |= float_flag_divbyzero;
3853         return float16_set_sign(float16_infinity, sign);
3854     }
3855 
3856     /* frec7(sNaN) = canonical NaN */
3857     if (float16_is_signaling_nan(f, s)) {
3858         s->float_exception_flags |= float_flag_invalid;
3859         return float16_default_nan(s);
3860     }
3861 
3862     /* frec7(qNaN) = canonical NaN */
3863     if (float16_is_quiet_nan(f, s)) {
3864         return float16_default_nan(s);
3865     }
3866 
3867     /* +-normal, +-subnormal */
3868     uint64_t val = frec7(f, exp_size, frac_size, s);
3869     return make_float16(val);
3870 }
3871 
3872 static float32 frec7_s(float32 f, float_status *s)
3873 {
3874     int exp_size = 8, frac_size = 23;
3875     bool sign = float32_is_neg(f);
3876 
3877     /* frec7(+-inf) = +-0 */
3878     if (float32_is_infinity(f)) {
3879         return float32_set_sign(float32_zero, sign);
3880     }
3881 
3882     /* frec7(+-0) = +-inf */
3883     if (float32_is_zero(f)) {
3884         s->float_exception_flags |= float_flag_divbyzero;
3885         return float32_set_sign(float32_infinity, sign);
3886     }
3887 
3888     /* frec7(sNaN) = canonical NaN */
3889     if (float32_is_signaling_nan(f, s)) {
3890         s->float_exception_flags |= float_flag_invalid;
3891         return float32_default_nan(s);
3892     }
3893 
3894     /* frec7(qNaN) = canonical NaN */
3895     if (float32_is_quiet_nan(f, s)) {
3896         return float32_default_nan(s);
3897     }
3898 
3899     /* +-normal, +-subnormal */
3900     uint64_t val = frec7(f, exp_size, frac_size, s);
3901     return make_float32(val);
3902 }
3903 
3904 static float64 frec7_d(float64 f, float_status *s)
3905 {
3906     int exp_size = 11, frac_size = 52;
3907     bool sign = float64_is_neg(f);
3908 
3909     /* frec7(+-inf) = +-0 */
3910     if (float64_is_infinity(f)) {
3911         return float64_set_sign(float64_zero, sign);
3912     }
3913 
3914     /* frec7(+-0) = +-inf */
3915     if (float64_is_zero(f)) {
3916         s->float_exception_flags |= float_flag_divbyzero;
3917         return float64_set_sign(float64_infinity, sign);
3918     }
3919 
3920     /* frec7(sNaN) = canonical NaN */
3921     if (float64_is_signaling_nan(f, s)) {
3922         s->float_exception_flags |= float_flag_invalid;
3923         return float64_default_nan(s);
3924     }
3925 
3926     /* frec7(qNaN) = canonical NaN */
3927     if (float64_is_quiet_nan(f, s)) {
3928         return float64_default_nan(s);
3929     }
3930 
3931     /* +-normal, +-subnormal */
3932     uint64_t val = frec7(f, exp_size, frac_size, s);
3933     return make_float64(val);
3934 }
3935 
3936 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3937 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3938 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3939 GEN_VEXT_V_ENV(vfrec7_v_h)
3940 GEN_VEXT_V_ENV(vfrec7_v_w)
3941 GEN_VEXT_V_ENV(vfrec7_v_d)
3942 
3943 /* Vector Floating-Point MIN/MAX Instructions */
3944 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3945 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3946 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3947 GEN_VEXT_VV_ENV(vfmin_vv_h)
3948 GEN_VEXT_VV_ENV(vfmin_vv_w)
3949 GEN_VEXT_VV_ENV(vfmin_vv_d)
3950 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3951 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3952 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3953 GEN_VEXT_VF(vfmin_vf_h)
3954 GEN_VEXT_VF(vfmin_vf_w)
3955 GEN_VEXT_VF(vfmin_vf_d)
3956 
3957 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3958 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3959 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3960 GEN_VEXT_VV_ENV(vfmax_vv_h)
3961 GEN_VEXT_VV_ENV(vfmax_vv_w)
3962 GEN_VEXT_VV_ENV(vfmax_vv_d)
3963 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3964 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3965 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3966 GEN_VEXT_VF(vfmax_vf_h)
3967 GEN_VEXT_VF(vfmax_vf_w)
3968 GEN_VEXT_VF(vfmax_vf_d)
3969 
3970 /* Vector Floating-Point Sign-Injection Instructions */
3971 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3972 {
3973     return deposit64(b, 0, 15, a);
3974 }
3975 
3976 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3977 {
3978     return deposit64(b, 0, 31, a);
3979 }
3980 
3981 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3982 {
3983     return deposit64(b, 0, 63, a);
3984 }
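
/*
 * Each sign-injection helper keeps the magnitude bits of a (the vs2
 * element) and takes the sign bit from b, from ~b, or from a ^ b for the
 * sgnj, sgnjn and sgnjx variants respectively.
 */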
3985 
3986 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3987 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3988 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3989 GEN_VEXT_VV_ENV(vfsgnj_vv_h)
3990 GEN_VEXT_VV_ENV(vfsgnj_vv_w)
3991 GEN_VEXT_VV_ENV(vfsgnj_vv_d)
3992 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3993 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3994 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3995 GEN_VEXT_VF(vfsgnj_vf_h)
3996 GEN_VEXT_VF(vfsgnj_vf_w)
3997 GEN_VEXT_VF(vfsgnj_vf_d)
3998 
3999 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4000 {
4001     return deposit64(~b, 0, 15, a);
4002 }
4003 
4004 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4005 {
4006     return deposit64(~b, 0, 31, a);
4007 }
4008 
4009 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4010 {
4011     return deposit64(~b, 0, 63, a);
4012 }
4013 
4014 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4015 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4016 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4017 GEN_VEXT_VV_ENV(vfsgnjn_vv_h)
4018 GEN_VEXT_VV_ENV(vfsgnjn_vv_w)
4019 GEN_VEXT_VV_ENV(vfsgnjn_vv_d)
4020 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4021 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4022 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4023 GEN_VEXT_VF(vfsgnjn_vf_h)
4024 GEN_VEXT_VF(vfsgnjn_vf_w)
4025 GEN_VEXT_VF(vfsgnjn_vf_d)
4026 
4027 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4028 {
4029     return deposit64(b ^ a, 0, 15, a);
4030 }
4031 
4032 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4033 {
4034     return deposit64(b ^ a, 0, 31, a);
4035 }
4036 
4037 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4038 {
4039     return deposit64(b ^ a, 0, 63, a);
4040 }
4041 
4042 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4043 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4044 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4045 GEN_VEXT_VV_ENV(vfsgnjx_vv_h)
4046 GEN_VEXT_VV_ENV(vfsgnjx_vv_w)
4047 GEN_VEXT_VV_ENV(vfsgnjx_vv_d)
4048 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4049 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4050 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4051 GEN_VEXT_VF(vfsgnjx_vf_h)
4052 GEN_VEXT_VF(vfsgnjx_vf_w)
4053 GEN_VEXT_VF(vfsgnjx_vf_d)
4054 
4055 /* Vector Floating-Point Compare Instructions */
4056 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4057 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4058                   CPURISCVState *env, uint32_t desc)          \
4059 {                                                             \
4060     uint32_t vm = vext_vm(desc);                              \
4061     uint32_t vl = env->vl;                                    \
4062     uint32_t i;                                               \
4063                                                               \
4064     for (i = env->vstart; i < vl; i++) {                      \
4065         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4066         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4067         if (!vm && !vext_elem_mask(v0, i)) {                  \
4068             continue;                                         \
4069         }                                                     \
4070         vext_set_elem_mask(vd, i,                             \
4071                            DO_OP(s2, s1, &env->fp_status));   \
4072     }                                                         \
4073     env->vstart = 0;                                          \
4074 }
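
/*
 * Compare results are written as single bits into the mask destination;
 * elements masked off by v0 are skipped, so their old mask bits in vd are
 * left unchanged.
 */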
4075 
4076 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4077 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4078 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4079 
4080 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4081 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4082                   CPURISCVState *env, uint32_t desc)                \
4083 {                                                                   \
4084     uint32_t vm = vext_vm(desc);                                    \
4085     uint32_t vl = env->vl;                                          \
4086     uint32_t i;                                                     \
4087                                                                     \
4088     for (i = env->vstart; i < vl; i++) {                            \
4089         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4090         if (!vm && !vext_elem_mask(v0, i)) {                        \
4091             continue;                                               \
4092         }                                                           \
4093         vext_set_elem_mask(vd, i,                                   \
4094                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4095     }                                                               \
4096     env->vstart = 0;                                                \
4097 }
4098 
4099 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4100 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4101 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4102 
4103 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4104 {
4105     FloatRelation compare = float16_compare_quiet(a, b, s);
4106     return compare != float_relation_equal;
4107 }
4108 
4109 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4110 {
4111     FloatRelation compare = float32_compare_quiet(a, b, s);
4112     return compare != float_relation_equal;
4113 }
4114 
4115 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4116 {
4117     FloatRelation compare = float64_compare_quiet(a, b, s);
4118     return compare != float_relation_equal;
4119 }
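
/*
 * vmfne uses a quiet compare so that an unordered result (a NaN operand)
 * reports "not equal" without raising the invalid flag.
 */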
4120 
4121 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4122 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4123 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4124 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4125 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4126 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
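/*
 * Note that vmfne is a quiet compare that treats the unordered result as
 * "not equal": with a NaN operand the mask bit is set to 1, and only a
 * signaling NaN raises the invalid flag.  For example (illustrative),
 * vmfne16() returns true when its first operand is the float16 quiet NaN
 * 0x7e00, whatever the second operand is.
 */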
4127 
4128 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4129 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4130 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4131 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4132 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4133 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4134 
4135 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4136 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4137 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4138 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4139 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4140 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4141 
4142 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4143 {
4144     FloatRelation compare = float16_compare(a, b, s);
4145     return compare == float_relation_greater;
4146 }
4147 
4148 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4149 {
4150     FloatRelation compare = float32_compare(a, b, s);
4151     return compare == float_relation_greater;
4152 }
4153 
4154 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4155 {
4156     FloatRelation compare = float64_compare(a, b, s);
4157     return compare == float_relation_greater;
4158 }
4159 
4160 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4161 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4162 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4163 
4164 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4165 {
4166     FloatRelation compare = float16_compare(a, b, s);
4167     return compare == float_relation_greater ||
4168            compare == float_relation_equal;
4169 }
4170 
4171 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4172 {
4173     FloatRelation compare = float32_compare(a, b, s);
4174     return compare == float_relation_greater ||
4175            compare == float_relation_equal;
4176 }
4177 
4178 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4179 {
4180     FloatRelation compare = float64_compare(a, b, s);
4181     return compare == float_relation_greater ||
4182            compare == float_relation_equal;
4183 }
4184 
4185 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4186 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4187 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
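/*
 * vmfgt and vmfge (like vmflt and vmfle above) use the signaling compare
 * helpers, so any NaN operand raises the invalid flag and produces a 0
 * mask bit, whereas vmfeq and vmfne use the quiet variants.  This matches
 * the usual IEEE 754 split between the ordered relational predicates and
 * quiet (in)equality.
 */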
4188 
4189 /* Vector Floating-Point Classify Instruction */
4190 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4191 static void do_##NAME(void *vd, void *vs2, int i)      \
4192 {                                                      \
4193     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4194     *((TD *)vd + HD(i)) = OP(s2);                      \
4195 }
4196 
4197 #define GEN_VEXT_V(NAME)                               \
4198 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4199                   CPURISCVState *env, uint32_t desc)   \
4200 {                                                      \
4201     uint32_t vm = vext_vm(desc);                       \
4202     uint32_t vl = env->vl;                             \
4203     uint32_t i;                                        \
4204                                                        \
4205     for (i = env->vstart; i < vl; i++) {               \
4206         if (!vm && !vext_elem_mask(v0, i)) {           \
4207             continue;                                  \
4208         }                                              \
4209         do_##NAME(vd, vs2, i);                         \
4210     }                                                  \
4211     env->vstart = 0;                                   \
4212 }
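/*
 * Illustrative sketch of how these macros compose, assuming RVVCALL simply
 * forwards its arguments to the OP macro and OP_UU_H supplies uint16_t for
 * the element types (both are defined earlier in this file):
 *
 *   RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
 *
 * yields roughly the per-element worker
 *
 *   static void do_vfclass_v_h(void *vd, void *vs2, int i)
 *   {
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = fclass_h(s2);
 *   }
 *
 * and GEN_VEXT_V(vfclass_v_h) then wraps do_vfclass_v_h() in the masked
 * loop from vstart to vl above.
 */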
4213 
4214 target_ulong fclass_h(uint64_t frs1)
4215 {
4216     float16 f = frs1;
4217     bool sign = float16_is_neg(f);
4218 
4219     if (float16_is_infinity(f)) {
4220         return sign ? 1 << 0 : 1 << 7;
4221     } else if (float16_is_zero(f)) {
4222         return sign ? 1 << 3 : 1 << 4;
4223     } else if (float16_is_zero_or_denormal(f)) {
4224         return sign ? 1 << 2 : 1 << 5;
4225     } else if (float16_is_any_nan(f)) {
4226         float_status s = { }; /* for snan_bit_is_one */
4227         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4228     } else {
4229         return sign ? 1 << 1 : 1 << 6;
4230     }
4231 }
4232 
4233 target_ulong fclass_s(uint64_t frs1)
4234 {
4235     float32 f = frs1;
4236     bool sign = float32_is_neg(f);
4237 
4238     if (float32_is_infinity(f)) {
4239         return sign ? 1 << 0 : 1 << 7;
4240     } else if (float32_is_zero(f)) {
4241         return sign ? 1 << 3 : 1 << 4;
4242     } else if (float32_is_zero_or_denormal(f)) {
4243         return sign ? 1 << 2 : 1 << 5;
4244     } else if (float32_is_any_nan(f)) {
4245         float_status s = { }; /* for snan_bit_is_one */
4246         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4247     } else {
4248         return sign ? 1 << 1 : 1 << 6;
4249     }
4250 }
4251 
4252 target_ulong fclass_d(uint64_t frs1)
4253 {
4254     float64 f = frs1;
4255     bool sign = float64_is_neg(f);
4256 
4257     if (float64_is_infinity(f)) {
4258         return sign ? 1 << 0 : 1 << 7;
4259     } else if (float64_is_zero(f)) {
4260         return sign ? 1 << 3 : 1 << 4;
4261     } else if (float64_is_zero_or_denormal(f)) {
4262         return sign ? 1 << 2 : 1 << 5;
4263     } else if (float64_is_any_nan(f)) {
4264         float_status s = { }; /* for snan_bit_is_one */
4265         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4266     } else {
4267         return sign ? 1 << 1 : 1 << 6;
4268     }
4269 }
4270 
4271 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4272 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4273 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4274 GEN_VEXT_V(vfclass_v_h)
4275 GEN_VEXT_V(vfclass_v_w)
4276 GEN_VEXT_V(vfclass_v_d)
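/*
 * The ten result bits produced above follow the standard fclass encoding:
 *   bit 0: negative infinity        bit 5: positive subnormal
 *   bit 1: negative normal          bit 6: positive normal
 *   bit 2: negative subnormal       bit 7: positive infinity
 *   bit 3: negative zero            bit 8: signaling NaN
 *   bit 4: positive zero            bit 9: quiet NaN
 * For example (illustrative), fclass_s(0xff800000) (single-precision -inf)
 * returns 1 << 0, and fclass_s(0x7fc00000) (a quiet NaN) returns 1 << 9.
 */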
4277 
4278 /* Vector Floating-Point Merge Instruction */
4279 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4280 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4281                   CPURISCVState *env, uint32_t desc)          \
4282 {                                                             \
4283     uint32_t vm = vext_vm(desc);                              \
4284     uint32_t vl = env->vl;                                    \
4285     uint32_t i;                                               \
4286                                                               \
4287     for (i = env->vstart; i < vl; i++) {                      \
4288         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4289         *((ETYPE *)vd + H(i))                                 \
4290           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4291     }                                                         \
4292     env->vstart = 0;                                          \
4293 }
4294 
4295 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4296 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4297 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4298 
4299 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4300 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4301 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4302 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4303 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4304 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h)
4305 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w)
4306 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d)
4307 
4308 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4309 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4310 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4311 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4312 GEN_VEXT_V_ENV(vfcvt_x_f_v_h)
4313 GEN_VEXT_V_ENV(vfcvt_x_f_v_w)
4314 GEN_VEXT_V_ENV(vfcvt_x_f_v_d)
4315 
4316 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4317 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4318 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4319 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4320 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h)
4321 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w)
4322 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d)
4323 
4324 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4325 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4326 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4327 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4328 GEN_VEXT_V_ENV(vfcvt_f_x_v_h)
4329 GEN_VEXT_V_ENV(vfcvt_f_x_v_w)
4330 GEN_VEXT_V_ENV(vfcvt_f_x_v_d)
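/*
 * These conversions round according to env->fp_status, which the OPFVV1
 * wrapper (defined earlier) passes to each softfloat conversion function
 * and which the translator is assumed to have loaded from frm.  For
 * example (illustrative), vfcvt.x.f.v on the float16 value 2.5 under
 * round-to-nearest-even produces the integer 2.
 */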
4331 
4332 /* Widening Floating-Point/Integer Type-Convert Instructions */
4333 /* (TD, T2, TX2) */
4334 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4335 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4336 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4337 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4338 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4339 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4340 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h)
4341 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w)
4342 
4343 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4344 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4345 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4346 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h)
4347 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w)
4348 
4349 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4350 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4351 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4352 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4353 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b)
4354 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h)
4355 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w)
4356 
4357 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4358 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4359 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4360 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4361 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b)
4362 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h)
4363 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w)
4364 
4365 /*
4366  * vfwcvt.f.f.v vd, vs2, vm
4367  * Convert single-width float to double-width float.
4368  */
4369 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4370 {
4371     return float16_to_float32(a, true, s);
4372 }
4373 
4374 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4375 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4376 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h)
4377 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w)
4378 
4379 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4380 /* (TD, T2, TX2) */
4381 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4382 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4383 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4384 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4385 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4386 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4387 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4388 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b)
4389 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h)
4390 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w)
4391 
4392 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4393 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4394 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4395 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4396 GEN_VEXT_V_ENV(vfncvt_x_f_w_b)
4397 GEN_VEXT_V_ENV(vfncvt_x_f_w_h)
4398 GEN_VEXT_V_ENV(vfncvt_x_f_w_w)
4399 
4400 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4401 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4402 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4403 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h)
4404 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w)
4405 
4406 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4407 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4408 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4409 GEN_VEXT_V_ENV(vfncvt_f_x_w_h)
4410 GEN_VEXT_V_ENV(vfncvt_f_x_w_w)
4411 
4412 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4413 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4414 {
4415     return float32_to_float16(a, true, s);
4416 }
4417 
4418 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4419 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4420 GEN_VEXT_V_ENV(vfncvt_f_f_w_h)
4421 GEN_VEXT_V_ENV(vfncvt_f_f_w_w)
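/*
 * Example (illustrative): vfncvt.f.f.w narrows the float32 value 1.0 to
 * the float16 value 1.0 exactly, while a float32 value above the float16
 * range, such as 65536.0, overflows to infinity under the default
 * rounding mode and raises the overflow and inexact flags.
 */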
4422 
4423 /*
4424  *** Vector Reduction Operations
4425  */
4426 /* Vector Single-Width Integer Reduction Instructions */
4427 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4428 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4429         void *vs2, CPURISCVState *env, uint32_t desc)     \
4430 {                                                         \
4431     uint32_t vm = vext_vm(desc);                          \
4432     uint32_t vl = env->vl;                                \
4433     uint32_t i;                                           \
4434     TD s1 =  *((TD *)vs1 + HD(0));                        \
4435                                                           \
4436     for (i = env->vstart; i < vl; i++) {                  \
4437         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4438         if (!vm && !vext_elem_mask(v0, i)) {              \
4439             continue;                                     \
4440         }                                                 \
4441         s1 = OP(s1, (TD)s2);                              \
4442     }                                                     \
4443     *((TD *)vd + HD(0)) = s1;                             \
4444     env->vstart = 0;                                      \
4445 }
4446 
4447 /* vd[0] = sum(vs1[0], vs2[*]) */
4448 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4449 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4450 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4451 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
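/*
 * Worked example (illustrative): for vredsum.vs with vl = 4, all elements
 * active, vs1[0] = 10 and vs2 = {1, 2, 3, 4}, the loop above accumulates
 * s1 = 10 + 1 + 2 + 3 + 4 = 20 and writes it to vd[0]; this helper never
 * touches vd[1] and upwards.
 */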
4452 
4453 /* vd[0] = maxu(vs1[0], vs2[*]) */
4454 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4455 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4456 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4457 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4458 
4459 /* vd[0] = max(vs1[0], vs2[*]) */
4460 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4461 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4462 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4463 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4464 
4465 /* vd[0] = minu(vs1[0], vs2[*]) */
4466 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4467 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4468 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4469 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4470 
4471 /* vd[0] = min(vs1[0], vs2[*]) */
4472 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4473 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4474 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4475 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4476 
4477 /* vd[0] = and(vs1[0], vs2[*]) */
4478 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4479 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4480 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4481 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4482 
4483 /* vd[0] = or(vs1[0], vs2[*]) */
4484 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4485 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4486 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4487 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4488 
4489 /* vd[0] = xor(vs1[0], vs2[*]) */
4490 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4491 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4492 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4493 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4494 
4495 /* Vector Widening Integer Reduction Instructions */
4496 /* signed sum reduction into double-width accumulator */
4497 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4498 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4499 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4500 
4501 /* Unsigned sum reduction into double-width accumulator */
4502 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4503 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4504 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4505 
4506 /* Vector Single-Width Floating-Point Reduction Instructions */
4507 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4508 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4509                   void *vs2, CPURISCVState *env,           \
4510                   uint32_t desc)                           \
4511 {                                                          \
4512     uint32_t vm = vext_vm(desc);                           \
4513     uint32_t vl = env->vl;                                 \
4514     uint32_t i;                                            \
4515     TD s1 =  *((TD *)vs1 + HD(0));                         \
4516                                                            \
4517     for (i = env->vstart; i < vl; i++) {                   \
4518         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4519         if (!vm && !vext_elem_mask(v0, i)) {               \
4520             continue;                                      \
4521         }                                                  \
4522         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4523     }                                                      \
4524     *((TD *)vd + HD(0)) = s1;                              \
4525     env->vstart = 0;                                       \
4526 }
4527 
4528 /* Unordered sum */
4529 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4530 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4531 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4532 
4533 /* Maximum value */
4534 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4535 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4536 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4537 
4538 /* Minimum value */
4539 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4540 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4541 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4542 
4543 /* Vector Widening Floating-Point Reduction Instructions */
4544 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4545 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4546                             void *vs2, CPURISCVState *env, uint32_t desc)
4547 {
4548     uint32_t vm = vext_vm(desc);
4549     uint32_t vl = env->vl;
4550     uint32_t i;
4551     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4552 
4553     for (i = env->vstart; i < vl; i++) {
4554         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4555         if (!vm && !vext_elem_mask(v0, i)) {
4556             continue;
4557         }
4558         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4559                          &env->fp_status);
4560     }
4561     *((uint32_t *)vd + H4(0)) = s1;
4562     env->vstart = 0;
4563 }
4564 
4565 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4566                             void *vs2, CPURISCVState *env, uint32_t desc)
4567 {
4568     uint32_t vm = vext_vm(desc);
4569     uint32_t vl = env->vl;
4570     uint32_t i;
4571     uint64_t s1 =  *((uint64_t *)vs1);
4572 
4573     for (i = env->vstart; i < vl; i++) {
4574         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4575         if (!vm && !vext_elem_mask(v0, i)) {
4576             continue;
4577         }
4578         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4579                          &env->fp_status);
4580     }
4581     *((uint64_t *)vd) = s1;
4582     env->vstart = 0;
4583 }
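/*
 * In both widening reductions above the promotion step is exact (every
 * float16 value is representable as a float32, and every float32 as a
 * float64), e.g. the float16 value 1.5 (0x3e00) promotes to the float32
 * value 1.5 (0x3fc00000); rounding therefore only occurs in the
 * double-width additions.
 */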
4584 
4585 /*
4586  *** Vector Mask Operations
4587  */
4588 /* Vector Mask-Register Logical Instructions */
4589 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4590 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4591                   void *vs2, CPURISCVState *env,          \
4592                   uint32_t desc)                          \
4593 {                                                         \
4594     uint32_t vl = env->vl;                                \
4595     uint32_t i;                                           \
4596     int a, b;                                             \
4597                                                           \
4598     for (i = env->vstart; i < vl; i++) {                  \
4599         a = vext_elem_mask(vs1, i);                       \
4600         b = vext_elem_mask(vs2, i);                       \
4601         vext_set_elem_mask(vd, i, OP(b, a));              \
4602     }                                                     \
4603     env->vstart = 0;                                      \
4604 }
4605 
4606 #define DO_NAND(N, M)  (!(N & M))
4607 #define DO_ANDNOT(N, M)  (N & !M)
4608 #define DO_NOR(N, M)  (!(N | M))
4609 #define DO_ORNOT(N, M)  (N | !M)
4610 #define DO_XNOR(N, M)  (!(N ^ M))
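/*
 * The N and M operands here are single mask bits (0 or 1) as returned by
 * vext_elem_mask(), so mixing bitwise and logical operators is safe:
 * e.g. DO_ANDNOT(1, 0) is (1 & !0) == 1 and DO_XNOR(1, 1) is
 * (!(1 ^ 1)) == 1.
 */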
4611 
4612 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4613 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4614 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4615 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4616 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4617 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4618 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4619 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4620 
4621 /* Vector count population in mask vcpop */
4622 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4623                              uint32_t desc)
4624 {
4625     target_ulong cnt = 0;
4626     uint32_t vm = vext_vm(desc);
4627     uint32_t vl = env->vl;
4628     int i;
4629 
4630     for (i = env->vstart; i < vl; i++) {
4631         if (vm || vext_elem_mask(v0, i)) {
4632             if (vext_elem_mask(vs2, i)) {
4633                 cnt++;
4634             }
4635         }
4636     }
4637     env->vstart = 0;
4638     return cnt;
4639 }
4640 
4641 /* vfirst find-first-set mask bit */
4642 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4643                               uint32_t desc)
4644 {
4645     uint32_t vm = vext_vm(desc);
4646     uint32_t vl = env->vl;
4647     int i;
4648 
4649     for (i = env->vstart; i < vl; i++) {
4650         if (vm || vext_elem_mask(v0, i)) {
4651             if (vext_elem_mask(vs2, i)) {
4652                 return i;
4653             }
4654         }
4655     }
4656     env->vstart = 0;
4657     return -1LL;
4658 }
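/*
 * Example (illustrative): with vl = 4, vm = 0, mask v0 = {1, 1, 0, 1} and
 * vs2 = {0, 1, 1, 1}, vcpop.m counts only the active set bits at indices
 * 1 and 3 (element 2 is masked off) and returns 2, while vfirst.m returns
 * 1, the index of the first active set bit.
 */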
4659 
4660 enum set_mask_type {
4661     ONLY_FIRST = 1,
4662     INCLUDE_FIRST,
4663     BEFORE_FIRST,
4664 };
4665 
4666 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4667                    uint32_t desc, enum set_mask_type type)
4668 {
4669     uint32_t vm = vext_vm(desc);
4670     uint32_t vl = env->vl;
4671     int i;
4672     bool first_mask_bit = false;
4673 
4674     for (i = env->vstart; i < vl; i++) {
4675         if (!vm && !vext_elem_mask(v0, i)) {
4676             continue;
4677         }
4678         /* write a zero to all following active elements */
4679         if (first_mask_bit) {
4680             vext_set_elem_mask(vd, i, 0);
4681             continue;
4682         }
4683         if (vext_elem_mask(vs2, i)) {
4684             first_mask_bit = true;
4685             if (type == BEFORE_FIRST) {
4686                 vext_set_elem_mask(vd, i, 0);
4687             } else {
4688                 vext_set_elem_mask(vd, i, 1);
4689             }
4690         } else {
4691             if (type == ONLY_FIRST) {
4692                 vext_set_elem_mask(vd, i, 0);
4693             } else {
4694                 vext_set_elem_mask(vd, i, 1);
4695             }
4696         }
4697     }
4698     env->vstart = 0;
4699 }
4700 
4701 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4702                      uint32_t desc)
4703 {
4704     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4705 }
4706 
4707 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4708                      uint32_t desc)
4709 {
4710     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4711 }
4712 
4713 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4714                      uint32_t desc)
4715 {
4716     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4717 }
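/*
 * Worked example (illustrative): with all elements active and source mask
 * vs2 = {0, 0, 1, 0, 1, ...}, the first set bit is at index 2, so:
 *   vmsbf.m writes {1, 1, 0, 0, 0, ...}  (set-before-first)
 *   vmsif.m writes {1, 1, 1, 0, 0, ...}  (set-including-first)
 *   vmsof.m writes {0, 0, 1, 0, 0, ...}  (set-only-first)
 */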
4718 
4719 /* Vector Iota Instruction */
4720 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4721 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4722                   uint32_t desc)                                          \
4723 {                                                                         \
4724     uint32_t vm = vext_vm(desc);                                          \
4725     uint32_t vl = env->vl;                                                \
4726     uint32_t sum = 0;                                                     \
4727     int i;                                                                \
4728                                                                           \
4729     for (i = env->vstart; i < vl; i++) {                                  \
4730         if (!vm && !vext_elem_mask(v0, i)) {                              \
4731             continue;                                                     \
4732         }                                                                 \
4733         *((ETYPE *)vd + H(i)) = sum;                                      \
4734         if (vext_elem_mask(vs2, i)) {                                     \
4735             sum++;                                                        \
4736         }                                                                 \
4737     }                                                                     \
4738     env->vstart = 0;                                                      \
4739 }
4740 
4741 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4742 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4743 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4744 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
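/*
 * Worked example (illustrative): with all elements active and source mask
 * vs2 = {1, 0, 1, 1, 0}, viota.m writes the running count of set bits seen
 * so far, i.e. vd = {0, 1, 1, 2, 3}.
 */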
4745 
4746 /* Vector Element Index Instruction */
4747 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4748 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4749 {                                                                         \
4750     uint32_t vm = vext_vm(desc);                                          \
4751     uint32_t vl = env->vl;                                                \
4752     int i;                                                                \
4753                                                                           \
4754     for (i = env->vstart; i < vl; i++) {                                  \
4755         if (!vm && !vext_elem_mask(v0, i)) {                              \
4756             continue;                                                     \
4757         }                                                                 \
4758         *((ETYPE *)vd + H(i)) = i;                                        \
4759     }                                                                     \
4760     env->vstart = 0;                                                      \
4761 }
4762 
4763 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4764 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4765 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4766 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4767 
4768 /*
4769  *** Vector Permutation Instructions
4770  */
4771 
4772 /* Vector Slide Instructions */
4773 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4774 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4775                   CPURISCVState *env, uint32_t desc)                      \
4776 {                                                                         \
4777     uint32_t vm = vext_vm(desc);                                          \
4778     uint32_t vl = env->vl;                                                \
4779     target_ulong offset = s1, i_min, i;                                   \
4780                                                                           \
4781     i_min = MAX(env->vstart, offset);                                     \
4782     for (i = i_min; i < vl; i++) {                                        \
4783         if (!vm && !vext_elem_mask(v0, i)) {                              \
4784             continue;                                                     \
4785         }                                                                 \
4786         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4787     }                                                                     \
    env->vstart = 0;                                                      \
4788 }
4789 
4790 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4791 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4792 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4793 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4794 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
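/*
 * Example (illustrative): with offset x[rs1] = 2, vl = 5 and all elements
 * active, the loop above writes vd[2] = vs2[0], vd[3] = vs2[1] and
 * vd[4] = vs2[2]; destination elements below the offset (vd[0], vd[1])
 * are not written.
 */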
4795 
4796 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4797 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4798                   CPURISCVState *env, uint32_t desc)                      \
4799 {                                                                         \
4800     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4801     uint32_t vm = vext_vm(desc);                                          \
4802     uint32_t vl = env->vl;                                                \
4803     target_ulong i_max, i;                                                \
4804                                                                           \
4805     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4806     for (i = env->vstart; i < i_max; ++i) {                               \
4807         if (vm || vext_elem_mask(v0, i)) {                                \
4808             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4809         }                                                                 \
4810     }                                                                     \
4811                                                                           \
4812     for (i = i_max; i < vl; ++i) {                                        \
4813         if (vm || vext_elem_mask(v0, i)) {                                \
4814             *((ETYPE *)vd + H(i)) = 0;                                    \
4815         }                                                                 \
4816     }                                                                     \
4817                                                                           \
4818     env->vstart = 0;                                                      \
4819 }
4820 
4821 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4822 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4823 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4824 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4825 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
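/*
 * Example (illustrative): with x[rs1] = 2, vl = 5, vlmax = 8 and all
 * elements active, i_max is MIN(vlmax - 2, vl) = 5, so vd[0..4] are
 * written with vs2[2..6]; had vlmax been 4, i_max would be 2 and
 * vd[2..4] would be written as 0.
 */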
4826 
4827 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
4828 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4829                      void *vs2, CPURISCVState *env, uint32_t desc)          \
4830 {                                                                           \
4831     typedef uint##BITWIDTH##_t ETYPE;                                       \
4832     uint32_t vm = vext_vm(desc);                                            \
4833     uint32_t vl = env->vl;                                                  \
4834     uint32_t i;                                                             \
4835                                                                             \
4836     for (i = env->vstart; i < vl; i++) {                                    \
4837         if (!vm && !vext_elem_mask(v0, i)) {                                \
4838             continue;                                                       \
4839         }                                                                   \
4840         if (i == 0) {                                                       \
4841             *((ETYPE *)vd + H(i)) = s1;                                     \
4842         } else {                                                            \
4843             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4844         }                                                                   \
4845     }                                                                       \
4846     env->vstart = 0;                                                        \
4847 }
4848 
4849 GEN_VEXT_VSLIE1UP(8,  H1)
4850 GEN_VEXT_VSLIE1UP(16, H2)
4851 GEN_VEXT_VSLIE1UP(32, H4)
4852 GEN_VEXT_VSLIE1UP(64, H8)
4853 
4854 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4855 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4856                   CPURISCVState *env, uint32_t desc)              \
4857 {                                                                 \
4858     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4859 }
4860 
4861 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4862 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4863 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4864 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4865 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4866 
4867 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4868 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4869                        void *vs2, CPURISCVState *env, uint32_t desc)          \
4870 {                                                                             \
4871     typedef uint##BITWIDTH##_t ETYPE;                                         \
4872     uint32_t vm = vext_vm(desc);                                              \
4873     uint32_t vl = env->vl;                                                    \
4874     uint32_t i;                                                               \
4875                                                                               \
4876     for (i = env->vstart; i < vl; i++) {                                      \
4877         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4878             continue;                                                         \
4879         }                                                                     \
4880         if (i == vl - 1) {                                                    \
4881             *((ETYPE *)vd + H(i)) = s1;                                       \
4882         } else {                                                              \
4883             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4884         }                                                                     \
4885     }                                                                         \
4886     env->vstart = 0;                                                          \
4887 }
4888 
4889 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4890 GEN_VEXT_VSLIDE1DOWN(16, H2)
4891 GEN_VEXT_VSLIDE1DOWN(32, H4)
4892 GEN_VEXT_VSLIDE1DOWN(64, H8)
4893 
4894 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4895 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4896                   CPURISCVState *env, uint32_t desc)              \
4897 {                                                                 \
4898     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4899 }
4900 
4901 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4902 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4903 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4904 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4905 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4906 
4907 /* Vector Floating-Point Slide Instructions */
4908 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4909 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4910                   CPURISCVState *env, uint32_t desc)          \
4911 {                                                             \
4912     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4913 }
4914 
4915 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4916 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4917 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4918 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4919 
4920 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4921 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4922                   CPURISCVState *env, uint32_t desc)          \
4923 {                                                             \
4924     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4925 }
4926 
4927 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4928 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4929 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4930 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4931 
4932 /* Vector Register Gather Instruction */
4933 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4934 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4935                   CPURISCVState *env, uint32_t desc)                      \
4936 {                                                                         \
4937     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4938     uint32_t vm = vext_vm(desc);                                          \
4939     uint32_t vl = env->vl;                                                \
4940     uint64_t index;                                                       \
4941     uint32_t i;                                                           \
4942                                                                           \
4943     for (i = env->vstart; i < vl; i++) {                                  \
4944         if (!vm && !vext_elem_mask(v0, i)) {                              \
4945             continue;                                                     \
4946         }                                                                 \
4947         index = *((TS1 *)vs1 + HS1(i));                                   \
4948         if (index >= vlmax) {                                             \
4949             *((TS2 *)vd + HS2(i)) = 0;                                    \
4950         } else {                                                          \
4951             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
4952         }                                                                 \
4953     }                                                                     \
4954     env->vstart = 0;                                                      \
4955 }
4956 
4957 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
4958 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
4959 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
4960 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
4961 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
4962 
4963 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
4964 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
4965 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
4966 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
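/*
 * Example (illustrative): with vlmax = 4, index vector vs1 = {3, 0, 2, 9}
 * and vs2 = {a, b, c, d}, vrgather.vv writes vd = {d, a, c, 0}; the
 * out-of-range index 9 selects 0.
 */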
4967 
4968 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
4969 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4970                   CPURISCVState *env, uint32_t desc)                      \
4971 {                                                                         \
4972     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4973     uint32_t vm = vext_vm(desc);                                          \
4974     uint32_t vl = env->vl;                                                \
4975     uint64_t index = s1;                                                  \
4976     uint32_t i;                                                           \
4977                                                                           \
4978     for (i = env->vstart; i < vl; i++) {                                  \
4979         if (!vm && !vext_elem_mask(v0, i)) {                              \
4980             continue;                                                     \
4981         }                                                                 \
4982         if (index >= vlmax) {                                             \
4983             *((ETYPE *)vd + H(i)) = 0;                                    \
4984         } else {                                                          \
4985             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
4986         }                                                                 \
4987     }                                                                     \
4988     env->vstart = 0;                                                      \
4989 }
4990 
4991 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
4992 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
4993 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
4994 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
4995 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
4996 
4997 /* Vector Compress Instruction */
4998 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
4999 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5000                   CPURISCVState *env, uint32_t desc)                      \
5001 {                                                                         \
5002     uint32_t vl = env->vl;                                                \
5003     uint32_t num = 0, i;                                                  \
5004                                                                           \
5005     for (i = env->vstart; i < vl; i++) {                                  \
5006         if (!vext_elem_mask(vs1, i)) {                                    \
5007             continue;                                                     \
5008         }                                                                 \
5009         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5010         num++;                                                            \
5011     }                                                                     \
5012     env->vstart = 0;                                                      \
5013 }
5014 
5015 /* Compress into vd elements of vs2 where vs1 is enabled */
5016 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5017 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5018 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5019 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
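/*
 * Worked example (illustrative): with vl = 5, mask vs1 = {1, 0, 1, 1, 0}
 * and vs2 = {a, b, c, d, e}, the loop above packs the selected elements
 * into vd = {a, c, d}; destination elements beyond the packed count are
 * not written by this helper.
 */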
5020 
5021 /* Vector Whole Register Move */
5022 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5023 {
5024     /* EEW = SEW */
5025     uint32_t maxsz = simd_maxsz(desc);
5026     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5027     uint32_t startb = env->vstart * sewb;
5028     uint32_t i = startb;
5029 
5030     memcpy((uint8_t *)vd + H1(i),
5031            (uint8_t *)vs2 + H1(i),
5032            maxsz - startb);
5033 
5034     env->vstart = 0;
5035 }
5036 
5037 /* Vector Integer Extension */
5038 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5039 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5040                   CPURISCVState *env, uint32_t desc)             \
5041 {                                                                \
5042     uint32_t vl = env->vl;                                       \
5043     uint32_t vm = vext_vm(desc);                                 \
5044     uint32_t i;                                                  \
5045                                                                  \
5046     for (i = env->vstart; i < vl; i++) {                         \
5047         if (!vm && !vext_elem_mask(v0, i)) {                     \
5048             continue;                                            \
5049         }                                                        \
5050         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5051     }                                                            \
5052     env->vstart = 0;                                             \
5053 }
5054 
5055 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5056 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5057 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5058 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5059 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5060 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5061 
5062 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5063 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5064 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5065 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5066 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5067 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
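/*
 * Example (illustrative): for a source byte 0xff, vzext.vf2 zero-extends
 * it to the uint16_t value 0x00ff, while vsext.vf2 sign-extends the same
 * bit pattern (int8_t -1) to the int16_t value -1, i.e. 0xffff.
 */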
5068