xref: /openbmc/qemu/target/riscv/vector_helper.c (revision bce9a636)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
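
/*
 * Illustrative example of the clamp above (numbers chosen for
 * concreteness, not taken from this file): with VLEN = 128, a request
 * of SEW = 32 and LMUL = 2 gives VLMAX = LMUL * VLEN / SEW = 8, so an
 * AVL (s1) of 10 is clamped to vl = 8 while an AVL of 5 returns vl = 5.
 * A set reserved vtype bit, a non-zero vediv, SEW > ELEN or an
 * unsupported fractional LMUL instead sets vill and clears vtype/vl.
 */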
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that need a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
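
/*
 * For illustration: on a big-endian host, byte element 0 of a 64-bit
 * chunk is addressed as host byte H1(0) = 0 ^ 7 = 7 and 16-bit element
 * 1 as host halfword H2(1) = 1 ^ 3 = 2, so the guest-visible element
 * order matches the little-endian layout the rest of the code assumes.
 * On little-endian hosts the H macros are the identity.
 */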
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
140 /*
141  * Get the maximum number of elements that can be operated on.
142  *
143  * log2_esz: log2 of element size in bytes.
144  */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147     /*
148      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
149      * so vlen in bytes (vlenb) is encoded as maxsz.
150      */
151     uint32_t vlenb = simd_maxsz(desc);
152 
153     /* Return VLMAX */
154     int scale = vext_lmul(desc) - log2_esz;
155     return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
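
/*
 * Illustrative numbers: with vlenb = 16 (VLEN = 128), LMUL = 1/2
 * (vext_lmul() == -1) and log2_esz = 1 (SEW = 16), scale = -1 - 1 = -2
 * and VLMAX = 16 >> 2 = 4, i.e. LMUL * VLEN / SEW.
 */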
157 
158 /*
159  * Get the total number of elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
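
/*
 * Illustrative numbers: with vlenb = 16, SEW = 32 (sew = 4) and
 * LMUL = 2, an access with esz = 4 gives emul = 2 - 2 + 1 = 1, so the
 * total is (16 << 1) / 4 = 8 elements, covering both registers of the
 * LMUL = 2 group.  A negative EMUL is clamped to 0, so the tail still
 * spans one whole register.
 */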
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks the watchpoint before the real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for watchpoint checks.
182  * In user mode, there is no watchpoint support for now.
183  *
184  * It will trigger an exception if there is no mapping in the TLB
185  * and the page table walk can't fill the TLB entry. The guest software
186  * can then return here after processing the exception, or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
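
/*
 * For illustration: a 64-byte access that starts 16 bytes before a page
 * boundary probes the last 16 bytes of the first page and then the
 * remaining 48 bytes of the next page, so a fault on either page is
 * reported by the probe itself before any data is transferred.
 */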
204 
205 /* set agnostic elements to 1s */
206 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
207                               uint32_t tot)
208 {
209     if (is_agnostic == 0) {
210         /* policy undisturbed */
211         return;
212     }
213     if (tot - cnt == 0) {
214         return;
215     }
216     memset(base + cnt, -1, tot - cnt);
217 }
218 
219 static inline void vext_set_elem_mask(void *v0, int index,
220                                       uint8_t value)
221 {
222     int idx = index / 64;
223     int pos = index % 64;
224     uint64_t old = ((uint64_t *)v0)[idx];
225     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227 
228 /*
229  * Earlier designs (pre-0.9) had a varying number of bits
230  * per mask value (MLEN). In the 0.9 design, MLEN=1.
231  * (Section 4.5)
232  */
233 static inline int vext_elem_mask(void *v0, int index)
234 {
235     int idx = index / 64;
236     int pos = index % 64;
237     return (((uint64_t *)v0)[idx] >> pos) & 1;
238 }
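
/*
 * For illustration: the mask bit of element 70 is bit 70 % 64 = 6 of
 * the second uint64_t chunk (70 / 64 = 1) of v0; vext_set_elem_mask()
 * updates the same bit via deposit64().
 */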
239 
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242                                uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 /*
271  *** stride: access vector element from strided memory
272  */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275                  target_ulong stride, CPURISCVState *env,
276                  uint32_t desc, uint32_t vm,
277                  vext_ldst_elem_fn *ldst_elem,
278                  uint32_t log2_esz, uintptr_t ra)
279 {
280     uint32_t i, k;
281     uint32_t nf = vext_nf(desc);
282     uint32_t max_elems = vext_max_elems(desc, log2_esz);
283     uint32_t esz = 1 << log2_esz;
284     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285     uint32_t vta = vext_vta(desc);
286     uint32_t vma = vext_vma(desc);
287 
288     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
289         k = 0;
290         while (k < nf) {
291             if (!vm && !vext_elem_mask(v0, i)) {
292                 /* set masked-off elements to 1s */
293                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
294                                   (i + k * max_elems + 1) * esz);
295                 k++;
296                 continue;
297             }
298             target_ulong addr = base + stride * i + (k << log2_esz);
299             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
300             k++;
301         }
302     }
303     env->vstart = 0;
304     /* set tail elements to 1s */
305     for (k = 0; k < nf; ++k) {
306         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
307                           (k * max_elems + max_elems) * esz);
308     }
309     if (nf * max_elems % total_elems != 0) {
310         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
311         uint32_t registers_used =
312             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
313         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
314                           registers_used * vlenb);
315     }
316 }
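
/*
 * Addressing sketch (illustrative, using the vlsseg2e32.v mnemonic for
 * concreteness): element i of field k is transferred at
 *     base + stride * i + (k << log2_esz)
 * so with nf = 2, EEW = 32 and stride = 16, field 0 of element 1 comes
 * from base + 16 and field 1 of element 1 from base + 20.
 */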
317 
318 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
319 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
320                   target_ulong stride, CPURISCVState *env,              \
321                   uint32_t desc)                                        \
322 {                                                                       \
323     uint32_t vm = vext_vm(desc);                                        \
324     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
325                      ctzl(sizeof(ETYPE)), GETPC());                     \
326 }
327 
328 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
329 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
330 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
331 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
332 
333 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
334 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
335                   target_ulong stride, CPURISCVState *env,              \
336                   uint32_t desc)                                        \
337 {                                                                       \
338     uint32_t vm = vext_vm(desc);                                        \
339     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
340                      ctzl(sizeof(ETYPE)), GETPC());                     \
341 }
342 
343 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
344 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
345 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
346 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
347 
348 /*
349  *** unit-stride: access elements stored contiguously in memory
350  */
351 
352 /* unmasked unit-stride load and store operation */
353 static void
354 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
355              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
356              uintptr_t ra)
357 {
358     uint32_t i, k;
359     uint32_t nf = vext_nf(desc);
360     uint32_t max_elems = vext_max_elems(desc, log2_esz);
361     uint32_t esz = 1 << log2_esz;
362     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
363     uint32_t vta = vext_vta(desc);
364 
365     /* load bytes from guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375     /* set tail elements to 1s */
376     for (k = 0; k < nf; ++k) {
377         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
378                           (k * max_elems + max_elems) * esz);
379     }
380     if (nf * max_elems % total_elems != 0) {
381         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
382         uint32_t registers_used =
383             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
384         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
385                           registers_used * vlenb);
386     }
387 }
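
/*
 * Addressing sketch (illustrative): unit-stride segment accesses
 * interleave the fields, so element i of field k is transferred at
 *     base + ((i * nf + k) << log2_esz)
 * e.g. for nf = 3 and EEW = 32 (vlseg3e32.v), field 2 of element 1 is
 * read from base + 20.
 */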
388 
389 /*
390  * A masked unit-stride load or store operation is handled as a special case
391  * of the strided form, with stride = NF * sizeof(MTYPE).
392  */
393 
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
396                          CPURISCVState *env, uint32_t desc)             \
397 {                                                                       \
398     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
399     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
400                      ctzl(sizeof(ETYPE)), GETPC());                     \
401 }                                                                       \
402                                                                         \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
404                   CPURISCVState *env, uint32_t desc)                    \
405 {                                                                       \
406     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
407                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
408 }
409 
410 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414 
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
417                          CPURISCVState *env, uint32_t desc)              \
418 {                                                                        \
419     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
420     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
421                      ctzl(sizeof(ETYPE)), GETPC());                      \
422 }                                                                        \
423                                                                          \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
425                   CPURISCVState *env, uint32_t desc)                     \
426 {                                                                        \
427     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
428                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
429 }
430 
431 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435 
436 /*
437  *** unit-stride mask load and store, EEW = 1
438  */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, lde_b,
445                  0, evl, GETPC());
446 }
447 
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449                     CPURISCVState *env, uint32_t desc)
450 {
451     /* evl = ceil(vl/8) */
452     uint8_t evl = (env->vl + 7) >> 3;
453     vext_ldst_us(vd, base, env, desc, ste_b,
454                  0, evl, GETPC());
455 }
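
/*
 * For illustration: with vl = 17 the mask occupies ceil(17 / 8) = 3
 * bytes, so vlm.v/vsm.v transfer exactly 3 bytes regardless of the
 * current SEW and LMUL settings.
 */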
456 
457 /*
458  *** index: access vector element from indexed memory
459  */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461         uint32_t idx, void *vs2);
462 
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
464 static target_ulong NAME(target_ulong base,            \
465                          uint32_t idx, void *vs2)      \
466 {                                                      \
467     return (base + *((ETYPE *)vs2 + H(idx)));          \
468 }
469 
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
474 
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477                 void *vs2, CPURISCVState *env, uint32_t desc,
478                 vext_get_index_addr get_index_addr,
479                 vext_ldst_elem_fn *ldst_elem,
480                 uint32_t log2_esz, uintptr_t ra)
481 {
482     uint32_t i, k;
483     uint32_t nf = vext_nf(desc);
484     uint32_t vm = vext_vm(desc);
485     uint32_t max_elems = vext_max_elems(desc, log2_esz);
486     uint32_t esz = 1 << log2_esz;
487     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
488     uint32_t vta = vext_vta(desc);
489     uint32_t vma = vext_vma(desc);
490 
491     /* load bytes from guest memory */
492     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
493         k = 0;
494         while (k < nf) {
495             if (!vm && !vext_elem_mask(v0, i)) {
496                 /* set masked-off elements to 1s */
497                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
498                                   (i + k * max_elems + 1) * esz);
499                 k++;
500                 continue;
501             }
502             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
503             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
504             k++;
505         }
506     }
507     env->vstart = 0;
508     /* set tail elements to 1s */
509     for (k = 0; k < nf; ++k) {
510         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
511                           (k * max_elems + max_elems) * esz);
512     }
513     if (nf * max_elems % total_elems != 0) {
514         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
515         uint32_t registers_used =
516             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
517         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
518                           registers_used * vlenb);
519     }
520 }
521 
522 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
523 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
524                   void *vs2, CPURISCVState *env, uint32_t desc)            \
525 {                                                                          \
526     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
527                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
528 }
529 
530 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
538 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
539 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
540 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
541 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
542 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
543 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
544 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
545 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
546 
547 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
548 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
549                   void *vs2, CPURISCVState *env, uint32_t desc)  \
550 {                                                                \
551     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
552                     STORE_FN, ctzl(sizeof(ETYPE)),               \
553                     GETPC());                                    \
554 }
555 
556 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
564 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
565 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
566 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
567 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
568 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
569 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
570 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
571 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
572 
573 /*
574  *** unit-stride fault-only-first load instructions
575  */
576 static inline void
577 vext_ldff(void *vd, void *v0, target_ulong base,
578           CPURISCVState *env, uint32_t desc,
579           vext_ldst_elem_fn *ldst_elem,
580           uint32_t log2_esz, uintptr_t ra)
581 {
582     void *host;
583     uint32_t i, k, vl = 0;
584     uint32_t nf = vext_nf(desc);
585     uint32_t vm = vext_vm(desc);
586     uint32_t max_elems = vext_max_elems(desc, log2_esz);
587     uint32_t esz = 1 << log2_esz;
588     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
589     uint32_t vta = vext_vta(desc);
590     uint32_t vma = vext_vma(desc);
591     target_ulong addr, offset, remain;
592 
593     /* probe every access */
594     for (i = env->vstart; i < env->vl; i++) {
595         if (!vm && !vext_elem_mask(v0, i)) {
596             continue;
597         }
598         addr = adjust_addr(env, base + i * (nf << log2_esz));
599         if (i == 0) {
600             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
601         } else {
602             /* if it triggers an exception, no need to check watchpoint */
603             remain = nf << log2_esz;
604             while (remain > 0) {
605                 offset = -(addr | TARGET_PAGE_MASK);
606                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
607                                          cpu_mmu_index(env, false));
608                 if (host) {
609 #ifdef CONFIG_USER_ONLY
610                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
611                         vl = i;
612                         goto ProbeSuccess;
613                     }
614 #else
615                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
616 #endif
617                 } else {
618                     vl = i;
619                     goto ProbeSuccess;
620                 }
621                 if (remain <= offset) {
622                     break;
623                 }
624                 remain -= offset;
625                 addr = adjust_addr(env, addr + offset);
626             }
627         }
628     }
629 ProbeSuccess:
630     /* load bytes from guest memory */
631     if (vl != 0) {
632         env->vl = vl;
633     }
634     for (i = env->vstart; i < env->vl; i++) {
635         k = 0;
636         while (k < nf) {
637             if (!vm && !vext_elem_mask(v0, i)) {
638                 /* set masked-off elements to 1s */
639                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
640                                   (i + k * max_elems + 1) * esz);
641                 k++;
642                 continue;
643             }
644             target_ulong addr = base + ((i * nf + k) << log2_esz);
645             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
646             k++;
647         }
648     }
649     env->vstart = 0;
650     /* set tail elements to 1s */
651     for (k = 0; k < nf; ++k) {
652         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
653                           (k * max_elems + max_elems) * esz);
654     }
655     if (nf * max_elems % total_elems != 0) {
656         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
657         uint32_t registers_used =
658             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
659         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
660                           registers_used * vlenb);
661     }
662 }
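
/*
 * Behaviour sketch: element 0 of a fault-only-first load faults
 * normally, while a translation failure on any later active element
 * only truncates vl to that index (the "vl = i; goto ProbeSuccess"
 * paths above); the load then completes for the remaining elements
 * below the new vl.
 */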
663 
664 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
665 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
666                   CPURISCVState *env, uint32_t desc)      \
667 {                                                         \
668     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
669               ctzl(sizeof(ETYPE)), GETPC());              \
670 }
671 
672 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
673 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
674 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
675 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
676 
677 #define DO_SWAP(N, M) (M)
678 #define DO_AND(N, M)  (N & M)
679 #define DO_XOR(N, M)  (N ^ M)
680 #define DO_OR(N, M)   (N | M)
681 #define DO_ADD(N, M)  (N + M)
682 
683 /* Signed min/max */
684 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
685 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
686 
687 /* Unsigned min/max */
688 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
689 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
690 
691 /*
692  *** load and store whole register instructions
693  */
694 static void
695 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
696                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
697 {
698     uint32_t i, k, off, pos;
699     uint32_t nf = vext_nf(desc);
700     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
701     uint32_t max_elems = vlenb >> log2_esz;
702 
703     k = env->vstart / max_elems;
704     off = env->vstart % max_elems;
705 
706     if (off) {
707         /* load/store the remaining elements of the segment vstart points to */
708         for (pos = off; pos < max_elems; pos++, env->vstart++) {
709             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
710             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
711         }
712         k++;
713     }
714 
715     /* load/store elements for rest of segments */
716     /* load/store elements for the rest of the segments */
717         for (i = 0; i < max_elems; i++, env->vstart++) {
718             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
719             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
720         }
721     }
722 
723     env->vstart = 0;
724 }
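
/*
 * For illustration: vl2re32_v with vlenb = 16 transfers nf = 2 whole
 * registers, i.e. 2 * 16 bytes as eight consecutive 32-bit elements
 * starting at base; a non-zero vstart resumes inside the segment it
 * points into, which is what the "off" handling above implements.
 */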
725 
726 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
727 void HELPER(NAME)(void *vd, target_ulong base,       \
728                   CPURISCVState *env, uint32_t desc) \
729 {                                                    \
730     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
731                     ctzl(sizeof(ETYPE)), GETPC());   \
732 }
733 
734 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
735 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
736 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
737 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
738 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
739 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
740 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
741 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
742 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
743 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
744 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
745 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
746 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
747 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
748 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
749 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
750 
751 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
752 void HELPER(NAME)(void *vd, target_ulong base,       \
753                   CPURISCVState *env, uint32_t desc) \
754 {                                                    \
755     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
756                     ctzl(sizeof(ETYPE)), GETPC());   \
757 }
758 
759 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
760 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
761 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
762 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
763 
764 /*
765  *** Vector Integer Arithmetic Instructions
766  */
767 
768 /* expand macro args before macro */
769 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
770 
771 /* (TD, T1, T2, TX1, TX2) */
772 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
773 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
774 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
775 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
776 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
777 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
778 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
779 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
780 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
781 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
782 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
783 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
784 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
785 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
786 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
787 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
788 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
789 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
790 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
791 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
792 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
793 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
794 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
795 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
796 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
797 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
798 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
799 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
800 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
801 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
802 
803 /* operation of two vector elements */
804 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
805 
806 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
807 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
808 {                                                               \
809     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
810     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
811     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
812 }
813 #define DO_SUB(N, M) (N - M)
814 #define DO_RSUB(N, M) (M - N)
815 
816 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
817 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
818 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
819 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
820 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
821 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
822 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
823 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
824 
825 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
826                        CPURISCVState *env, uint32_t desc,
827                        opivv2_fn *fn, uint32_t esz)
828 {
829     uint32_t vm = vext_vm(desc);
830     uint32_t vl = env->vl;
831     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
832     uint32_t vta = vext_vta(desc);
833     uint32_t vma = vext_vma(desc);
834     uint32_t i;
835 
836     for (i = env->vstart; i < vl; i++) {
837         if (!vm && !vext_elem_mask(v0, i)) {
838             /* set masked-off elements to 1s */
839             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
840             continue;
841         }
842         fn(vd, vs1, vs2, i);
843     }
844     env->vstart = 0;
845     /* set tail elements to 1s */
846     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
847 }
848 
849 /* generate the helpers for OPIVV */
850 #define GEN_VEXT_VV(NAME, ESZ)                            \
851 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
852                   void *vs2, CPURISCVState *env,          \
853                   uint32_t desc)                          \
854 {                                                         \
855     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
856                do_##NAME, ESZ);                           \
857 }
858 
859 GEN_VEXT_VV(vadd_vv_b, 1)
860 GEN_VEXT_VV(vadd_vv_h, 2)
861 GEN_VEXT_VV(vadd_vv_w, 4)
862 GEN_VEXT_VV(vadd_vv_d, 8)
863 GEN_VEXT_VV(vsub_vv_b, 1)
864 GEN_VEXT_VV(vsub_vv_h, 2)
865 GEN_VEXT_VV(vsub_vv_w, 4)
866 GEN_VEXT_VV(vsub_vv_d, 8)
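
/*
 * For reference, the pair
 *     RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 *     GEN_VEXT_VV(vadd_vv_b, 1)
 * expands to roughly:
 *
 *     static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *     {
 *         int8_t s1 = *((int8_t *)vs1 + H1(i));
 *         int8_t s2 = *((int8_t *)vs2 + H1(i));
 *         *((int8_t *)vd + H1(i)) = s2 + s1;
 *     }
 *
 *     void HELPER(vadd_vv_b)(void *vd, void *v0, void *vs1, void *vs2,
 *                            CPURISCVState *env, uint32_t desc)
 *     {
 *         do_vext_vv(vd, v0, vs1, vs2, env, desc, do_vadd_vv_b, 1);
 *     }
 */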
867 
868 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
869 
870 /*
871  * (T1)s1 gives the real operand type.
872  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
873  */
874 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
875 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
876 {                                                                   \
877     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
878     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
879 }
880 
881 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
882 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
883 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
884 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
885 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
886 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
887 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
888 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
889 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
890 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
891 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
892 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
893 
894 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
895                        CPURISCVState *env, uint32_t desc,
896                        opivx2_fn fn, uint32_t esz)
897 {
898     uint32_t vm = vext_vm(desc);
899     uint32_t vl = env->vl;
900     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
901     uint32_t vta = vext_vta(desc);
902     uint32_t vma = vext_vma(desc);
903     uint32_t i;
904 
905     for (i = env->vstart; i < vl; i++) {
906         if (!vm && !vext_elem_mask(v0, i)) {
907             /* set masked-off elements to 1s */
908             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
909             continue;
910         }
911         fn(vd, s1, vs2, i);
912     }
913     env->vstart = 0;
914     /* set tail elements to 1s */
915     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
916 }
917 
918 /* generate the helpers for OPIVX */
919 #define GEN_VEXT_VX(NAME, ESZ)                            \
920 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
921                   void *vs2, CPURISCVState *env,          \
922                   uint32_t desc)                          \
923 {                                                         \
924     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
925                do_##NAME, ESZ);                           \
926 }
927 
928 GEN_VEXT_VX(vadd_vx_b, 1)
929 GEN_VEXT_VX(vadd_vx_h, 2)
930 GEN_VEXT_VX(vadd_vx_w, 4)
931 GEN_VEXT_VX(vadd_vx_d, 8)
932 GEN_VEXT_VX(vsub_vx_b, 1)
933 GEN_VEXT_VX(vsub_vx_h, 2)
934 GEN_VEXT_VX(vsub_vx_w, 4)
935 GEN_VEXT_VX(vsub_vx_d, 8)
936 GEN_VEXT_VX(vrsub_vx_b, 1)
937 GEN_VEXT_VX(vrsub_vx_h, 2)
938 GEN_VEXT_VX(vrsub_vx_w, 4)
939 GEN_VEXT_VX(vrsub_vx_d, 8)
940 
941 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
942 {
943     intptr_t oprsz = simd_oprsz(desc);
944     intptr_t i;
945 
946     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
947         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
948     }
949 }
950 
951 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
952 {
953     intptr_t oprsz = simd_oprsz(desc);
954     intptr_t i;
955 
956     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
957         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
958     }
959 }
960 
961 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
962 {
963     intptr_t oprsz = simd_oprsz(desc);
964     intptr_t i;
965 
966     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
967         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
968     }
969 }
970 
971 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
972 {
973     intptr_t oprsz = simd_oprsz(desc);
974     intptr_t i;
975 
976     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
977         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
978     }
979 }
980 
981 /* Vector Widening Integer Add/Subtract */
982 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
983 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
984 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
985 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
986 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
987 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
988 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
989 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
990 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
991 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
992 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
993 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
994 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
995 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
996 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
997 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
998 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
999 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1000 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1001 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1002 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1003 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1004 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1005 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1006 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1008 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1009 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1011 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1012 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1014 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1015 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1017 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1018 GEN_VEXT_VV(vwaddu_vv_b, 2)
1019 GEN_VEXT_VV(vwaddu_vv_h, 4)
1020 GEN_VEXT_VV(vwaddu_vv_w, 8)
1021 GEN_VEXT_VV(vwsubu_vv_b, 2)
1022 GEN_VEXT_VV(vwsubu_vv_h, 4)
1023 GEN_VEXT_VV(vwsubu_vv_w, 8)
1024 GEN_VEXT_VV(vwadd_vv_b, 2)
1025 GEN_VEXT_VV(vwadd_vv_h, 4)
1026 GEN_VEXT_VV(vwadd_vv_w, 8)
1027 GEN_VEXT_VV(vwsub_vv_b, 2)
1028 GEN_VEXT_VV(vwsub_vv_h, 4)
1029 GEN_VEXT_VV(vwsub_vv_w, 8)
1030 GEN_VEXT_VV(vwaddu_wv_b, 2)
1031 GEN_VEXT_VV(vwaddu_wv_h, 4)
1032 GEN_VEXT_VV(vwaddu_wv_w, 8)
1033 GEN_VEXT_VV(vwsubu_wv_b, 2)
1034 GEN_VEXT_VV(vwsubu_wv_h, 4)
1035 GEN_VEXT_VV(vwsubu_wv_w, 8)
1036 GEN_VEXT_VV(vwadd_wv_b, 2)
1037 GEN_VEXT_VV(vwadd_wv_h, 4)
1038 GEN_VEXT_VV(vwadd_wv_w, 8)
1039 GEN_VEXT_VV(vwsub_wv_b, 2)
1040 GEN_VEXT_VV(vwsub_wv_h, 4)
1041 GEN_VEXT_VV(vwsub_wv_w, 8)
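
/*
 * For illustration: vwaddu_vv_b widens both sources to 16 bits before
 * adding, so 0xff + 0xff produces 0x01fe in the 16-bit destination
 * element instead of wrapping to 0xfe; the .wv forms take a 2*SEW-wide
 * vs2 operand (the WOP_W* type lists).
 */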
1042 
1043 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1044 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1045 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1046 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1047 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1048 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1049 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1050 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1051 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1052 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1053 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1054 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1055 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1057 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1058 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1060 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1061 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1063 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1064 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1066 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1067 GEN_VEXT_VX(vwaddu_vx_b, 2)
1068 GEN_VEXT_VX(vwaddu_vx_h, 4)
1069 GEN_VEXT_VX(vwaddu_vx_w, 8)
1070 GEN_VEXT_VX(vwsubu_vx_b, 2)
1071 GEN_VEXT_VX(vwsubu_vx_h, 4)
1072 GEN_VEXT_VX(vwsubu_vx_w, 8)
1073 GEN_VEXT_VX(vwadd_vx_b, 2)
1074 GEN_VEXT_VX(vwadd_vx_h, 4)
1075 GEN_VEXT_VX(vwadd_vx_w, 8)
1076 GEN_VEXT_VX(vwsub_vx_b, 2)
1077 GEN_VEXT_VX(vwsub_vx_h, 4)
1078 GEN_VEXT_VX(vwsub_vx_w, 8)
1079 GEN_VEXT_VX(vwaddu_wx_b, 2)
1080 GEN_VEXT_VX(vwaddu_wx_h, 4)
1081 GEN_VEXT_VX(vwaddu_wx_w, 8)
1082 GEN_VEXT_VX(vwsubu_wx_b, 2)
1083 GEN_VEXT_VX(vwsubu_wx_h, 4)
1084 GEN_VEXT_VX(vwsubu_wx_w, 8)
1085 GEN_VEXT_VX(vwadd_wx_b, 2)
1086 GEN_VEXT_VX(vwadd_wx_h, 4)
1087 GEN_VEXT_VX(vwadd_wx_w, 8)
1088 GEN_VEXT_VX(vwsub_wx_b, 2)
1089 GEN_VEXT_VX(vwsub_wx_h, 4)
1090 GEN_VEXT_VX(vwsub_wx_w, 8)
1091 
1092 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1093 #define DO_VADC(N, M, C) (N + M + C)
1094 #define DO_VSBC(N, M, C) (N - M - C)
1095 
1096 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1097 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1098                   CPURISCVState *env, uint32_t desc)          \
1099 {                                                             \
1100     uint32_t vl = env->vl;                                    \
1101     uint32_t esz = sizeof(ETYPE);                             \
1102     uint32_t total_elems =                                    \
1103         vext_get_total_elems(env, desc, esz);                 \
1104     uint32_t vta = vext_vta(desc);                            \
1105     uint32_t i;                                               \
1106                                                               \
1107     for (i = env->vstart; i < vl; i++) {                      \
1108         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1109         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1110         ETYPE carry = vext_elem_mask(v0, i);                  \
1111                                                               \
1112         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1113     }                                                         \
1114     env->vstart = 0;                                          \
1115     /* set tail elements to 1s */                             \
1116     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1117 }
1118 
1119 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1120 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1121 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1122 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1123 
1124 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1125 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1126 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1127 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1128 
1129 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1130 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1131                   CPURISCVState *env, uint32_t desc)                     \
1132 {                                                                        \
1133     uint32_t vl = env->vl;                                               \
1134     uint32_t esz = sizeof(ETYPE);                                        \
1135     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1136     uint32_t vta = vext_vta(desc);                                       \
1137     uint32_t i;                                                          \
1138                                                                          \
1139     for (i = env->vstart; i < vl; i++) {                                 \
1140         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1141         ETYPE carry = vext_elem_mask(v0, i);                             \
1142                                                                          \
1143         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1144     }                                                                    \
1145     env->vstart = 0;                                          \
1146     /* set tail elements to 1s */                                        \
1147     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1148 }
1149 
1150 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1151 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1152 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1153 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1154 
1155 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1156 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1157 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1158 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1159 
1160 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1161                           (__typeof(N))(N + M) < N)
1162 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
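
/*
 * Worked example at SEW = 8: for N = 0xff, M = 0x01 and carry-in C = 1,
 * (uint8_t)(N + M + 1) == 0x01 <= N, so DO_MADC reports a carry-out of
 * 1; with C = 0 the wrap of N + M to 0x00 < N likewise flags the carry.
 */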
1163 
1164 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1165 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1166                   CPURISCVState *env, uint32_t desc)          \
1167 {                                                             \
1168     uint32_t vl = env->vl;                                    \
1169     uint32_t vm = vext_vm(desc);                              \
1170     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1171     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1172     uint32_t i;                                               \
1173                                                               \
1174     for (i = env->vstart; i < vl; i++) {                      \
1175         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1177         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1178         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1179     }                                                         \
1180     env->vstart = 0;                                          \
1181     /* mask destination register is always tail-agnostic */   \
1182     /* set tail elements to 1s */                             \
1183     if (vta_all_1s) {                                         \
1184         for (; i < total_elems; i++) {                        \
1185             vext_set_elem_mask(vd, i, 1);                     \
1186         }                                                     \
1187     }                                                         \
1188 }
1189 
1190 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1191 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1192 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1193 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1194 
1195 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1196 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1197 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1198 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1199 
1200 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1201 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1202                   void *vs2, CPURISCVState *env, uint32_t desc) \
1203 {                                                               \
1204     uint32_t vl = env->vl;                                      \
1205     uint32_t vm = vext_vm(desc);                                \
1206     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1207     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1208     uint32_t i;                                                 \
1209                                                                 \
1210     for (i = env->vstart; i < vl; i++) {                        \
1211         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1212         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1213         vext_set_elem_mask(vd, i,                               \
1214                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1215     }                                                           \
1216     env->vstart = 0;                                            \
1217     /* mask destination register is always tail-agnostic */     \
1218     /* set tail elements to 1s */                               \
1219     if (vta_all_1s) {                                           \
1220         for (; i < total_elems; i++) {                          \
1221             vext_set_elem_mask(vd, i, 1);                       \
1222         }                                                       \
1223     }                                                           \
1224 }
1225 
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1227 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1228 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1229 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1230 
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1232 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1233 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1234 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1235 
1236 /* Vector Bitwise Logical Instructions */
1237 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1238 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1239 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1240 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1241 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1242 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1243 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1244 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1245 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1246 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1247 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1248 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1249 GEN_VEXT_VV(vand_vv_b, 1)
1250 GEN_VEXT_VV(vand_vv_h, 2)
1251 GEN_VEXT_VV(vand_vv_w, 4)
1252 GEN_VEXT_VV(vand_vv_d, 8)
1253 GEN_VEXT_VV(vor_vv_b, 1)
1254 GEN_VEXT_VV(vor_vv_h, 2)
1255 GEN_VEXT_VV(vor_vv_w, 4)
1256 GEN_VEXT_VV(vor_vv_d, 8)
1257 GEN_VEXT_VV(vxor_vv_b, 1)
1258 GEN_VEXT_VV(vxor_vv_h, 2)
1259 GEN_VEXT_VV(vxor_vv_w, 4)
1260 GEN_VEXT_VV(vxor_vv_d, 8)
1261 
1262 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1263 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1264 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1265 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1266 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1267 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1268 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1269 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1270 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1271 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1272 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1273 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1274 GEN_VEXT_VX(vand_vx_b, 1)
1275 GEN_VEXT_VX(vand_vx_h, 2)
1276 GEN_VEXT_VX(vand_vx_w, 4)
1277 GEN_VEXT_VX(vand_vx_d, 8)
1278 GEN_VEXT_VX(vor_vx_b, 1)
1279 GEN_VEXT_VX(vor_vx_h, 2)
1280 GEN_VEXT_VX(vor_vx_w, 4)
1281 GEN_VEXT_VX(vor_vx_d, 8)
1282 GEN_VEXT_VX(vxor_vx_b, 1)
1283 GEN_VEXT_VX(vxor_vx_h, 2)
1284 GEN_VEXT_VX(vxor_vx_w, 4)
1285 GEN_VEXT_VX(vxor_vx_d, 8)
1286 
1287 /* Vector Single-Width Bit Shift Instructions */
1288 #define DO_SLL(N, M)  (N << (M))
1289 #define DO_SRL(N, M)  (N >> (M))
1290 
1291 /* generate the helpers for shift instructions with two vector operators */
1292 /* generate the helpers for shift instructions with two vector operands */
1293 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1294                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1295 {                                                                         \
1296     uint32_t vm = vext_vm(desc);                                          \
1297     uint32_t vl = env->vl;                                                \
1298     uint32_t esz = sizeof(TS1);                                           \
1299     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1300     uint32_t vta = vext_vta(desc);                                        \
1301     uint32_t i;                                                           \
1302                                                                           \
1303     for (i = env->vstart; i < vl; i++) {                                  \
1304         if (!vm && !vext_elem_mask(v0, i)) {                              \
1305             continue;                                                     \
1306         }                                                                 \
1307         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1308         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1309         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1310     }                                                                     \
1311     env->vstart = 0;                                                      \
1312     /* set tail elements to 1s */                                         \
1313     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1314 }
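/*
 * MASK keeps only the low log2(SEW) bits of the shift operand (log2(2 * SEW)
 * bits for the narrowing shifts further below), since only those bits of the
 * vs1 element or rs1 value select the shift amount.
 */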
1315 
1316 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1317 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1318 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1319 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1320 
1321 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1322 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1323 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1324 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1325 
1326 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1327 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1328 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1329 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1330 
1331 /* generate the helpers for shift instructions with one vector and one scalar */
1332 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1333 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1334         void *vs2, CPURISCVState *env, uint32_t desc)       \
1335 {                                                           \
1336     uint32_t vm = vext_vm(desc);                            \
1337     uint32_t vl = env->vl;                                  \
1338     uint32_t esz = sizeof(TD);                              \
1339     uint32_t total_elems =                                  \
1340         vext_get_total_elems(env, desc, esz);               \
1341     uint32_t vta = vext_vta(desc);                          \
1342     uint32_t i;                                             \
1343                                                             \
1344     for (i = env->vstart; i < vl; i++) {                    \
1345         if (!vm && !vext_elem_mask(v0, i)) {                \
1346             continue;                                       \
1347         }                                                   \
1348         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1349         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1350     }                                                       \
1351     env->vstart = 0;                                        \
1352     /* set tail elements to 1s */                           \
1353     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1354 }
1355 
1356 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1357 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1358 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1359 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1360 
1361 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1362 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1363 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1364 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1365 
1366 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1367 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1368 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1369 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1370 
1371 /* Vector Narrowing Integer Right Shift Instructions */
1372 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1373 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1374 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1375 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1376 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1377 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1378 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1379 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1380 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1381 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1382 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1383 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1384 
1385 /* Vector Integer Comparison Instructions */
1386 #define DO_MSEQ(N, M) (N == M)
1387 #define DO_MSNE(N, M) (N != M)
1388 #define DO_MSLT(N, M) (N < M)
1389 #define DO_MSLE(N, M) (N <= M)
1390 #define DO_MSGT(N, M) (N > M)
1391 
1392 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1393 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1394                   CPURISCVState *env, uint32_t desc)          \
1395 {                                                             \
1396     uint32_t vm = vext_vm(desc);                              \
1397     uint32_t vl = env->vl;                                    \
1398     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1399     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1400     uint32_t i;                                               \
1401                                                               \
1402     for (i = env->vstart; i < vl; i++) {                      \
1403         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1404         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1405         if (!vm && !vext_elem_mask(v0, i)) {                  \
1406             continue;                                         \
1407         }                                                     \
1408         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1409     }                                                         \
1410     env->vstart = 0;                                          \
1411     /* mask destination registers are always tail-agnostic */ \
1412     /* set tail elements to 1s */                             \
1413     if (vta_all_1s) {                                         \
1414         for (; i < total_elems; i++) {                        \
1415             vext_set_elem_mask(vd, i, 1);                     \
1416         }                                                     \
1417     }                                                         \
1418 }
1419 
1420 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1421 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1422 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1423 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1424 
1425 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1426 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1427 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1428 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1429 
1430 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1431 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1432 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1433 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1434 
1435 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1436 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1437 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1438 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1439 
1440 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1441 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1442 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1443 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1444 
1445 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1446 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1447 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1448 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1449 
1450 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1451 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1452                   CPURISCVState *env, uint32_t desc)                \
1453 {                                                                   \
1454     uint32_t vm = vext_vm(desc);                                    \
1455     uint32_t vl = env->vl;                                          \
1456     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1457     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1458     uint32_t i;                                                     \
1459                                                                     \
1460     for (i = env->vstart; i < vl; i++) {                            \
1461         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1462         if (!vm && !vext_elem_mask(v0, i)) {                        \
1463             continue;                                               \
1464         }                                                           \
1465         vext_set_elem_mask(vd, i,                                   \
1466                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1467     }                                                               \
1468     env->vstart = 0;                                                \
1469     /* mask destination registers are always tail-agnostic */       \
1470     /* set tail elements to 1s */                                   \
1471     if (vta_all_1s) {                                               \
1472         for (; i < total_elems; i++) {                              \
1473             vext_set_elem_mask(vd, i, 1);                           \
1474         }                                                           \
1475     }                                                               \
1476 }
1477 
1478 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1479 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1480 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1481 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1482 
1483 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1484 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1485 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1486 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1487 
1488 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1489 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1490 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1491 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1492 
1493 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1494 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1495 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1496 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1497 
1498 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1499 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1500 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1501 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1502 
1503 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1504 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1505 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1506 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1507 
1508 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1509 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1510 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1511 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1512 
1513 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1514 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1515 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1516 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1517 
1518 /* Vector Integer Min/Max Instructions */
1519 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1520 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1521 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1522 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1523 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1524 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1525 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1526 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1527 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1528 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1529 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1530 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1531 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1532 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1533 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1534 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1535 GEN_VEXT_VV(vminu_vv_b, 1)
1536 GEN_VEXT_VV(vminu_vv_h, 2)
1537 GEN_VEXT_VV(vminu_vv_w, 4)
1538 GEN_VEXT_VV(vminu_vv_d, 8)
1539 GEN_VEXT_VV(vmin_vv_b, 1)
1540 GEN_VEXT_VV(vmin_vv_h, 2)
1541 GEN_VEXT_VV(vmin_vv_w, 4)
1542 GEN_VEXT_VV(vmin_vv_d, 8)
1543 GEN_VEXT_VV(vmaxu_vv_b, 1)
1544 GEN_VEXT_VV(vmaxu_vv_h, 2)
1545 GEN_VEXT_VV(vmaxu_vv_w, 4)
1546 GEN_VEXT_VV(vmaxu_vv_d, 8)
1547 GEN_VEXT_VV(vmax_vv_b, 1)
1548 GEN_VEXT_VV(vmax_vv_h, 2)
1549 GEN_VEXT_VV(vmax_vv_w, 4)
1550 GEN_VEXT_VV(vmax_vv_d, 8)
1551 
1552 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1553 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1554 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1555 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1556 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1557 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1558 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1559 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1560 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1561 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1562 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1563 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1564 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1565 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1566 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1567 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1568 GEN_VEXT_VX(vminu_vx_b, 1)
1569 GEN_VEXT_VX(vminu_vx_h, 2)
1570 GEN_VEXT_VX(vminu_vx_w, 4)
1571 GEN_VEXT_VX(vminu_vx_d, 8)
1572 GEN_VEXT_VX(vmin_vx_b, 1)
1573 GEN_VEXT_VX(vmin_vx_h, 2)
1574 GEN_VEXT_VX(vmin_vx_w, 4)
1575 GEN_VEXT_VX(vmin_vx_d, 8)
1576 GEN_VEXT_VX(vmaxu_vx_b, 1)
1577 GEN_VEXT_VX(vmaxu_vx_h, 2)
1578 GEN_VEXT_VX(vmaxu_vx_w, 4)
1579 GEN_VEXT_VX(vmaxu_vx_d, 8)
1580 GEN_VEXT_VX(vmax_vx_b, 1)
1581 GEN_VEXT_VX(vmax_vx_h, 2)
1582 GEN_VEXT_VX(vmax_vx_w, 4)
1583 GEN_VEXT_VX(vmax_vx_d, 8)
1584 
1585 /* Vector Single-Width Integer Multiply Instructions */
1586 #define DO_MUL(N, M) (N * M)
1587 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1588 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1589 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1590 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1591 GEN_VEXT_VV(vmul_vv_b, 1)
1592 GEN_VEXT_VV(vmul_vv_h, 2)
1593 GEN_VEXT_VV(vmul_vv_w, 4)
1594 GEN_VEXT_VV(vmul_vv_d, 8)
1595 
1596 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1597 {
1598     return (int16_t)s2 * (int16_t)s1 >> 8;
1599 }
1600 
1601 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1602 {
1603     return (int32_t)s2 * (int32_t)s1 >> 16;
1604 }
1605 
1606 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1607 {
1608     return (int64_t)s2 * (int64_t)s1 >> 32;
1609 }
1610 
1611 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1612 {
1613     uint64_t hi_64, lo_64;
1614 
1615     muls64(&lo_64, &hi_64, s1, s2);
1616     return hi_64;
1617 }
1618 
1619 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1620 {
1621     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1622 }
1623 
1624 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1625 {
1626     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1627 }
1628 
1629 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1630 {
1631     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1632 }
1633 
1634 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1635 {
1636     uint64_t hi_64, lo_64;
1637 
1638     mulu64(&lo_64, &hi_64, s2, s1);
1639     return hi_64;
1640 }
1641 
1642 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1643 {
1644     return (int16_t)s2 * (uint16_t)s1 >> 8;
1645 }
1646 
1647 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1648 {
1649     return (int32_t)s2 * (uint32_t)s1 >> 16;
1650 }
1651 
1652 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1653 {
1654     return (int64_t)s2 * (uint64_t)s1 >> 32;
1655 }
1656 
1657 /*
1658  * Let  A = signed operand,
1659  *      B = unsigned operand
1660  *      P = mulu64(A, B), unsigned product
1661  *
1662  * LET  X = 2 ** 64  - A, 2's complement of A
1663  *      SP = signed product
1664  * THEN
1665  *      IF A < 0
1666  *          SP = -X * B
1667  *             = -(2 ** 64 - A) * B
1668  *             = A * B - 2 ** 64 * B
1669  *             = P - 2 ** 64 * B
1670  *      ELSE
1671  *          SP = P
1672  * THEN
1673  *      HI_P -= (A < 0 ? B : 0)
1674  */
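/*
 * Worked example of the adjustment above: A = -1 (bit pattern 2 ** 64 - 1),
 * B = 2.  mulu64 yields P = (2 ** 64 - 1) * 2, so HI_P = 1 and
 * LO_P = 2 ** 64 - 2.  Since A < 0, HI_P -= B gives -1, which matches the
 * upper 64 bits of the true signed product -2.
 */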
1675 
1676 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1677 {
1678     uint64_t hi_64, lo_64;
1679 
1680     mulu64(&lo_64, &hi_64, s2, s1);
1681 
1682     hi_64 -= s2 < 0 ? s1 : 0;
1683     return hi_64;
1684 }
1685 
1686 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1687 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1688 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1689 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1690 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1691 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1692 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1693 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1694 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1695 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1696 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1697 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1698 GEN_VEXT_VV(vmulh_vv_b, 1)
1699 GEN_VEXT_VV(vmulh_vv_h, 2)
1700 GEN_VEXT_VV(vmulh_vv_w, 4)
1701 GEN_VEXT_VV(vmulh_vv_d, 8)
1702 GEN_VEXT_VV(vmulhu_vv_b, 1)
1703 GEN_VEXT_VV(vmulhu_vv_h, 2)
1704 GEN_VEXT_VV(vmulhu_vv_w, 4)
1705 GEN_VEXT_VV(vmulhu_vv_d, 8)
1706 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1707 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1708 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1709 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1710 
1711 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1712 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1713 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1714 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1715 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1716 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1717 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1718 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1719 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1720 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1721 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1722 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1723 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1724 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1725 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1726 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1727 GEN_VEXT_VX(vmul_vx_b, 1)
1728 GEN_VEXT_VX(vmul_vx_h, 2)
1729 GEN_VEXT_VX(vmul_vx_w, 4)
1730 GEN_VEXT_VX(vmul_vx_d, 8)
1731 GEN_VEXT_VX(vmulh_vx_b, 1)
1732 GEN_VEXT_VX(vmulh_vx_h, 2)
1733 GEN_VEXT_VX(vmulh_vx_w, 4)
1734 GEN_VEXT_VX(vmulh_vx_d, 8)
1735 GEN_VEXT_VX(vmulhu_vx_b, 1)
1736 GEN_VEXT_VX(vmulhu_vx_h, 2)
1737 GEN_VEXT_VX(vmulhu_vx_w, 4)
1738 GEN_VEXT_VX(vmulhu_vx_d, 8)
1739 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1740 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1741 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1742 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1743 
1744 /* Vector Integer Divide Instructions */
1745 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1746 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1747 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1748         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1749 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1750         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
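/*
 * (N == -N) is true only for 0 and for the most negative value of the
 * operand type, so the second test catches the INT_MIN / -1 overflow case:
 * the quotient stays N and the remainder is 0, matching the scalar RISC-V
 * divide semantics, while division by zero returns all ones for the
 * quotient and the dividend for the remainder.
 */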
1751 
1752 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1753 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1754 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1755 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1756 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1757 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1758 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1759 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1760 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1761 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1762 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1763 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1764 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1765 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1766 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1767 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1768 GEN_VEXT_VV(vdivu_vv_b, 1)
1769 GEN_VEXT_VV(vdivu_vv_h, 2)
1770 GEN_VEXT_VV(vdivu_vv_w, 4)
1771 GEN_VEXT_VV(vdivu_vv_d, 8)
1772 GEN_VEXT_VV(vdiv_vv_b, 1)
1773 GEN_VEXT_VV(vdiv_vv_h, 2)
1774 GEN_VEXT_VV(vdiv_vv_w, 4)
1775 GEN_VEXT_VV(vdiv_vv_d, 8)
1776 GEN_VEXT_VV(vremu_vv_b, 1)
1777 GEN_VEXT_VV(vremu_vv_h, 2)
1778 GEN_VEXT_VV(vremu_vv_w, 4)
1779 GEN_VEXT_VV(vremu_vv_d, 8)
1780 GEN_VEXT_VV(vrem_vv_b, 1)
1781 GEN_VEXT_VV(vrem_vv_h, 2)
1782 GEN_VEXT_VV(vrem_vv_w, 4)
1783 GEN_VEXT_VV(vrem_vv_d, 8)
1784 
1785 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1786 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1787 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1788 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1789 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1790 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1791 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1792 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1793 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1794 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1795 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1796 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1797 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1798 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1799 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1800 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1801 GEN_VEXT_VX(vdivu_vx_b, 1)
1802 GEN_VEXT_VX(vdivu_vx_h, 2)
1803 GEN_VEXT_VX(vdivu_vx_w, 4)
1804 GEN_VEXT_VX(vdivu_vx_d, 8)
1805 GEN_VEXT_VX(vdiv_vx_b, 1)
1806 GEN_VEXT_VX(vdiv_vx_h, 2)
1807 GEN_VEXT_VX(vdiv_vx_w, 4)
1808 GEN_VEXT_VX(vdiv_vx_d, 8)
1809 GEN_VEXT_VX(vremu_vx_b, 1)
1810 GEN_VEXT_VX(vremu_vx_h, 2)
1811 GEN_VEXT_VX(vremu_vx_w, 4)
1812 GEN_VEXT_VX(vremu_vx_d, 8)
1813 GEN_VEXT_VX(vrem_vx_b, 1)
1814 GEN_VEXT_VX(vrem_vx_h, 2)
1815 GEN_VEXT_VX(vrem_vx_w, 4)
1816 GEN_VEXT_VX(vrem_vx_d, 8)
1817 
1818 /* Vector Widening Integer Multiply Instructions */
1819 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1820 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1821 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1822 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1823 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1824 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1825 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1826 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1827 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1828 GEN_VEXT_VV(vwmul_vv_b, 2)
1829 GEN_VEXT_VV(vwmul_vv_h, 4)
1830 GEN_VEXT_VV(vwmul_vv_w, 8)
1831 GEN_VEXT_VV(vwmulu_vv_b, 2)
1832 GEN_VEXT_VV(vwmulu_vv_h, 4)
1833 GEN_VEXT_VV(vwmulu_vv_w, 8)
1834 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1835 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1836 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1837 
1838 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1839 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1840 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1841 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1842 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1843 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1844 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1845 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1846 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1847 GEN_VEXT_VX(vwmul_vx_b, 2)
1848 GEN_VEXT_VX(vwmul_vx_h, 4)
1849 GEN_VEXT_VX(vwmul_vx_w, 8)
1850 GEN_VEXT_VX(vwmulu_vx_b, 2)
1851 GEN_VEXT_VX(vwmulu_vx_h, 4)
1852 GEN_VEXT_VX(vwmulu_vx_w, 8)
1853 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1854 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1855 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1856 
1857 /* Vector Single-Width Integer Multiply-Add Instructions */
1858 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1859 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1860 {                                                                  \
1861     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1862     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1863     TD d = *((TD *)vd + HD(i));                                    \
1864     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1865 }
1866 
1867 #define DO_MACC(N, M, D) (M * N + D)
1868 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1869 #define DO_MADD(N, M, D) (M * D + N)
1870 #define DO_NMSUB(N, M, D) (-(M * D) + N)
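/*
 * In the OP(N, M, D) callbacks below, N is the vs2 element, M is the vs1
 * element (or the rs1 scalar for the _vx forms) and D is the overwritten
 * vd element, so e.g. vmacc computes vd[i] = vs1[i] * vs2[i] + vd[i] while
 * vmadd computes vd[i] = vs1[i] * vd[i] + vs2[i].
 */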
1871 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1872 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1873 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1874 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1875 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1876 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1877 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1878 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1879 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1880 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1881 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1882 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1883 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1884 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1885 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1886 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1887 GEN_VEXT_VV(vmacc_vv_b, 1)
1888 GEN_VEXT_VV(vmacc_vv_h, 2)
1889 GEN_VEXT_VV(vmacc_vv_w, 4)
1890 GEN_VEXT_VV(vmacc_vv_d, 8)
1891 GEN_VEXT_VV(vnmsac_vv_b, 1)
1892 GEN_VEXT_VV(vnmsac_vv_h, 2)
1893 GEN_VEXT_VV(vnmsac_vv_w, 4)
1894 GEN_VEXT_VV(vnmsac_vv_d, 8)
1895 GEN_VEXT_VV(vmadd_vv_b, 1)
1896 GEN_VEXT_VV(vmadd_vv_h, 2)
1897 GEN_VEXT_VV(vmadd_vv_w, 4)
1898 GEN_VEXT_VV(vmadd_vv_d, 8)
1899 GEN_VEXT_VV(vnmsub_vv_b, 1)
1900 GEN_VEXT_VV(vnmsub_vv_h, 2)
1901 GEN_VEXT_VV(vnmsub_vv_w, 4)
1902 GEN_VEXT_VV(vnmsub_vv_d, 8)
1903 
1904 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1905 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1906 {                                                                   \
1907     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1908     TD d = *((TD *)vd + HD(i));                                     \
1909     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1910 }
1911 
1912 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1913 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1914 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1915 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1916 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1917 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1918 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1919 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1920 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1921 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1922 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1923 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1924 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1925 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1926 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1927 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1928 GEN_VEXT_VX(vmacc_vx_b, 1)
1929 GEN_VEXT_VX(vmacc_vx_h, 2)
1930 GEN_VEXT_VX(vmacc_vx_w, 4)
1931 GEN_VEXT_VX(vmacc_vx_d, 8)
1932 GEN_VEXT_VX(vnmsac_vx_b, 1)
1933 GEN_VEXT_VX(vnmsac_vx_h, 2)
1934 GEN_VEXT_VX(vnmsac_vx_w, 4)
1935 GEN_VEXT_VX(vnmsac_vx_d, 8)
1936 GEN_VEXT_VX(vmadd_vx_b, 1)
1937 GEN_VEXT_VX(vmadd_vx_h, 2)
1938 GEN_VEXT_VX(vmadd_vx_w, 4)
1939 GEN_VEXT_VX(vmadd_vx_d, 8)
1940 GEN_VEXT_VX(vnmsub_vx_b, 1)
1941 GEN_VEXT_VX(vnmsub_vx_h, 2)
1942 GEN_VEXT_VX(vnmsub_vx_w, 4)
1943 GEN_VEXT_VX(vnmsub_vx_d, 8)
1944 
1945 /* Vector Widening Integer Multiply-Add Instructions */
1946 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1947 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1948 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1949 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1950 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1951 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1952 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1953 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1954 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1955 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1956 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1957 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1958 GEN_VEXT_VV(vwmacc_vv_b, 2)
1959 GEN_VEXT_VV(vwmacc_vv_h, 4)
1960 GEN_VEXT_VV(vwmacc_vv_w, 8)
1961 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1962 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1963 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1964 
1965 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1968 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1969 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1970 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1971 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1972 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1973 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1974 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1975 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1976 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1977 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1978 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1979 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1980 GEN_VEXT_VX(vwmacc_vx_b, 2)
1981 GEN_VEXT_VX(vwmacc_vx_h, 4)
1982 GEN_VEXT_VX(vwmacc_vx_w, 8)
1983 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1984 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1985 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1986 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1987 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1988 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1989 
1990 /* Vector Integer Merge and Move Instructions */
1991 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1992 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1993                   uint32_t desc)                                     \
1994 {                                                                    \
1995     uint32_t vl = env->vl;                                           \
1996     uint32_t esz = sizeof(ETYPE);                                    \
1997     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1998     uint32_t vta = vext_vta(desc);                                   \
1999     uint32_t i;                                                      \
2000                                                                      \
2001     for (i = env->vstart; i < vl; i++) {                             \
2002         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2003         *((ETYPE *)vd + H(i)) = s1;                                  \
2004     }                                                                \
2005     env->vstart = 0;                                                 \
2006     /* set tail elements to 1s */                                    \
2007     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2008 }
2009 
2010 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2011 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2012 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2013 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2014 
2015 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2016 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2017                   uint32_t desc)                                     \
2018 {                                                                    \
2019     uint32_t vl = env->vl;                                           \
2020     uint32_t esz = sizeof(ETYPE);                                    \
2021     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2022     uint32_t vta = vext_vta(desc);                                   \
2023     uint32_t i;                                                      \
2024                                                                      \
2025     for (i = env->vstart; i < vl; i++) {                             \
2026         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2027     }                                                                \
2028     env->vstart = 0;                                                 \
2029     /* set tail elements to 1s */                                    \
2030     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2031 }
2032 
2033 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2034 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2035 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2036 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2037 
2038 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2039 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2040                   CPURISCVState *env, uint32_t desc)                 \
2041 {                                                                    \
2042     uint32_t vl = env->vl;                                           \
2043     uint32_t esz = sizeof(ETYPE);                                    \
2044     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2045     uint32_t vta = vext_vta(desc);                                   \
2046     uint32_t i;                                                      \
2047                                                                      \
2048     for (i = env->vstart; i < vl; i++) {                             \
2049         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2050         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2051     }                                                                \
2052     env->vstart = 0;                                                 \
2053     /* set tail elements to 1s */                                    \
2054     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2055 }
2056 
2057 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2058 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2059 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2060 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2061 
2062 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2063 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2064                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2065 {                                                                    \
2066     uint32_t vl = env->vl;                                           \
2067     uint32_t esz = sizeof(ETYPE);                                    \
2068     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2069     uint32_t vta = vext_vta(desc);                                   \
2070     uint32_t i;                                                      \
2071                                                                      \
2072     for (i = env->vstart; i < vl; i++) {                             \
2073         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2074         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2075                    (ETYPE)(target_long)s1);                          \
2076         *((ETYPE *)vd + H(i)) = d;                                   \
2077     }                                                                \
2078     env->vstart = 0;                                                 \
2079     /* set tail elements to 1s */                                    \
2080     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2081 }
2082 
2083 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2084 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2085 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2086 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2087 
2088 /*
2089  *** Vector Fixed-Point Arithmetic Instructions
2090  */
2091 
2092 /* Vector Single-Width Saturating Add and Subtract */
2093 
2094 /*
2095  * Fixed point instructions use a rounding mode and may saturate,
2096  * so define common macros for fixed point here.
2097  */
2098 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2099                           CPURISCVState *env, int vxrm);
2100 
2101 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2102 static inline void                                                  \
2103 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2104           CPURISCVState *env, int vxrm)                             \
2105 {                                                                   \
2106     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2107     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2108     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2109 }
2110 
2111 static inline void
2112 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2113              CPURISCVState *env,
2114              uint32_t vl, uint32_t vm, int vxrm,
2115              opivv2_rm_fn *fn)
2116 {
2117     for (uint32_t i = env->vstart; i < vl; i++) {
2118         if (!vm && !vext_elem_mask(v0, i)) {
2119             continue;
2120         }
2121         fn(vd, vs1, vs2, i, env, vxrm);
2122     }
2123     env->vstart = 0;
2124 }
2125 
2126 static inline void
2127 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2128              CPURISCVState *env,
2129              uint32_t desc,
2130              opivv2_rm_fn *fn, uint32_t esz)
2131 {
2132     uint32_t vm = vext_vm(desc);
2133     uint32_t vl = env->vl;
2134     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2135     uint32_t vta = vext_vta(desc);
2136 
2137     switch (env->vxrm) {
2138     case 0: /* rnu */
2139         vext_vv_rm_1(vd, v0, vs1, vs2,
2140                      env, vl, vm, 0, fn);
2141         break;
2142     case 1: /* rne */
2143         vext_vv_rm_1(vd, v0, vs1, vs2,
2144                      env, vl, vm, 1, fn);
2145         break;
2146     case 2: /* rdn */
2147         vext_vv_rm_1(vd, v0, vs1, vs2,
2148                      env, vl, vm, 2, fn);
2149         break;
2150     default: /* rod */
2151         vext_vv_rm_1(vd, v0, vs1, vs2,
2152                      env, vl, vm, 3, fn);
2153         break;
2154     }
2155     /* set tail elements to 1s */
2156     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2157 }
2158 
2159 /* generate helpers for fixed point instructions with OPIVV format */
2160 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2161 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2162                   CPURISCVState *env, uint32_t desc)            \
2163 {                                                               \
2164     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2165                  do_##NAME, ESZ);                               \
2166 }
2167 
2168 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2169 {
2170     uint8_t res = a + b;
2171     if (res < a) {
2172         res = UINT8_MAX;
2173         env->vxsat = 0x1;
2174     }
2175     return res;
2176 }
2177 
2178 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2179                                uint16_t b)
2180 {
2181     uint16_t res = a + b;
2182     if (res < a) {
2183         res = UINT16_MAX;
2184         env->vxsat = 0x1;
2185     }
2186     return res;
2187 }
2188 
2189 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2190                                uint32_t b)
2191 {
2192     uint32_t res = a + b;
2193     if (res < a) {
2194         res = UINT32_MAX;
2195         env->vxsat = 0x1;
2196     }
2197     return res;
2198 }
2199 
2200 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2201                                uint64_t b)
2202 {
2203     uint64_t res = a + b;
2204     if (res < a) {
2205         res = UINT64_MAX;
2206         env->vxsat = 0x1;
2207     }
2208     return res;
2209 }
2210 
2211 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2212 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2213 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2214 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2215 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2216 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2217 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2218 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2219 
2220 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2221                           CPURISCVState *env, int vxrm);
2222 
2223 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2224 static inline void                                                  \
2225 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2226           CPURISCVState *env, int vxrm)                             \
2227 {                                                                   \
2228     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2229     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2230 }
2231 
2232 static inline void
2233 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2234              CPURISCVState *env,
2235              uint32_t vl, uint32_t vm, int vxrm,
2236              opivx2_rm_fn *fn)
2237 {
2238     for (uint32_t i = env->vstart; i < vl; i++) {
2239         if (!vm && !vext_elem_mask(v0, i)) {
2240             continue;
2241         }
2242         fn(vd, s1, vs2, i, env, vxrm);
2243     }
2244     env->vstart = 0;
2245 }
2246 
2247 static inline void
2248 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2249              CPURISCVState *env,
2250              uint32_t desc,
2251              opivx2_rm_fn *fn, uint32_t esz)
2252 {
2253     uint32_t vm = vext_vm(desc);
2254     uint32_t vl = env->vl;
2255     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2256     uint32_t vta = vext_vta(desc);
2257 
2258     switch (env->vxrm) {
2259     case 0: /* rnu */
2260         vext_vx_rm_1(vd, v0, s1, vs2,
2261                      env, vl, vm, 0, fn);
2262         break;
2263     case 1: /* rne */
2264         vext_vx_rm_1(vd, v0, s1, vs2,
2265                      env, vl, vm, 1, fn);
2266         break;
2267     case 2: /* rdn */
2268         vext_vx_rm_1(vd, v0, s1, vs2,
2269                      env, vl, vm, 2, fn);
2270         break;
2271     default: /* rod */
2272         vext_vx_rm_1(vd, v0, s1, vs2,
2273                      env, vl, vm, 3, fn);
2274         break;
2275     }
2276     /* set tail elements to 1s */
2277     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2278 }
2279 
2280 /* generate helpers for fixed point instructions with OPIVX format */
2281 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2282 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2283         void *vs2, CPURISCVState *env, uint32_t desc)     \
2284 {                                                         \
2285     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2286                  do_##NAME, ESZ);                         \
2287 }
2288 
2289 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2290 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2291 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2292 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2293 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2294 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2295 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2296 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2297 
2298 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2299 {
2300     int8_t res = a + b;
2301     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2302         res = a > 0 ? INT8_MAX : INT8_MIN;
2303         env->vxsat = 0x1;
2304     }
2305     return res;
2306 }
2307 
2308 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2309 {
2310     int16_t res = a + b;
2311     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2312         res = a > 0 ? INT16_MAX : INT16_MIN;
2313         env->vxsat = 0x1;
2314     }
2315     return res;
2316 }
2317 
2318 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2319 {
2320     int32_t res = a + b;
2321     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2322         res = a > 0 ? INT32_MAX : INT32_MIN;
2323         env->vxsat = 0x1;
2324     }
2325     return res;
2326 }
2327 
2328 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2329 {
2330     int64_t res = a + b;
2331     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2332         res = a > 0 ? INT64_MAX : INT64_MIN;
2333         env->vxsat = 0x1;
2334     }
2335     return res;
2336 }
2337 
2338 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2339 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2340 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2341 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2342 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2343 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2344 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2345 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2346 
2347 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2348 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2349 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2350 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2351 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2352 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2353 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2354 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2355 
2356 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2357 {
2358     uint8_t res = a - b;
2359     if (res > a) {
2360         res = 0;
2361         env->vxsat = 0x1;
2362     }
2363     return res;
2364 }
2365 
2366 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2367                                uint16_t b)
2368 {
2369     uint16_t res = a - b;
2370     if (res > a) {
2371         res = 0;
2372         env->vxsat = 0x1;
2373     }
2374     return res;
2375 }
2376 
2377 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2378                                uint32_t b)
2379 {
2380     uint32_t res = a - b;
2381     if (res > a) {
2382         res = 0;
2383         env->vxsat = 0x1;
2384     }
2385     return res;
2386 }
2387 
2388 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2389                                uint64_t b)
2390 {
2391     uint64_t res = a - b;
2392     if (res > a) {
2393         res = 0;
2394         env->vxsat = 0x1;
2395     }
2396     return res;
2397 }
2398 
2399 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2400 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2401 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2402 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2403 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2404 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2405 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2406 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2407 
2408 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2409 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2410 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2411 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2412 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2413 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2414 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2415 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2416 
2417 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2418 {
2419     int8_t res = a - b;
2420     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2421         res = a >= 0 ? INT8_MAX : INT8_MIN;
2422         env->vxsat = 0x1;
2423     }
2424     return res;
2425 }
2426 
2427 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2428 {
2429     int16_t res = a - b;
2430     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2431         res = a >= 0 ? INT16_MAX : INT16_MIN;
2432         env->vxsat = 0x1;
2433     }
2434     return res;
2435 }
2436 
2437 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2438 {
2439     int32_t res = a - b;
2440     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2441         res = a >= 0 ? INT32_MAX : INT32_MIN;
2442         env->vxsat = 0x1;
2443     }
2444     return res;
2445 }
2446 
2447 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2448 {
2449     int64_t res = a - b;
2450     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2451         res = a >= 0 ? INT64_MAX : INT64_MIN;
2452         env->vxsat = 0x1;
2453     }
2454     return res;
2455 }
2456 
2457 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2458 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2459 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2460 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2461 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2462 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2463 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2464 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2465 
2466 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2467 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2468 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2469 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2470 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2471 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2472 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2473 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2474 
2475 /* Vector Single-Width Averaging Add and Subtract */
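/*
 * get_round() returns the increment to add after the caller discards the
 * low 'shift' bits of 'v':
 *     d  = bit [shift]        the LSB that remains after the shift
 *     d1 = bit [shift - 1]    most significant discarded bit (0.5 position)
 *     D1 = bits [shift-1:0]   all discarded bits
 *     D2 = bits [shift-2:0]   discarded bits below the 0.5 position
 * rnu adds d1; rne adds d1 only when the discarded part is above one half,
 * or exactly one half with an odd remaining LSB; rdn truncates; rod reports
 * 1 when the remaining LSB is clear but some bit was discarded, so adding
 * it jams the LSB to 1.
 */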
2476 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2477 {
2478     uint8_t d = extract64(v, shift, 1);
2479     uint8_t d1;
2480     uint64_t D1, D2;
2481 
2482     if (shift == 0 || shift > 64) {
2483         return 0;
2484     }
2485 
2486     d1 = extract64(v, shift - 1, 1);
2487     D1 = extract64(v, 0, shift);
2488     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2489         return d1;
2490     } else if (vxrm == 1) { /* round-to-nearest-even */
2491         if (shift > 1) {
2492             D2 = extract64(v, 0, shift - 1);
2493             return d1 & ((D2 != 0) | d);
2494         } else {
2495             return d1 & d;
2496         }
2497     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2498         return !d & (D1 != 0);
2499     }
2500     return 0; /* round-down (truncate) */
2501 }
2502 
2503 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2504 {
2505     int64_t res = (int64_t)a + b;
2506     uint8_t round = get_round(vxrm, res, 1);
2507 
2508     return (res >> 1) + round;
2509 }
2510 
2511 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2512 {
2513     int64_t res = a + b;
2514     uint8_t round = get_round(vxrm, res, 1);
2515     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2516 
2517     /* With signed overflow, bit 64 is inverse of bit 63. */
2518     return ((res >> 1) ^ over) + round;
2519 }
2520 
2521 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2522 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2523 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2524 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2525 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2526 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2527 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2528 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2529 
2530 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2531 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2532 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2533 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2534 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2535 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2536 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2537 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2538 
2539 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2540                                uint32_t a, uint32_t b)
2541 {
2542     uint64_t res = (uint64_t)a + b;
2543     uint8_t round = get_round(vxrm, res, 1);
2544 
2545     return (res >> 1) + round;
2546 }
2547 
2548 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2549                                uint64_t a, uint64_t b)
2550 {
2551     uint64_t res = a + b;
2552     uint8_t round = get_round(vxrm, res, 1);
2553     uint64_t over = (uint64_t)(res < a) << 63;
2554 
2555     return ((res >> 1) | over) + round;
2556 }
2557 
2558 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2559 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2560 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2561 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2562 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2563 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2564 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2565 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2566 
2567 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2568 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2569 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2570 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2571 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2572 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2573 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2574 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2575 
2576 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2577 {
2578     int64_t res = (int64_t)a - b;
2579     uint8_t round = get_round(vxrm, res, 1);
2580 
2581     return (res >> 1) + round;
2582 }
2583 
2584 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2585 {
2586     int64_t res = (int64_t)a - b;
2587     uint8_t round = get_round(vxrm, res, 1);
2588     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2589 
2590     /* With signed overflow, bit 64 is inverse of bit 63. */
2591     return ((res >> 1) ^ over) + round;
2592 }
2593 
2594 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2595 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2596 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2597 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2598 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2599 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2600 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2601 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2602 
2603 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2604 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2605 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2606 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2607 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2608 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2609 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2610 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2611 
2612 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2613                                uint32_t a, uint32_t b)
2614 {
2615     int64_t res = (int64_t)a - b;
2616     uint8_t round = get_round(vxrm, res, 1);
2617 
2618     return (res >> 1) + round;
2619 }
2620 
2621 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2622                                uint64_t a, uint64_t b)
2623 {
2624     uint64_t res = (uint64_t)a - b;
2625     uint8_t round = get_round(vxrm, res, 1);
2626     uint64_t over = (uint64_t)(res > a) << 63;
2627 
2628     return ((res >> 1) | over) + round;
2629 }
2630 
2631 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2632 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2633 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2634 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2635 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2636 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2637 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2638 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2639 
2640 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2641 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2642 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2643 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2644 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2645 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2646 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2647 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2648 
2649 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
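     /*
      * vsmul treats its operands as signed fixed-point fractions: the 2*SEW
      * product is shifted right by SEW-1 with vxrm rounding and saturated
      * back to SEW bits, setting vxsat on saturation.  E.g. for SEW=8,
      * 0x40 * 0x40 (0.5 * 0.5 in Q7) gives 0x1000 >> 7 = 0x20, i.e. 0.25.
      */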
2650 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2651 {
2652     uint8_t round;
2653     int16_t res;
2654 
2655     res = (int16_t)a * (int16_t)b;
2656     round = get_round(vxrm, res, 7);
2657     res   = (res >> 7) + round;
2658 
2659     if (res > INT8_MAX) {
2660         env->vxsat = 0x1;
2661         return INT8_MAX;
2662     } else if (res < INT8_MIN) {
2663         env->vxsat = 0x1;
2664         return INT8_MIN;
2665     } else {
2666         return res;
2667     }
2668 }
2669 
2670 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2671 {
2672     uint8_t round;
2673     int32_t res;
2674 
2675     res = (int32_t)a * (int32_t)b;
2676     round = get_round(vxrm, res, 15);
2677     res   = (res >> 15) + round;
2678 
2679     if (res > INT16_MAX) {
2680         env->vxsat = 0x1;
2681         return INT16_MAX;
2682     } else if (res < INT16_MIN) {
2683         env->vxsat = 0x1;
2684         return INT16_MIN;
2685     } else {
2686         return res;
2687     }
2688 }
2689 
2690 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2691 {
2692     uint8_t round;
2693     int64_t res;
2694 
2695     res = (int64_t)a * (int64_t)b;
2696     round = get_round(vxrm, res, 31);
2697     res   = (res >> 31) + round;
2698 
2699     if (res > INT32_MAX) {
2700         env->vxsat = 0x1;
2701         return INT32_MAX;
2702     } else if (res < INT32_MIN) {
2703         env->vxsat = 0x1;
2704         return INT32_MIN;
2705     } else {
2706         return res;
2707     }
2708 }
2709 
2710 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2711 {
2712     uint8_t round;
2713     uint64_t hi_64, lo_64;
2714     int64_t res;
2715 
2716     if (a == INT64_MIN && b == INT64_MIN) {
2717         env->vxsat = 1;
2718         return INT64_MAX;
2719     }
2720 
2721     muls64(&lo_64, &hi_64, a, b);
2722     round = get_round(vxrm, lo_64, 63);
2723     /*
2724      * Cannot overflow, as there are always
2725      * 2 sign bits after multiply.
2726      */
2727     res = (hi_64 << 1) | (lo_64 >> 63);
2728     if (round) {
2729         if (res == INT64_MAX) {
2730             env->vxsat = 1;
2731         } else {
2732             res += 1;
2733         }
2734     }
2735     return res;
2736 }
2737 
2738 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2739 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2740 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2741 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2742 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2743 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2744 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2745 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2746 
2747 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2748 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2749 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2750 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2751 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2752 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2753 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2754 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2755 
2756 /* Vector Single-Width Scaling Shift Instructions */
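     /*
      * vssrl/vssra shift right by the low log2(SEW) bits of the second
      * operand and add the vxrm rounding increment derived from the bits
      * shifted out; e.g. vssrl8 with a = 15, shift = 2 and rnu yields 4.
      */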
2757 static inline uint8_t
2758 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2759 {
2760     uint8_t round, shift = b & 0x7;
2761     uint8_t res;
2762 
2763     round = get_round(vxrm, a, shift);
2764     res   = (a >> shift)  + round;
2765     return res;
2766 }
2767 static inline uint16_t
2768 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2769 {
2770     uint8_t round, shift = b & 0xf;
2771     uint16_t res;
2772 
2773     round = get_round(vxrm, a, shift);
2774     res   = (a >> shift)  + round;
2775     return res;
2776 }
2777 static inline uint32_t
2778 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2779 {
2780     uint8_t round, shift = b & 0x1f;
2781     uint32_t res;
2782 
2783     round = get_round(vxrm, a, shift);
2784     res   = (a >> shift)  + round;
2785     return res;
2786 }
2787 static inline uint64_t
2788 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2789 {
2790     uint8_t round, shift = b & 0x3f;
2791     uint64_t res;
2792 
2793     round = get_round(vxrm, a, shift);
2794     res   = (a >> shift)  + round;
2795     return res;
2796 }
2797 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2798 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2799 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2800 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2801 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2802 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2803 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2804 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2805 
2806 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2807 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2808 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2809 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2810 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2811 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2812 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2813 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2814 
2815 static inline int8_t
2816 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2817 {
2818     uint8_t round, shift = b & 0x7;
2819     int8_t res;
2820 
2821     round = get_round(vxrm, a, shift);
2822     res   = (a >> shift)  + round;
2823     return res;
2824 }
2825 static inline int16_t
2826 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2827 {
2828     uint8_t round, shift = b & 0xf;
2829     int16_t res;
2830 
2831     round = get_round(vxrm, a, shift);
2832     res   = (a >> shift)  + round;
2833     return res;
2834 }
2835 static inline int32_t
2836 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2837 {
2838     uint8_t round, shift = b & 0x1f;
2839     int32_t res;
2840 
2841     round = get_round(vxrm, a, shift);
2842     res   = (a >> shift)  + round;
2843     return res;
2844 }
2845 static inline int64_t
2846 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2847 {
2848     uint8_t round, shift = b & 0x3f;
2849     int64_t res;
2850 
2851     round = get_round(vxrm, a, shift);
2852     res   = (a >> shift)  + round;
2853     return res;
2854 }
2855 
2856 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2857 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2858 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2859 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2860 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2861 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2862 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2863 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2864 
2865 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2866 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2867 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2868 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2869 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2870 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2871 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2872 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2873 
2874 /* Vector Narrowing Fixed-Point Clip Instructions */
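     /*
      * vnclip/vnclipu take a 2*SEW-wide source element, shift it right by
      * the low log2(2*SEW) bits of the narrow operand with vxrm rounding,
      * and saturate the result to SEW bits, setting vxsat when it clips.
      */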
2875 static inline int8_t
2876 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2877 {
2878     uint8_t round, shift = b & 0xf;
2879     int16_t res;
2880 
2881     round = get_round(vxrm, a, shift);
2882     res   = (a >> shift)  + round;
2883     if (res > INT8_MAX) {
2884         env->vxsat = 0x1;
2885         return INT8_MAX;
2886     } else if (res < INT8_MIN) {
2887         env->vxsat = 0x1;
2888         return INT8_MIN;
2889     } else {
2890         return res;
2891     }
2892 }
2893 
2894 static inline int16_t
2895 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2896 {
2897     uint8_t round, shift = b & 0x1f;
2898     int32_t res;
2899 
2900     round = get_round(vxrm, a, shift);
2901     res   = (a >> shift)  + round;
2902     if (res > INT16_MAX) {
2903         env->vxsat = 0x1;
2904         return INT16_MAX;
2905     } else if (res < INT16_MIN) {
2906         env->vxsat = 0x1;
2907         return INT16_MIN;
2908     } else {
2909         return res;
2910     }
2911 }
2912 
2913 static inline int32_t
2914 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2915 {
2916     uint8_t round, shift = b & 0x3f;
2917     int64_t res;
2918 
2919     round = get_round(vxrm, a, shift);
2920     res   = (a >> shift)  + round;
2921     if (res > INT32_MAX) {
2922         env->vxsat = 0x1;
2923         return INT32_MAX;
2924     } else if (res < INT32_MIN) {
2925         env->vxsat = 0x1;
2926         return INT32_MIN;
2927     } else {
2928         return res;
2929     }
2930 }
2931 
2932 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2933 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2934 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2935 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2936 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2937 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2938 
2939 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2940 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2941 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2942 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2943 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2944 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2945 
2946 static inline uint8_t
2947 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2948 {
2949     uint8_t round, shift = b & 0xf;
2950     uint16_t res;
2951 
2952     round = get_round(vxrm, a, shift);
2953     res   = (a >> shift)  + round;
2954     if (res > UINT8_MAX) {
2955         env->vxsat = 0x1;
2956         return UINT8_MAX;
2957     } else {
2958         return res;
2959     }
2960 }
2961 
2962 static inline uint16_t
2963 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2964 {
2965     uint8_t round, shift = b & 0x1f;
2966     uint32_t res;
2967 
2968     round = get_round(vxrm, a, shift);
2969     res   = (a >> shift)  + round;
2970     if (res > UINT16_MAX) {
2971         env->vxsat = 0x1;
2972         return UINT16_MAX;
2973     } else {
2974         return res;
2975     }
2976 }
2977 
2978 static inline uint32_t
2979 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2980 {
2981     uint8_t round, shift = b & 0x3f;
2982     uint64_t res;
2983 
2984     round = get_round(vxrm, a, shift);
2985     res   = (a >> shift)  + round;
2986     if (res > UINT32_MAX) {
2987         env->vxsat = 0x1;
2988         return UINT32_MAX;
2989     } else {
2990         return res;
2991     }
2992 }
2993 
2994 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2995 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2996 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2997 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2998 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2999 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3000 
3001 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3002 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3003 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3004 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3005 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3006 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3007 
3008 /*
3009  *** Vector Floating-Point Arithmetic Instructions
3010  */
3011 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3012 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3013 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3014                       CPURISCVState *env)                      \
3015 {                                                              \
3016     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3017     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3018     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3019 }
3020 
3021 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3022 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3023                   void *vs2, CPURISCVState *env,          \
3024                   uint32_t desc)                          \
3025 {                                                         \
3026     uint32_t vm = vext_vm(desc);                          \
3027     uint32_t vl = env->vl;                                \
3028     uint32_t total_elems =                                \
3029         vext_get_total_elems(env, desc, ESZ);             \
3030     uint32_t vta = vext_vta(desc);                        \
3031     uint32_t i;                                           \
3032                                                           \
3033     for (i = env->vstart; i < vl; i++) {                  \
3034         if (!vm && !vext_elem_mask(v0, i)) {              \
3035             continue;                                     \
3036         }                                                 \
3037         do_##NAME(vd, vs1, vs2, i, env);                  \
3038     }                                                     \
3039     env->vstart = 0;                                      \
3040     /* set tail elements to 1s */                         \
3041     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3042                       total_elems * ESZ);                 \
3043 }
3044 
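     /*
      * RVVCALL(OPFVV2, vfadd_vv_h, ...) expands to a per-element
      * do_vfadd_vv_h() that applies float16_add to elements of vs2 and vs1,
      * and GEN_VEXT_VV_ENV(vfadd_vv_h, 2) wraps it in the masked,
      * tail-agnostic loop defined above.
      */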
3045 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3046 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3047 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3048 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3049 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3050 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3051 
3052 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3053 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3054                       CPURISCVState *env)                      \
3055 {                                                              \
3056     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3057     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3058 }
3059 
3060 #define GEN_VEXT_VF(NAME, ESZ)                            \
3061 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3062                   void *vs2, CPURISCVState *env,          \
3063                   uint32_t desc)                          \
3064 {                                                         \
3065     uint32_t vm = vext_vm(desc);                          \
3066     uint32_t vl = env->vl;                                \
3067     uint32_t total_elems =                                \
3068         vext_get_total_elems(env, desc, ESZ);             \
3069     uint32_t vta = vext_vta(desc);                        \
3070     uint32_t i;                                           \
3071                                                           \
3072     for (i = env->vstart; i < vl; i++) {                  \
3073         if (!vm && !vext_elem_mask(v0, i)) {              \
3074             continue;                                     \
3075         }                                                 \
3076         do_##NAME(vd, s1, vs2, i, env);                   \
3077     }                                                     \
3078     env->vstart = 0;                                      \
3079     /* set tail elements to 1s */                         \
3080     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3081                       total_elems * ESZ);                 \
3082 }
3083 
3084 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3085 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3086 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3087 GEN_VEXT_VF(vfadd_vf_h, 2)
3088 GEN_VEXT_VF(vfadd_vf_w, 4)
3089 GEN_VEXT_VF(vfadd_vf_d, 8)
3090 
3091 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3092 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3093 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3094 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3095 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3096 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3097 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3098 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3099 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3100 GEN_VEXT_VF(vfsub_vf_h, 2)
3101 GEN_VEXT_VF(vfsub_vf_w, 4)
3102 GEN_VEXT_VF(vfsub_vf_d, 8)
3103 
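     /*
      * OPFVF2 always calls OP(vs2[i], rs1), so vfrsub.vf (rs1 - vs2[i]) is
      * implemented with helpers that swap the operand order.
      */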
3104 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3105 {
3106     return float16_sub(b, a, s);
3107 }
3108 
3109 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3110 {
3111     return float32_sub(b, a, s);
3112 }
3113 
3114 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3115 {
3116     return float64_sub(b, a, s);
3117 }
3118 
3119 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3120 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3121 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3122 GEN_VEXT_VF(vfrsub_vf_h, 2)
3123 GEN_VEXT_VF(vfrsub_vf_w, 4)
3124 GEN_VEXT_VF(vfrsub_vf_d, 8)
3125 
3126 /* Vector Widening Floating-Point Add/Subtract Instructions */
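     /*
      * The widening forms convert each SEW-wide operand to 2*SEW before the
      * operation; the .wv/.wf variants further below take vs2 already at
      * 2*SEW and only widen the other operand.
      */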
3127 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3128 {
3129     return float32_add(float16_to_float32(a, true, s),
3130             float16_to_float32(b, true, s), s);
3131 }
3132 
3133 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3134 {
3135     return float64_add(float32_to_float64(a, s),
3136             float32_to_float64(b, s), s);
3137 
3138 }
3139 
3140 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3141 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3142 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3143 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3144 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3145 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3146 GEN_VEXT_VF(vfwadd_vf_h, 4)
3147 GEN_VEXT_VF(vfwadd_vf_w, 8)
3148 
3149 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3150 {
3151     return float32_sub(float16_to_float32(a, true, s),
3152             float16_to_float32(b, true, s), s);
3153 }
3154 
3155 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3156 {
3157     return float64_sub(float32_to_float64(a, s),
3158             float32_to_float64(b, s), s);
3159 
3160 }
3161 
3162 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3163 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3164 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3165 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3166 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3167 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3168 GEN_VEXT_VF(vfwsub_vf_h, 4)
3169 GEN_VEXT_VF(vfwsub_vf_w, 8)
3170 
3171 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3172 {
3173     return float32_add(a, float16_to_float32(b, true, s), s);
3174 }
3175 
3176 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3177 {
3178     return float64_add(a, float32_to_float64(b, s), s);
3179 }
3180 
3181 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3182 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3183 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3184 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3185 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3186 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3187 GEN_VEXT_VF(vfwadd_wf_h, 4)
3188 GEN_VEXT_VF(vfwadd_wf_w, 8)
3189 
3190 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3191 {
3192     return float32_sub(a, float16_to_float32(b, true, s), s);
3193 }
3194 
3195 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3196 {
3197     return float64_sub(a, float32_to_float64(b, s), s);
3198 }
3199 
3200 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3201 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3202 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3203 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3204 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3205 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3206 GEN_VEXT_VF(vfwsub_wf_h, 4)
3207 GEN_VEXT_VF(vfwsub_wf_w, 8)
3208 
3209 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3210 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3211 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3212 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3213 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3214 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3215 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3216 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3217 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3218 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3219 GEN_VEXT_VF(vfmul_vf_h, 2)
3220 GEN_VEXT_VF(vfmul_vf_w, 4)
3221 GEN_VEXT_VF(vfmul_vf_d, 8)
3222 
3223 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3224 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3225 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3226 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3227 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3228 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3229 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3230 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3231 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3232 GEN_VEXT_VF(vfdiv_vf_h, 2)
3233 GEN_VEXT_VF(vfdiv_vf_w, 4)
3234 GEN_VEXT_VF(vfdiv_vf_d, 8)
3235 
3236 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3237 {
3238     return float16_div(b, a, s);
3239 }
3240 
3241 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3242 {
3243     return float32_div(b, a, s);
3244 }
3245 
3246 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3247 {
3248     return float64_div(b, a, s);
3249 }
3250 
3251 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3252 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3253 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3254 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3255 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3256 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3257 
3258 /* Vector Widening Floating-Point Multiply */
3259 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3260 {
3261     return float32_mul(float16_to_float32(a, true, s),
3262             float16_to_float32(b, true, s), s);
3263 }
3264 
3265 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3266 {
3267     return float64_mul(float32_to_float64(a, s),
3268             float32_to_float64(b, s), s);
3269 
3270 }
3271 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3272 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3273 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3274 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3275 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3276 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3277 GEN_VEXT_VF(vfwmul_vf_h, 4)
3278 GEN_VEXT_VF(vfwmul_vf_w, 8)
3279 
3280 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3281 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3282 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3283         CPURISCVState *env)                                        \
3284 {                                                                  \
3285     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3286     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3287     TD d = *((TD *)vd + HD(i));                                    \
3288     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3289 }
3290 
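     /*
      * The OPFVV3/OPFVF3 helpers receive (a = vs2[i], b = vs1[i] or rs1,
      * d = vd[i]): the *macc/*msac forms compute a*b + d and a*b - d, the
      * *madd/*msub forms compute d*b + a and d*b - a, and each vfn* variant
      * negates the result of the corresponding vf* operation.
      */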
3291 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3292 {
3293     return float16_muladd(a, b, d, 0, s);
3294 }
3295 
3296 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3297 {
3298     return float32_muladd(a, b, d, 0, s);
3299 }
3300 
3301 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3302 {
3303     return float64_muladd(a, b, d, 0, s);
3304 }
3305 
3306 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3307 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3308 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3309 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3310 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3311 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3312 
3313 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3314 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3315         CPURISCVState *env)                                       \
3316 {                                                                 \
3317     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3318     TD d = *((TD *)vd + HD(i));                                   \
3319     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3320 }
3321 
3322 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3323 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3324 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3325 GEN_VEXT_VF(vfmacc_vf_h, 2)
3326 GEN_VEXT_VF(vfmacc_vf_w, 4)
3327 GEN_VEXT_VF(vfmacc_vf_d, 8)
3328 
3329 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3330 {
3331     return float16_muladd(a, b, d,
3332             float_muladd_negate_c | float_muladd_negate_product, s);
3333 }
3334 
3335 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3336 {
3337     return float32_muladd(a, b, d,
3338             float_muladd_negate_c | float_muladd_negate_product, s);
3339 }
3340 
3341 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3342 {
3343     return float64_muladd(a, b, d,
3344             float_muladd_negate_c | float_muladd_negate_product, s);
3345 }
3346 
3347 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3348 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3349 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3350 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3351 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3352 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3353 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3354 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3355 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3356 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3357 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3358 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3359 
3360 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3361 {
3362     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3363 }
3364 
3365 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3366 {
3367     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3368 }
3369 
3370 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3371 {
3372     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3373 }
3374 
3375 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3376 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3377 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3378 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3379 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3380 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3381 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3382 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3383 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3384 GEN_VEXT_VF(vfmsac_vf_h, 2)
3385 GEN_VEXT_VF(vfmsac_vf_w, 4)
3386 GEN_VEXT_VF(vfmsac_vf_d, 8)
3387 
3388 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3389 {
3390     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3391 }
3392 
3393 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3394 {
3395     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3396 }
3397 
3398 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3399 {
3400     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3401 }
3402 
3403 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3404 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3405 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3406 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3407 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3408 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3409 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3410 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3411 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3412 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3413 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3414 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3415 
3416 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3417 {
3418     return float16_muladd(d, b, a, 0, s);
3419 }
3420 
3421 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3422 {
3423     return float32_muladd(d, b, a, 0, s);
3424 }
3425 
3426 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3427 {
3428     return float64_muladd(d, b, a, 0, s);
3429 }
3430 
3431 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3432 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3433 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3434 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3435 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3436 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3437 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3438 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3439 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3440 GEN_VEXT_VF(vfmadd_vf_h, 2)
3441 GEN_VEXT_VF(vfmadd_vf_w, 4)
3442 GEN_VEXT_VF(vfmadd_vf_d, 8)
3443 
3444 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3445 {
3446     return float16_muladd(d, b, a,
3447             float_muladd_negate_c | float_muladd_negate_product, s);
3448 }
3449 
3450 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3451 {
3452     return float32_muladd(d, b, a,
3453             float_muladd_negate_c | float_muladd_negate_product, s);
3454 }
3455 
3456 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3457 {
3458     return float64_muladd(d, b, a,
3459             float_muladd_negate_c | float_muladd_negate_product, s);
3460 }
3461 
3462 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3463 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3464 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3465 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3466 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3467 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3468 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3469 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3470 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3471 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3472 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3473 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3474 
3475 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3476 {
3477     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3478 }
3479 
3480 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3481 {
3482     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3483 }
3484 
3485 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3486 {
3487     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3488 }
3489 
3490 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3491 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3492 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3493 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3494 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3495 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3496 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3497 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3498 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3499 GEN_VEXT_VF(vfmsub_vf_h, 2)
3500 GEN_VEXT_VF(vfmsub_vf_w, 4)
3501 GEN_VEXT_VF(vfmsub_vf_d, 8)
3502 
3503 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3504 {
3505     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3506 }
3507 
3508 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3509 {
3510     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3511 }
3512 
3513 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3514 {
3515     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3516 }
3517 
3518 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3519 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3520 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3521 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3522 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3523 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3524 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3525 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3526 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3527 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3528 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3529 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3530 
3531 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
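     /*
      * As with the other widening ops, a and b are first converted from SEW
      * to 2*SEW; the accumulator d is already 2*SEW wide.
      */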
3532 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3533 {
3534     return float32_muladd(float16_to_float32(a, true, s),
3535                         float16_to_float32(b, true, s), d, 0, s);
3536 }
3537 
3538 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3539 {
3540     return float64_muladd(float32_to_float64(a, s),
3541                         float32_to_float64(b, s), d, 0, s);
3542 }
3543 
3544 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3545 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3546 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3547 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3548 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3549 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3550 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3551 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3552 
3553 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3554 {
3555     return float32_muladd(float16_to_float32(a, true, s),
3556                         float16_to_float32(b, true, s), d,
3557                         float_muladd_negate_c | float_muladd_negate_product, s);
3558 }
3559 
3560 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3561 {
3562     return float64_muladd(float32_to_float64(a, s),
3563                         float32_to_float64(b, s), d,
3564                         float_muladd_negate_c | float_muladd_negate_product, s);
3565 }
3566 
3567 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3568 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3569 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3570 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3571 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3572 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3573 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3574 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3575 
3576 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3577 {
3578     return float32_muladd(float16_to_float32(a, true, s),
3579                         float16_to_float32(b, true, s), d,
3580                         float_muladd_negate_c, s);
3581 }
3582 
3583 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3584 {
3585     return float64_muladd(float32_to_float64(a, s),
3586                         float32_to_float64(b, s), d,
3587                         float_muladd_negate_c, s);
3588 }
3589 
3590 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3591 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3592 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3593 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3594 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3595 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3596 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3597 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3598 
3599 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3600 {
3601     return float32_muladd(float16_to_float32(a, true, s),
3602                         float16_to_float32(b, true, s), d,
3603                         float_muladd_negate_product, s);
3604 }
3605 
3606 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3607 {
3608     return float64_muladd(float32_to_float64(a, s),
3609                         float32_to_float64(b, s), d,
3610                         float_muladd_negate_product, s);
3611 }
3612 
3613 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3614 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3615 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3616 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3617 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3618 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3619 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3620 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3621 
3622 /* Vector Floating-Point Square-Root Instruction */
3623 /* (TD, T2, TX2) */
3624 #define OP_UU_H uint16_t, uint16_t, uint16_t
3625 #define OP_UU_W uint32_t, uint32_t, uint32_t
3626 #define OP_UU_D uint64_t, uint64_t, uint64_t
3627 
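     /*
      * OPFVV1/GEN_VEXT_V_ENV implement the unary floating-point pattern:
      * a single source vector, per-element masking, tail-agnostic handling,
      * and an early return when vl == 0.
      */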
3628 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3629 static void do_##NAME(void *vd, void *vs2, int i,      \
3630         CPURISCVState *env)                            \
3631 {                                                      \
3632     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3633     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3634 }
3635 
3636 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3637 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3638         CPURISCVState *env, uint32_t desc)             \
3639 {                                                      \
3640     uint32_t vm = vext_vm(desc);                       \
3641     uint32_t vl = env->vl;                             \
3642     uint32_t total_elems =                             \
3643         vext_get_total_elems(env, desc, ESZ);          \
3644     uint32_t vta = vext_vta(desc);                     \
3645     uint32_t i;                                        \
3646                                                        \
3647     if (vl == 0) {                                     \
3648         return;                                        \
3649     }                                                  \
3650     for (i = env->vstart; i < vl; i++) {               \
3651         if (!vm && !vext_elem_mask(v0, i)) {           \
3652             continue;                                  \
3653         }                                              \
3654         do_##NAME(vd, vs2, i, env);                    \
3655     }                                                  \
3656     env->vstart = 0;                                   \
3657     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3658                       total_elems * ESZ);              \
3659 }
3660 
3661 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3662 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3663 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3664 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3665 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3666 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3667 
3668 /*
3669  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3670  *
3671  * Adapted from riscv-v-spec recip.c:
3672  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3673  */
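     /*
      * The 7-bit estimate comes from a 128-entry table indexed by the
      * exponent's least-significant bit and the top six fraction bits; the
      * output exponent is effectively (3 * bias - 1 - exp) / 2.
      */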
3674 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3675 {
3676     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3677     uint64_t exp = extract64(f, frac_size, exp_size);
3678     uint64_t frac = extract64(f, 0, frac_size);
3679 
3680     const uint8_t lookup_table[] = {
3681         52, 51, 50, 48, 47, 46, 44, 43,
3682         42, 41, 40, 39, 38, 36, 35, 34,
3683         33, 32, 31, 30, 30, 29, 28, 27,
3684         26, 25, 24, 23, 23, 22, 21, 20,
3685         19, 19, 18, 17, 16, 16, 15, 14,
3686         14, 13, 12, 12, 11, 10, 10, 9,
3687         9, 8, 7, 7, 6, 6, 5, 4,
3688         4, 3, 3, 2, 2, 1, 1, 0,
3689         127, 125, 123, 121, 119, 118, 116, 114,
3690         113, 111, 109, 108, 106, 105, 103, 102,
3691         100, 99, 97, 96, 95, 93, 92, 91,
3692         90, 88, 87, 86, 85, 84, 83, 82,
3693         80, 79, 78, 77, 76, 75, 74, 73,
3694         72, 71, 70, 70, 69, 68, 67, 66,
3695         65, 64, 63, 63, 62, 61, 60, 59,
3696         59, 58, 57, 56, 56, 55, 54, 53
3697     };
3698     const int precision = 7;
3699 
3700     if (exp == 0 && frac != 0) { /* subnormal */
3701         /* Normalize the subnormal. */
3702         while (extract64(frac, frac_size - 1, 1) == 0) {
3703             exp--;
3704             frac <<= 1;
3705         }
3706 
3707         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3708     }
3709 
3710     int idx = ((exp & 1) << (precision - 1)) |
3711                 (frac >> (frac_size - precision + 1));
3712     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3713                             (frac_size - precision);
3714     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3715 
3716     uint64_t val = 0;
3717     val = deposit64(val, 0, frac_size, out_frac);
3718     val = deposit64(val, frac_size, exp_size, out_exp);
3719     val = deposit64(val, frac_size + exp_size, 1, sign);
3720     return val;
3721 }
3722 
3723 static float16 frsqrt7_h(float16 f, float_status *s)
3724 {
3725     int exp_size = 5, frac_size = 10;
3726     bool sign = float16_is_neg(f);
3727 
3728     /*
3729      * frsqrt7(sNaN) = canonical NaN
3730      * frsqrt7(-inf) = canonical NaN
3731      * frsqrt7(-normal) = canonical NaN
3732      * frsqrt7(-subnormal) = canonical NaN
3733      */
3734     if (float16_is_signaling_nan(f, s) ||
3735             (float16_is_infinity(f) && sign) ||
3736             (float16_is_normal(f) && sign) ||
3737             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3738         s->float_exception_flags |= float_flag_invalid;
3739         return float16_default_nan(s);
3740     }
3741 
3742     /* frsqrt7(qNaN) = canonical NaN */
3743     if (float16_is_quiet_nan(f, s)) {
3744         return float16_default_nan(s);
3745     }
3746 
3747     /* frsqrt7(+-0) = +-inf */
3748     if (float16_is_zero(f)) {
3749         s->float_exception_flags |= float_flag_divbyzero;
3750         return float16_set_sign(float16_infinity, sign);
3751     }
3752 
3753     /* frsqrt7(+inf) = +0 */
3754     if (float16_is_infinity(f) && !sign) {
3755         return float16_set_sign(float16_zero, sign);
3756     }
3757 
3758     /* +normal, +subnormal */
3759     uint64_t val = frsqrt7(f, exp_size, frac_size);
3760     return make_float16(val);
3761 }
3762 
3763 static float32 frsqrt7_s(float32 f, float_status *s)
3764 {
3765     int exp_size = 8, frac_size = 23;
3766     bool sign = float32_is_neg(f);
3767 
3768     /*
3769      * frsqrt7(sNaN) = canonical NaN
3770      * frsqrt7(-inf) = canonical NaN
3771      * frsqrt7(-normal) = canonical NaN
3772      * frsqrt7(-subnormal) = canonical NaN
3773      */
3774     if (float32_is_signaling_nan(f, s) ||
3775             (float32_is_infinity(f) && sign) ||
3776             (float32_is_normal(f) && sign) ||
3777             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3778         s->float_exception_flags |= float_flag_invalid;
3779         return float32_default_nan(s);
3780     }
3781 
3782     /* frsqrt7(qNaN) = canonical NaN */
3783     if (float32_is_quiet_nan(f, s)) {
3784         return float32_default_nan(s);
3785     }
3786 
3787     /* frsqrt7(+-0) = +-inf */
3788     if (float32_is_zero(f)) {
3789         s->float_exception_flags |= float_flag_divbyzero;
3790         return float32_set_sign(float32_infinity, sign);
3791     }
3792 
3793     /* frsqrt7(+inf) = +0 */
3794     if (float32_is_infinity(f) && !sign) {
3795         return float32_set_sign(float32_zero, sign);
3796     }
3797 
3798     /* +normal, +subnormal */
3799     uint64_t val = frsqrt7(f, exp_size, frac_size);
3800     return make_float32(val);
3801 }
3802 
3803 static float64 frsqrt7_d(float64 f, float_status *s)
3804 {
3805     int exp_size = 11, frac_size = 52;
3806     bool sign = float64_is_neg(f);
3807 
3808     /*
3809      * frsqrt7(sNaN) = canonical NaN
3810      * frsqrt7(-inf) = canonical NaN
3811      * frsqrt7(-normal) = canonical NaN
3812      * frsqrt7(-subnormal) = canonical NaN
3813      */
3814     if (float64_is_signaling_nan(f, s) ||
3815             (float64_is_infinity(f) && sign) ||
3816             (float64_is_normal(f) && sign) ||
3817             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3818         s->float_exception_flags |= float_flag_invalid;
3819         return float64_default_nan(s);
3820     }
3821 
3822     /* frsqrt7(qNaN) = canonical NaN */
3823     if (float64_is_quiet_nan(f, s)) {
3824         return float64_default_nan(s);
3825     }
3826 
3827     /* frsqrt7(+-0) = +-inf */
3828     if (float64_is_zero(f)) {
3829         s->float_exception_flags |= float_flag_divbyzero;
3830         return float64_set_sign(float64_infinity, sign);
3831     }
3832 
3833     /* frsqrt7(+inf) = +0 */
3834     if (float64_is_infinity(f) && !sign) {
3835         return float64_set_sign(float64_zero, sign);
3836     }
3837 
3838     /* +normal, +subnormal */
3839     uint64_t val = frsqrt7(f, exp_size, frac_size);
3840     return make_float64(val);
3841 }
3842 
3843 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3844 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3845 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3846 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3847 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3848 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3849 
3850 /*
3851  * Vector Floating-Point Reciprocal Estimate Instruction
3852  *
3853  * Adapted from riscv-v-spec recip.c:
3854  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3855  */
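     /*
      * frec7 indexes its 128-entry table with the top seven fraction bits
      * and produces an output exponent of 2 * bias - 1 - exp; sufficiently
      * small subnormal inputs overflow to infinity or to the largest finite
      * value, depending on the rounding mode (handled below).
      */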
3856 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3857                       float_status *s)
3858 {
3859     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3860     uint64_t exp = extract64(f, frac_size, exp_size);
3861     uint64_t frac = extract64(f, 0, frac_size);
3862 
3863     const uint8_t lookup_table[] = {
3864         127, 125, 123, 121, 119, 117, 116, 114,
3865         112, 110, 109, 107, 105, 104, 102, 100,
3866         99, 97, 96, 94, 93, 91, 90, 88,
3867         87, 85, 84, 83, 81, 80, 79, 77,
3868         76, 75, 74, 72, 71, 70, 69, 68,
3869         66, 65, 64, 63, 62, 61, 60, 59,
3870         58, 57, 56, 55, 54, 53, 52, 51,
3871         50, 49, 48, 47, 46, 45, 44, 43,
3872         42, 41, 40, 40, 39, 38, 37, 36,
3873         35, 35, 34, 33, 32, 31, 31, 30,
3874         29, 28, 28, 27, 26, 25, 25, 24,
3875         23, 23, 22, 21, 21, 20, 19, 19,
3876         18, 17, 17, 16, 15, 15, 14, 14,
3877         13, 12, 12, 11, 11, 10, 9, 9,
3878         8, 8, 7, 7, 6, 5, 5, 4,
3879         4, 3, 3, 2, 2, 1, 1, 0
3880     };
3881     const int precision = 7;
3882 
3883     if (exp == 0 && frac != 0) { /* subnormal */
3884         /* Normalize the subnormal. */
3885         while (extract64(frac, frac_size - 1, 1) == 0) {
3886             exp--;
3887             frac <<= 1;
3888         }
3889 
3890         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3891 
3892         if (exp != 0 && exp != UINT64_MAX) {
3893             /*
3894              * Overflow to inf or max value of same sign,
3895              * depending on sign and rounding mode.
3896              */
3897             s->float_exception_flags |= (float_flag_inexact |
3898                                          float_flag_overflow);
3899 
3900             if ((s->float_rounding_mode == float_round_to_zero) ||
3901                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3902                 ((s->float_rounding_mode == float_round_up) && sign)) {
3903                 /* Return the largest-magnitude finite value of the same sign. */
3904                 return (sign << (exp_size + frac_size)) |
3905                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3906             } else {
3907                 /* Return +-inf. */
3908                 return (sign << (exp_size + frac_size)) |
3909                     MAKE_64BIT_MASK(frac_size, exp_size);
3910             }
3911         }
3912     }
3913 
3914     int idx = frac >> (frac_size - precision);
3915     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3916                             (frac_size - precision);
3917     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3918 
3919     if (out_exp == 0 || out_exp == UINT64_MAX) {
3920         /*
3921          * The result is subnormal, but don't raise the underflow exception,
3922          * because there's no additional loss of precision.
3923          */
3924         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3925         if (out_exp == UINT64_MAX) {
3926             out_frac >>= 1;
3927             out_exp = 0;
3928         }
3929     }
3930 
3931     uint64_t val = 0;
3932     val = deposit64(val, 0, frac_size, out_frac);
3933     val = deposit64(val, frac_size, exp_size, out_exp);
3934     val = deposit64(val, frac_size + exp_size, 1, sign);
3935     return val;
3936 }
3937 
3938 static float16 frec7_h(float16 f, float_status *s)
3939 {
3940     int exp_size = 5, frac_size = 10;
3941     bool sign = float16_is_neg(f);
3942 
3943     /* frec7(+-inf) = +-0 */
3944     if (float16_is_infinity(f)) {
3945         return float16_set_sign(float16_zero, sign);
3946     }
3947 
3948     /* frec7(+-0) = +-inf */
3949     if (float16_is_zero(f)) {
3950         s->float_exception_flags |= float_flag_divbyzero;
3951         return float16_set_sign(float16_infinity, sign);
3952     }
3953 
3954     /* frec7(sNaN) = canonical NaN */
3955     if (float16_is_signaling_nan(f, s)) {
3956         s->float_exception_flags |= float_flag_invalid;
3957         return float16_default_nan(s);
3958     }
3959 
3960     /* frec7(qNaN) = canonical NaN */
3961     if (float16_is_quiet_nan(f, s)) {
3962         return float16_default_nan(s);
3963     }
3964 
3965     /* +-normal, +-subnormal */
3966     uint64_t val = frec7(f, exp_size, frac_size, s);
3967     return make_float16(val);
3968 }
3969 
3970 static float32 frec7_s(float32 f, float_status *s)
3971 {
3972     int exp_size = 8, frac_size = 23;
3973     bool sign = float32_is_neg(f);
3974 
3975     /* frec7(+-inf) = +-0 */
3976     if (float32_is_infinity(f)) {
3977         return float32_set_sign(float32_zero, sign);
3978     }
3979 
3980     /* frec7(+-0) = +-inf */
3981     if (float32_is_zero(f)) {
3982         s->float_exception_flags |= float_flag_divbyzero;
3983         return float32_set_sign(float32_infinity, sign);
3984     }
3985 
3986     /* frec7(sNaN) = canonical NaN */
3987     if (float32_is_signaling_nan(f, s)) {
3988         s->float_exception_flags |= float_flag_invalid;
3989         return float32_default_nan(s);
3990     }
3991 
3992     /* frec7(qNaN) = canonical NaN */
3993     if (float32_is_quiet_nan(f, s)) {
3994         return float32_default_nan(s);
3995     }
3996 
3997     /* +-normal, +-subnormal */
3998     uint64_t val = frec7(f, exp_size, frac_size, s);
3999     return make_float32(val);
4000 }
4001 
4002 static float64 frec7_d(float64 f, float_status *s)
4003 {
4004     int exp_size = 11, frac_size = 52;
4005     bool sign = float64_is_neg(f);
4006 
4007     /* frec7(+-inf) = +-0 */
4008     if (float64_is_infinity(f)) {
4009         return float64_set_sign(float64_zero, sign);
4010     }
4011 
4012     /* frec7(+-0) = +-inf */
4013     if (float64_is_zero(f)) {
4014         s->float_exception_flags |= float_flag_divbyzero;
4015         return float64_set_sign(float64_infinity, sign);
4016     }
4017 
4018     /* frec7(sNaN) = canonical NaN */
4019     if (float64_is_signaling_nan(f, s)) {
4020         s->float_exception_flags |= float_flag_invalid;
4021         return float64_default_nan(s);
4022     }
4023 
4024     /* frec7(qNaN) = canonical NaN */
4025     if (float64_is_quiet_nan(f, s)) {
4026         return float64_default_nan(s);
4027     }
4028 
4029     /* +-normal, +-subnormal */
4030     uint64_t val = frec7(f, exp_size, frac_size, s);
4031     return make_float64(val);
4032 }
4033 
4034 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4035 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4036 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4037 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4038 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4039 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4040 
4041 /* Vector Floating-Point MIN/MAX Instructions */
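     /*
      * vfmin/vfmax use the minimumNumber/maximumNumber semantics of
      * IEEE 754-2019: when exactly one operand is a NaN, the other operand
      * is returned.
      */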
4042 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4043 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4044 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4045 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4046 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4047 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4048 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4049 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4050 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4051 GEN_VEXT_VF(vfmin_vf_h, 2)
4052 GEN_VEXT_VF(vfmin_vf_w, 4)
4053 GEN_VEXT_VF(vfmin_vf_d, 8)
4054 
4055 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4056 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4057 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4058 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4059 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4060 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4061 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4062 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4063 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4064 GEN_VEXT_VF(vfmax_vf_h, 2)
4065 GEN_VEXT_VF(vfmax_vf_w, 4)
4066 GEN_VEXT_VF(vfmax_vf_d, 8)
4067 
4068 /* Vector Floating-Point Sign-Injection Instructions */
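     /*
      * Each result takes its exponent and fraction from operand 'a' and its
      * sign from 'b' (fsgnj*), from the complement of 'b' (fsgnjn*), or from
      * the XOR of both signs (fsgnjx*): deposit64(x, 0, N-1, a) keeps bit
      * N-1 of 'x' as the sign and inserts the low N-1 bits of 'a'.
      */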
4069 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4070 {
4071     return deposit64(b, 0, 15, a);
4072 }
4073 
4074 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4075 {
4076     return deposit64(b, 0, 31, a);
4077 }
4078 
4079 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4080 {
4081     return deposit64(b, 0, 63, a);
4082 }
4083 
4084 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4085 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4086 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4087 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4088 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4089 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4090 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4091 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4092 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4093 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4094 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4095 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4096 
4097 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4098 {
4099     return deposit64(~b, 0, 15, a);
4100 }
4101 
4102 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4103 {
4104     return deposit64(~b, 0, 31, a);
4105 }
4106 
4107 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4108 {
4109     return deposit64(~b, 0, 63, a);
4110 }
4111 
4112 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4113 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4114 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4115 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4116 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4117 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4118 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4119 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4120 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4121 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4122 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4123 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4124 
4125 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4126 {
4127     return deposit64(b ^ a, 0, 15, a);
4128 }
4129 
4130 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4131 {
4132     return deposit64(b ^ a, 0, 31, a);
4133 }
4134 
4135 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4136 {
4137     return deposit64(b ^ a, 0, 63, a);
4138 }
4139 
4140 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4141 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4142 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4143 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4144 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4145 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4146 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4147 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4148 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4149 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4150 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4151 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4152 
4153 /* Vector Floating-Point Compare Instructions */
4154 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4155 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4156                   CPURISCVState *env, uint32_t desc)          \
4157 {                                                             \
4158     uint32_t vm = vext_vm(desc);                              \
4159     uint32_t vl = env->vl;                                    \
4160     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
4161     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4162     uint32_t i;                                               \
4163                                                               \
4164     for (i = env->vstart; i < vl; i++) {                      \
4165         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4166         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4167         if (!vm && !vext_elem_mask(v0, i)) {                  \
4168             continue;                                         \
4169         }                                                     \
4170         vext_set_elem_mask(vd, i,                             \
4171                            DO_OP(s2, s1, &env->fp_status));   \
4172     }                                                         \
4173     env->vstart = 0;                                          \
4174     /* mask destination registers are always tail-agnostic */ \
4175     /* set tail elements to 1s */                             \
4176     if (vta_all_1s) {                                         \
4177         for (; i < total_elems; i++) {                        \
4178             vext_set_elem_mask(vd, i, 1);                     \
4179         }                                                     \
4180     }                                                         \
4181 }
4182 
4183 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4184 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4185 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4186 
4187 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4188 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4189                   CPURISCVState *env, uint32_t desc)                \
4190 {                                                                   \
4191     uint32_t vm = vext_vm(desc);                                    \
4192     uint32_t vl = env->vl;                                          \
4193     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
4194     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4195     uint32_t i;                                                     \
4196                                                                     \
4197     for (i = env->vstart; i < vl; i++) {                            \
4198         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4199         if (!vm && !vext_elem_mask(v0, i)) {                        \
4200             continue;                                               \
4201         }                                                           \
4202         vext_set_elem_mask(vd, i,                                   \
4203                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4204     }                                                               \
4205     env->vstart = 0;                                                \
4206     /* mask destination registers are always tail-agnostic */       \
4207     /* set tail elements to 1s */                                   \
4208     if (vta_all_1s) {                                               \
4209         for (; i < total_elems; i++) {                              \
4210             vext_set_elem_mask(vd, i, 1);                           \
4211         }                                                           \
4212     }                                                               \
4213 }
4214 
4215 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4216 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4217 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4218 
4219 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4220 {
4221     FloatRelation compare = float16_compare_quiet(a, b, s);
4222     return compare != float_relation_equal;
4223 }
4224 
4225 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4226 {
4227     FloatRelation compare = float32_compare_quiet(a, b, s);
4228     return compare != float_relation_equal;
4229 }
4230 
4231 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4232 {
4233     FloatRelation compare = float64_compare_quiet(a, b, s);
4234     return compare != float_relation_equal;
4235 }
4236 
4237 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4238 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4239 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4240 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4241 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4242 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4243 
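     /*
      * vmflt/vmfle (and vmfgt/vmfge below) use the signaling compare
      * variants, which raise the invalid flag if either operand is any NaN,
      * while vmfeq/vmfne above use quiet compares that only raise invalid
      * for signaling NaNs, mirroring the scalar FEQ/FLT/FLE behaviour.
      */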
4244 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4245 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4246 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4247 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4248 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4249 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4250 
4251 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4252 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4253 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4254 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4255 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4256 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4257 
4258 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4259 {
4260     FloatRelation compare = float16_compare(a, b, s);
4261     return compare == float_relation_greater;
4262 }
4263 
4264 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4265 {
4266     FloatRelation compare = float32_compare(a, b, s);
4267     return compare == float_relation_greater;
4268 }
4269 
4270 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4271 {
4272     FloatRelation compare = float64_compare(a, b, s);
4273     return compare == float_relation_greater;
4274 }
4275 
4276 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4277 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4278 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4279 
4280 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4281 {
4282     FloatRelation compare = float16_compare(a, b, s);
4283     return compare == float_relation_greater ||
4284            compare == float_relation_equal;
4285 }
4286 
4287 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4288 {
4289     FloatRelation compare = float32_compare(a, b, s);
4290     return compare == float_relation_greater ||
4291            compare == float_relation_equal;
4292 }
4293 
4294 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4295 {
4296     FloatRelation compare = float64_compare(a, b, s);
4297     return compare == float_relation_greater ||
4298            compare == float_relation_equal;
4299 }
4300 
4301 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4302 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4303 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4304 
4305 /* Vector Floating-Point Classify Instruction */
4306 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4307 static void do_##NAME(void *vd, void *vs2, int i)      \
4308 {                                                      \
4309     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4310     *((TD *)vd + HD(i)) = OP(s2);                      \
4311 }
4312 
4313 #define GEN_VEXT_V(NAME, ESZ)                          \
4314 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4315                   CPURISCVState *env, uint32_t desc)   \
4316 {                                                      \
4317     uint32_t vm = vext_vm(desc);                       \
4318     uint32_t vl = env->vl;                             \
4319     uint32_t total_elems =                             \
4320         vext_get_total_elems(env, desc, ESZ);          \
4321     uint32_t vta = vext_vta(desc);                     \
4322     uint32_t i;                                        \
4323                                                        \
4324     for (i = env->vstart; i < vl; i++) {               \
4325         if (!vm && !vext_elem_mask(v0, i)) {           \
4326             continue;                                  \
4327         }                                              \
4328         do_##NAME(vd, vs2, i);                         \
4329     }                                                  \
4330     env->vstart = 0;                                   \
4331     /* set tail elements to 1s */                      \
4332     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4333                       total_elems * ESZ);              \
4334 }
4335 
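     /*
      * The result uses the scalar FCLASS bit encoding:
      *   bit 0: -infinity           bit 5: positive subnormal
      *   bit 1: negative normal     bit 6: positive normal
      *   bit 2: negative subnormal  bit 7: +infinity
      *   bit 3: -0                  bit 8: signaling NaN
      *   bit 4: +0                  bit 9: quiet NaN
      */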
4336 target_ulong fclass_h(uint64_t frs1)
4337 {
4338     float16 f = frs1;
4339     bool sign = float16_is_neg(f);
4340 
4341     if (float16_is_infinity(f)) {
4342         return sign ? 1 << 0 : 1 << 7;
4343     } else if (float16_is_zero(f)) {
4344         return sign ? 1 << 3 : 1 << 4;
4345     } else if (float16_is_zero_or_denormal(f)) {
4346         return sign ? 1 << 2 : 1 << 5;
4347     } else if (float16_is_any_nan(f)) {
4348         float_status s = { }; /* for snan_bit_is_one */
4349         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4350     } else {
4351         return sign ? 1 << 1 : 1 << 6;
4352     }
4353 }
4354 
4355 target_ulong fclass_s(uint64_t frs1)
4356 {
4357     float32 f = frs1;
4358     bool sign = float32_is_neg(f);
4359 
4360     if (float32_is_infinity(f)) {
4361         return sign ? 1 << 0 : 1 << 7;
4362     } else if (float32_is_zero(f)) {
4363         return sign ? 1 << 3 : 1 << 4;
4364     } else if (float32_is_zero_or_denormal(f)) {
4365         return sign ? 1 << 2 : 1 << 5;
4366     } else if (float32_is_any_nan(f)) {
4367         float_status s = { }; /* for snan_bit_is_one */
4368         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4369     } else {
4370         return sign ? 1 << 1 : 1 << 6;
4371     }
4372 }
4373 
4374 target_ulong fclass_d(uint64_t frs1)
4375 {
4376     float64 f = frs1;
4377     bool sign = float64_is_neg(f);
4378 
4379     if (float64_is_infinity(f)) {
4380         return sign ? 1 << 0 : 1 << 7;
4381     } else if (float64_is_zero(f)) {
4382         return sign ? 1 << 3 : 1 << 4;
4383     } else if (float64_is_zero_or_denormal(f)) {
4384         return sign ? 1 << 2 : 1 << 5;
4385     } else if (float64_is_any_nan(f)) {
4386         float_status s = { }; /* for snan_bit_is_one */
4387         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4388     } else {
4389         return sign ? 1 << 1 : 1 << 6;
4390     }
4391 }
4392 
4393 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4394 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4395 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4396 GEN_VEXT_V(vfclass_v_h, 2)
4397 GEN_VEXT_V(vfclass_v_w, 4)
4398 GEN_VEXT_V(vfclass_v_d, 8)
4399 
4400 /* Vector Floating-Point Merge Instruction */
4401 
4402 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4403 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4404                   CPURISCVState *env, uint32_t desc)          \
4405 {                                                             \
4406     uint32_t vm = vext_vm(desc);                              \
4407     uint32_t vl = env->vl;                                    \
4408     uint32_t esz = sizeof(ETYPE);                             \
4409     uint32_t total_elems =                                    \
4410         vext_get_total_elems(env, desc, esz);                 \
4411     uint32_t vta = vext_vta(desc);                            \
4412     uint32_t i;                                               \
4413                                                               \
4414     for (i = env->vstart; i < vl; i++) {                      \
4415         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4416         *((ETYPE *)vd + H(i))                                 \
4417           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4418     }                                                         \
4419     env->vstart = 0;                                          \
4420     /* set tail elements to 1s */                             \
4421     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4422 }
4423 
4424 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4425 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4426 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4427 
4428 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4429 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4430 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4431 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4432 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4433 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4434 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4435 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4436 
4437 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4438 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4439 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4440 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4441 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4442 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4443 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4444 
4445 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4446 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4447 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4448 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4449 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4450 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4451 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4452 
4453 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4454 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4455 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4456 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4457 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4458 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4459 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4460 
4461 /* Widening Floating-Point/Integer Type-Convert Instructions */
4462 /* (TD, T2, TX2) */
4463 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4464 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4465 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4466 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4467 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4468 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4469 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4470 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4471 
4472 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4473 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4474 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4475 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4476 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4477 
4478 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4479 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4480 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4481 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4482 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4483 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4484 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4485 
4486 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4487 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4488 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4489 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4490 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4491 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4492 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4493 
4494 /*
4495  * vfwcvt.f.f.v vd, vs2, vm
4496  * Convert single-width float to double-width float.
4497  */
4498 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4499 {
4500     return float16_to_float32(a, true, s);
4501 }
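     /*
      * The 'true' flag selects IEEE half-precision (rather than the Arm
      * alternative half-precision format); the narrowing vfncvtffv16()
      * below passes the same flag.
      */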
4502 
4503 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4504 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4505 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4506 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4507 
4508 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4509 /* (TD, T2, TX2) */
4510 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4511 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4512 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4513 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer */
4514 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4515 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4516 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4517 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4518 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4519 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4520 
4521 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4522 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4523 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4524 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4525 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4526 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4527 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4528 
4529 /* vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float */
4530 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4531 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4532 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4533 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4534 
4535 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4536 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4537 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4538 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4539 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4540 
4541 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4542 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4543 {
4544     return float32_to_float16(a, true, s);
4545 }
4546 
4547 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4548 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4549 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4550 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4551 
4552 /*
4553  *** Vector Reduction Operations
4554  */
4555 /* Vector Single-Width Integer Reduction Instructions */
4556 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4557 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4558         void *vs2, CPURISCVState *env, uint32_t desc)     \
4559 {                                                         \
4560     uint32_t vm = vext_vm(desc);                          \
4561     uint32_t vl = env->vl;                                \
4562     uint32_t esz = sizeof(TD);                            \
4563     uint32_t vlenb = simd_maxsz(desc);                    \
4564     uint32_t vta = vext_vta(desc);                        \
4565     uint32_t i;                                           \
4566     TD s1 =  *((TD *)vs1 + HD(0));                        \
4567                                                           \
4568     for (i = env->vstart; i < vl; i++) {                  \
4569         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4570         if (!vm && !vext_elem_mask(v0, i)) {              \
4571             continue;                                     \
4572         }                                                 \
4573         s1 = OP(s1, (TD)s2);                              \
4574     }                                                     \
4575     *((TD *)vd + HD(0)) = s1;                             \
4576     env->vstart = 0;                                      \
4577     /* set tail elements to 1s */                         \
4578     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4579 }
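     /*
      * Reductions only produce element 0 of vd; everything past the first
      * element is treated as tail, which is why the tail-agnostic fill above
      * runs from esz up to vlenb rather than from vl * esz.
      */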
4580 
4581 /* vd[0] = sum(vs1[0], vs2[*]) */
4582 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4583 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4584 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4585 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4586 
4587 /* vd[0] = maxu(vs1[0], vs2[*]) */
4588 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4589 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4590 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4591 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4592 
4593 /* vd[0] = max(vs1[0], vs2[*]) */
4594 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4595 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4596 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4597 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4598 
4599 /* vd[0] = minu(vs1[0], vs2[*]) */
4600 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4601 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4602 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4603 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4604 
4605 /* vd[0] = min(vs1[0], vs2[*]) */
4606 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4607 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4608 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4609 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4610 
4611 /* vd[0] = and(vs1[0], vs2[*]) */
4612 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4613 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4614 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4615 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4616 
4617 /* vd[0] = or(vs1[0], vs2[*]) */
4618 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4619 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4620 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4621 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4622 
4623 /* vd[0] = xor(vs1[0], vs2[*]) */
4624 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4625 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4626 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4627 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4628 
4629 /* Vector Widening Integer Reduction Instructions */
4630 /* Signed sum reduction into double-width accumulator */
4631 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4632 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4633 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4634 
4635 /* Unsigned sum reduction into double-width accumulator */
4636 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4637 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4638 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4639 
4640 /* Vector Single-Width Floating-Point Reduction Instructions */
4641 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4642 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4643                   void *vs2, CPURISCVState *env,           \
4644                   uint32_t desc)                           \
4645 {                                                          \
4646     uint32_t vm = vext_vm(desc);                           \
4647     uint32_t vl = env->vl;                                 \
4648     uint32_t esz = sizeof(TD);                             \
4649     uint32_t vlenb = simd_maxsz(desc);                     \
4650     uint32_t vta = vext_vta(desc);                         \
4651     uint32_t i;                                            \
4652     TD s1 =  *((TD *)vs1 + HD(0));                         \
4653                                                            \
4654     for (i = env->vstart; i < vl; i++) {                   \
4655         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4656         if (!vm && !vext_elem_mask(v0, i)) {               \
4657             continue;                                      \
4658         }                                                  \
4659         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4660     }                                                      \
4661     *((TD *)vd + HD(0)) = s1;                              \
4662     env->vstart = 0;                                       \
4663     /* set tail elements to 1s */                          \
4664     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4665 }
4666 
4667 /* Unordered sum */
4668 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4669 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4670 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
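     /*
      * The spec leaves the association order of the unordered sum
      * unspecified; these helpers simply accumulate in element order,
      * starting from vs1[0].
      */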
4671 
4672 /* Maximum value */
4673 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4674 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4675 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4676 
4677 /* Minimum value */
4678 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4679 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4680 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4681 
4682 /* Vector Widening Floating-Point Reduction Instructions */
4683 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4684 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4685                             void *vs2, CPURISCVState *env, uint32_t desc)
4686 {
4687     uint32_t vm = vext_vm(desc);
4688     uint32_t vl = env->vl;
4689     uint32_t esz = sizeof(uint32_t);
4690     uint32_t vlenb = simd_maxsz(desc);
4691     uint32_t vta = vext_vta(desc);
4692     uint32_t i;
4693     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4694 
4695     for (i = env->vstart; i < vl; i++) {
4696         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4697         if (!vm && !vext_elem_mask(v0, i)) {
4698             continue;
4699         }
4700         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4701                          &env->fp_status);
4702     }
4703     *((uint32_t *)vd + H4(0)) = s1;
4704     env->vstart = 0;
4705     /* set tail elements to 1s */
4706     vext_set_elems_1s(vd, vta, esz, vlenb);
4707 }
4708 
4709 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4710                             void *vs2, CPURISCVState *env, uint32_t desc)
4711 {
4712     uint32_t vm = vext_vm(desc);
4713     uint32_t vl = env->vl;
4714     uint32_t esz = sizeof(uint64_t);
4715     uint32_t vlenb = simd_maxsz(desc);
4716     uint32_t vta = vext_vta(desc);
4717     uint32_t i;
4718     uint64_t s1 =  *((uint64_t *)vs1);
4719 
4720     for (i = env->vstart; i < vl; i++) {
4721         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4722         if (!vm && !vext_elem_mask(v0, i)) {
4723             continue;
4724         }
4725         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4726                          &env->fp_status);
4727     }
4728     *((uint64_t *)vd) = s1;
4729     env->vstart = 0;
4730     /* set tail elements to 1s */
4731     vext_set_elems_1s(vd, vta, esz, vlenb);
4732 }
4733 
4734 /*
4735  *** Vector Mask Operations
4736  */
4737 /* Vector Mask-Register Logical Instructions */
4738 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4739 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4740                   void *vs2, CPURISCVState *env,          \
4741                   uint32_t desc)                          \
4742 {                                                         \
4743     uint32_t vl = env->vl;                                \
4744     uint32_t total_elems = env_archcpu(env)->cfg.vlen;    \
4745     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4746     uint32_t i;                                           \
4747     int a, b;                                             \
4748                                                           \
4749     for (i = env->vstart; i < vl; i++) {                  \
4750         a = vext_elem_mask(vs1, i);                       \
4751         b = vext_elem_mask(vs2, i);                       \
4752         vext_set_elem_mask(vd, i, OP(b, a));              \
4753     }                                                     \
4754     env->vstart = 0;                                      \
4755     /* mask destination registers are always tail-        \
4756      * agnostic                                           \
4757      */                                                   \
4758     /* set tail elements to 1s */                         \
4759     if (vta_all_1s) {                                     \
4760         for (; i < total_elems; i++) {                    \
4761             vext_set_elem_mask(vd, i, 1);                 \
4762         }                                                 \
4763     }                                                     \
4764 }
4765 
4766 #define DO_NAND(N, M)  (!(N & M))
4767 #define DO_ANDNOT(N, M)  (N & !M)
4768 #define DO_NOR(N, M)  (!(N | M))
4769 #define DO_ORNOT(N, M)  (N | !M)
4770 #define DO_XNOR(N, M)  (!(N ^ M))
4771 
4772 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4773 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4774 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4775 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4776 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4777 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4778 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4779 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4780 
4781 /* Vector count population in mask vcpop */
4782 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4783                              uint32_t desc)
4784 {
4785     target_ulong cnt = 0;
4786     uint32_t vm = vext_vm(desc);
4787     uint32_t vl = env->vl;
4788     int i;
4789 
4790     for (i = env->vstart; i < vl; i++) {
4791         if (vm || vext_elem_mask(v0, i)) {
4792             if (vext_elem_mask(vs2, i)) {
4793                 cnt++;
4794             }
4795         }
4796     }
4797     env->vstart = 0;
4798     return cnt;
4799 }
4800 
4801 /* vfirst find-first-set mask bit */
4802 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4803                               uint32_t desc)
4804 {
4805     uint32_t vm = vext_vm(desc);
4806     uint32_t vl = env->vl;
4807     int i;
4808 
4809     for (i = env->vstart; i < vl; i++) {
4810         if (vm || vext_elem_mask(v0, i)) {
4811             if (vext_elem_mask(vs2, i)) {
4812                 return i;
4813             }
4814         }
4815     }
4816     env->vstart = 0;
4817     return -1LL;
4818 }
4819 
4820 enum set_mask_type {
4821     ONLY_FIRST = 1,
4822     INCLUDE_FIRST,
4823     BEFORE_FIRST,
4824 };
4825 
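     /*
      * Given the first active set bit of vs2 at position k, the three
      * variants produce (example with all elements active, vs2 = 0,0,1,0,1):
      *   BEFORE_FIRST  (vmsbf): 1,1,0,0,0  - set elements before k
      *   INCLUDE_FIRST (vmsif): 1,1,1,0,0  - set elements up to and including k
      *   ONLY_FIRST    (vmsof): 0,0,1,0,0  - set only element k
      */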
4826 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4827                    uint32_t desc, enum set_mask_type type)
4828 {
4829     uint32_t vm = vext_vm(desc);
4830     uint32_t vl = env->vl;
4831     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4832     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4833     int i;
4834     bool first_mask_bit = false;
4835 
4836     for (i = env->vstart; i < vl; i++) {
4837         if (!vm && !vext_elem_mask(v0, i)) {
4838             continue;
4839         }
4840         /* write a zero to all following active elements */
4841         if (first_mask_bit) {
4842             vext_set_elem_mask(vd, i, 0);
4843             continue;
4844         }
4845         if (vext_elem_mask(vs2, i)) {
4846             first_mask_bit = true;
4847             if (type == BEFORE_FIRST) {
4848                 vext_set_elem_mask(vd, i, 0);
4849             } else {
4850                 vext_set_elem_mask(vd, i, 1);
4851             }
4852         } else {
4853             if (type == ONLY_FIRST) {
4854                 vext_set_elem_mask(vd, i, 0);
4855             } else {
4856                 vext_set_elem_mask(vd, i, 1);
4857             }
4858         }
4859     }
4860     env->vstart = 0;
4861     /* mask destination registers are always tail-agnostic */
4862     /* set tail elements to 1s */
4863     if (vta_all_1s) {
4864         for (; i < total_elems; i++) {
4865             vext_set_elem_mask(vd, i, 1);
4866         }
4867     }
4868 }
4869 
4870 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4871                      uint32_t desc)
4872 {
4873     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4874 }
4875 
4876 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4877                      uint32_t desc)
4878 {
4879     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4880 }
4881 
4882 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4883                      uint32_t desc)
4884 {
4885     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4886 }
4887 
4888 /* Vector Iota Instruction */
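     /*
      * viota.m writes to each active element the count of set mask bits in
      * vs2 at indices below it, e.g. vs2 = 1,0,0,1,0,0,0,1 yields
      * vd = 0,1,1,1,2,2,2,2.
      */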
4889 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4890 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4891                   uint32_t desc)                                          \
4892 {                                                                         \
4893     uint32_t vm = vext_vm(desc);                                          \
4894     uint32_t vl = env->vl;                                                \
4895     uint32_t esz = sizeof(ETYPE);                                         \
4896     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4897     uint32_t vta = vext_vta(desc);                                        \
4898     uint32_t sum = 0;                                                     \
4899     int i;                                                                \
4900                                                                           \
4901     for (i = env->vstart; i < vl; i++) {                                  \
4902         if (!vm && !vext_elem_mask(v0, i)) {                              \
4903             continue;                                                     \
4904         }                                                                 \
4905         *((ETYPE *)vd + H(i)) = sum;                                      \
4906         if (vext_elem_mask(vs2, i)) {                                     \
4907             sum++;                                                        \
4908         }                                                                 \
4909     }                                                                     \
4910     env->vstart = 0;                                                      \
4911     /* set tail elements to 1s */                                         \
4912     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4913 }
4914 
4915 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4916 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4917 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4918 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4919 
4920 /* Vector Element Index Instruction */
4921 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4922 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4923 {                                                                         \
4924     uint32_t vm = vext_vm(desc);                                          \
4925     uint32_t vl = env->vl;                                                \
4926     uint32_t esz = sizeof(ETYPE);                                         \
4927     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4928     uint32_t vta = vext_vta(desc);                                        \
4929     int i;                                                                \
4930                                                                           \
4931     for (i = env->vstart; i < vl; i++) {                                  \
4932         if (!vm && !vext_elem_mask(v0, i)) {                              \
4933             continue;                                                     \
4934         }                                                                 \
4935         *((ETYPE *)vd + H(i)) = i;                                        \
4936     }                                                                     \
4937     env->vstart = 0;                                                      \
4938     /* set tail elements to 1s */                                         \
4939     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4940 }
4941 
4942 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4943 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4944 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4945 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4946 
4947 /*
4948  *** Vector Permutation Instructions
4949  */
4950 
4951 /* Vector Slide Instructions */
4952 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4953 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4954                   CPURISCVState *env, uint32_t desc)                      \
4955 {                                                                         \
4956     uint32_t vm = vext_vm(desc);                                          \
4957     uint32_t vl = env->vl;                                                \
4958     uint32_t esz = sizeof(ETYPE);                                         \
4959     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4960     uint32_t vta = vext_vta(desc);                                        \
4961     target_ulong offset = s1, i_min, i;                                   \
4962                                                                           \
4963     i_min = MAX(env->vstart, offset);                                     \
4964     for (i = i_min; i < vl; i++) {                                        \
4965         if (!vm && !vext_elem_mask(v0, i)) {                              \
4966             continue;                                                     \
4967         }                                                                 \
4968         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4969     }                                                                     \
         env->vstart = 0;                                                      \
4970     /* set tail elements to 1s */                                         \
4971     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4972 }
4973 
4974 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4975 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4976 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4977 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4978 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4979 
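     /*
      * For vslidedown, reads of vs2[i + rs1] are only architecturally
      * defined while i + rs1 < VLMAX; i_max bounds the copy loop
      * accordingly, and active body elements from i_max up to vl are
      * written as zero.
      */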
4980 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4981 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4982                   CPURISCVState *env, uint32_t desc)                      \
4983 {                                                                         \
4984     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4985     uint32_t vm = vext_vm(desc);                                          \
4986     uint32_t vl = env->vl;                                                \
4987     uint32_t esz = sizeof(ETYPE);                                         \
4988     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4989     uint32_t vta = vext_vta(desc);                                        \
4990     target_ulong i_max, i;                                                \
4991                                                                           \
4992     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4993     for (i = env->vstart; i < i_max; ++i) {                               \
4994         if (vm || vext_elem_mask(v0, i)) {                                \
4995             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4996         }                                                                 \
4997     }                                                                     \
4998                                                                           \
4999     for (i = i_max; i < vl; ++i) {                                        \
5000         if (vm || vext_elem_mask(v0, i)) {                                \
5001             *((ETYPE *)vd + H(i)) = 0;                                    \
5002         }                                                                 \
5003     }                                                                     \
5004                                                                           \
5005     env->vstart = 0;                                                      \
5006     /* set tail elements to 1s */                                         \
5007     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5008 }
5009 
5010 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5011 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5012 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5013 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5014 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5015 
5016 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5017 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5018                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5019 {                                                                           \
5020     typedef uint##BITWIDTH##_t ETYPE;                                       \
5021     uint32_t vm = vext_vm(desc);                                            \
5022     uint32_t vl = env->vl;                                                  \
5023     uint32_t esz = sizeof(ETYPE);                                           \
5024     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5025     uint32_t vta = vext_vta(desc);                                          \
5026     uint32_t i;                                                             \
5027                                                                             \
5028     for (i = env->vstart; i < vl; i++) {                                    \
5029         if (!vm && !vext_elem_mask(v0, i)) {                                \
5030             continue;                                                       \
5031         }                                                                   \
5032         if (i == 0) {                                                       \
5033             *((ETYPE *)vd + H(i)) = s1;                                     \
5034         } else {                                                            \
5035             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5036         }                                                                   \
5037     }                                                                       \
5038     env->vstart = 0;                                                        \
5039     /* set tail elements to 1s */                                           \
5040     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5041 }
5042 
5043 GEN_VEXT_VSLIE1UP(8,  H1)
5044 GEN_VEXT_VSLIE1UP(16, H2)
5045 GEN_VEXT_VSLIE1UP(32, H4)
5046 GEN_VEXT_VSLIE1UP(64, H8)
5047 
5048 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5049 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5050                   CPURISCVState *env, uint32_t desc)              \
5051 {                                                                 \
5052     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5053 }
5054 
5055 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5056 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5057 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5058 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5059 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5060 
5061 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5062 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5063                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5064 {                                                                             \
5065     typedef uint##BITWIDTH##_t ETYPE;                                         \
5066     uint32_t vm = vext_vm(desc);                                              \
5067     uint32_t vl = env->vl;                                                    \
5068     uint32_t esz = sizeof(ETYPE);                                             \
5069     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5070     uint32_t vta = vext_vta(desc);                                            \
5071     uint32_t i;                                                               \
5072                                                                               \
5073     for (i = env->vstart; i < vl; i++) {                                      \
5074         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5075             continue;                                                         \
5076         }                                                                     \
5077         if (i == vl - 1) {                                                    \
5078             *((ETYPE *)vd + H(i)) = s1;                                       \
5079         } else {                                                              \
5080             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5081         }                                                                     \
5082     }                                                                         \
5083     env->vstart = 0;                                                          \
5084     /* set tail elements to 1s */                                             \
5085     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5086 }
5087 
5088 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5089 GEN_VEXT_VSLIDE1DOWN(16, H2)
5090 GEN_VEXT_VSLIDE1DOWN(32, H4)
5091 GEN_VEXT_VSLIDE1DOWN(64, H8)
5092 
5093 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5094 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5095                   CPURISCVState *env, uint32_t desc)              \
5096 {                                                                 \
5097     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5098 }
5099 
5100 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5101 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5102 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5103 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5104 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5105 
5106 /* Vector Floating-Point Slide Instructions */
5107 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5108 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5109                   CPURISCVState *env, uint32_t desc)          \
5110 {                                                             \
5111     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5112 }
5113 
5114 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5115 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5116 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5117 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5118 
5119 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5120 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5121                   CPURISCVState *env, uint32_t desc)          \
5122 {                                                             \
5123     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5124 }
5125 
5126 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5127 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5128 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5129 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5130 
5131 /* Vector Register Gather Instruction */
5132 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5133 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5134                   CPURISCVState *env, uint32_t desc)                      \
5135 {                                                                         \
5136     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5137     uint32_t vm = vext_vm(desc);                                          \
5138     uint32_t vl = env->vl;                                                \
5139     uint32_t esz = sizeof(TS2);                                           \
5140     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5141     uint32_t vta = vext_vta(desc);                                        \
5142     uint64_t index;                                                       \
5143     uint32_t i;                                                           \
5144                                                                           \
5145     for (i = env->vstart; i < vl; i++) {                                  \
5146         if (!vm && !vext_elem_mask(v0, i)) {                              \
5147             continue;                                                     \
5148         }                                                                 \
5149         index = *((TS1 *)vs1 + HS1(i));                                   \
5150         if (index >= vlmax) {                                             \
5151             *((TS2 *)vd + HS2(i)) = 0;                                    \
5152         } else {                                                          \
5153             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5154         }                                                                 \
5155     }                                                                     \
5156     env->vstart = 0;                                                      \
5157     /* set tail elements to 1s */                                         \
5158     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5159 }
5160 
5161 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5162 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5163 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5164 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5165 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5166 
5167 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5168 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5169 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5170 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
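     /*
      * vrgatherei16 always reads its indices as 16-bit elements
      * (TS1 == uint16_t) regardless of the data SEW, which is why the index
      * and data element types differ above.
      */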
5171 
5172 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5173 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5174                   CPURISCVState *env, uint32_t desc)                      \
5175 {                                                                         \
5176     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5177     uint32_t vm = vext_vm(desc);                                          \
5178     uint32_t vl = env->vl;                                                \
5179     uint32_t esz = sizeof(ETYPE);                                         \
5180     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5181     uint32_t vta = vext_vta(desc);                                        \
5182     uint64_t index = s1;                                                  \
5183     uint32_t i;                                                           \
5184                                                                           \
5185     for (i = env->vstart; i < vl; i++) {                                  \
5186         if (!vm && !vext_elem_mask(v0, i)) {                              \
5187             continue;                                                     \
5188         }                                                                 \
5189         if (index >= vlmax) {                                             \
5190             *((ETYPE *)vd + H(i)) = 0;                                    \
5191         } else {                                                          \
5192             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5193         }                                                                 \
5194     }                                                                     \
5195     env->vstart = 0;                                                      \
5196     /* set tail elements to 1s */                                         \
5197     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5198 }
5199 
5200 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5201 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5202 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5203 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5204 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
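/*
 * In the .vx forms the single index s1 comes from scalar register x[rs1],
 * so the bounds check is loop-invariant: every active destination element
 * receives either vs2[s1] or 0.
 */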
5205 
5206 /* Vector Compress Instruction */
5207 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5208 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5209                   CPURISCVState *env, uint32_t desc)                      \
5210 {                                                                         \
5211     uint32_t vl = env->vl;                                                \
5212     uint32_t esz = sizeof(ETYPE);                                         \
5213     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5214     uint32_t vta = vext_vta(desc);                                        \
5215     uint32_t num = 0, i;                                                  \
5216                                                                           \
5217     for (i = env->vstart; i < vl; i++) {                                  \
5218         if (!vext_elem_mask(vs1, i)) {                                    \
5219             continue;                                                     \
5220         }                                                                 \
5221         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5222         num++;                                                            \
5223     }                                                                     \
5224     env->vstart = 0;                                                      \
5225     /* set tail elements to 1s */                                         \
5226     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5227 }
5228 
5229 /* Compress the elements of vs2 whose mask bit in vs1 is set into vd */
5230 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5231 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5232 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5233 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
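/*
 * A minimal illustration of the compress semantics on plain C arrays
 * (hypothetical helper, not used by the generated helpers above; it
 * ignores vstart, the packed mask layout and the tail-agnostic policy).
 * With vs2 = {10, 20, 30, 40} and mask = {0, 1, 0, 1}, vd becomes
 * {20, 40} and the function returns 2.
 */
static inline uint32_t vcompress_sketch_u8(uint8_t *vd, const uint8_t *mask,
                                           const uint8_t *vs2, uint32_t vl)
{
    uint32_t num = 0;

    for (uint32_t i = 0; i < vl; i++) {
        if (mask[i]) {              /* element i is selected by vs1 */
            vd[num++] = vs2[i];     /* pack it into the next slot of vd */
        }
    }
    return num;                     /* count of elements written */
}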
5234 
5235 /* Vector Whole Register Move */
5236 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5237 {
5238     /* EEW = SEW */
5239     uint32_t maxsz = simd_maxsz(desc);
5240     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5241     uint32_t startb = env->vstart * sewb;
5242     uint32_t i = startb;
5243 
5244     memcpy((uint8_t *)vd + H1(i),
5245            (uint8_t *)vs2 + H1(i),
5246            maxsz - startb);
5247 
5248     env->vstart = 0;
5249 }
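/*
 * Note on the copy above: H1() permutes byte indices within host-endian
 * 64-bit chunks, so adding it to a bulk-memcpy offset is only an identity
 * on little-endian hosts.  A chunk-aware variant could look like the
 * following sketch (hypothetical helper, not part of this revision;
 * assumes ROUND_UP from "qemu/osdep.h"):
 */
static inline void vmvr_v_copy_sketch(void *vd, void *vs2,
                                      uint32_t startb, uint32_t maxsz)
{
    uint32_t i = startb;

    if (HOST_BIG_ENDIAN && i % 8 != 0) {
        /*
         * Guest bytes [i, j) of a partially covered 8-byte chunk occupy
         * the contiguous host byte range starting at H1(j - 1).
         */
        uint32_t j = ROUND_UP(i, 8);
        memcpy((uint8_t *)vd + H1(j - 1),
               (uint8_t *)vs2 + H1(j - 1),
               j - i);
        i = j;
    }

    /* Whole 64-bit chunks have the same layout in source and dest. */
    memcpy((uint8_t *)vd + i, (uint8_t *)vs2 + i, maxsz - i);
}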
5250 
5251 /* Vector Integer Extension */
5252 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5253 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5254                   CPURISCVState *env, uint32_t desc)             \
5255 {                                                                \
5256     uint32_t vl = env->vl;                                       \
5257     uint32_t vm = vext_vm(desc);                                 \
5258     uint32_t esz = sizeof(ETYPE);                                \
5259     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5260     uint32_t vta = vext_vta(desc);                               \
5261     uint32_t i;                                                  \
5262                                                                  \
5263     for (i = env->vstart; i < vl; i++) {                         \
5264         if (!vm && !vext_elem_mask(v0, i)) {                     \
5265             continue;                                            \
5266         }                                                        \
5267         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5268     }                                                            \
5269     env->vstart = 0;                                             \
5270     /* set tail elements to 1s */                                \
5271     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5272 }
5273 
5274 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5275 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5276 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5277 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5278 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5279 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5280 
5281 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5282 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5283 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5284 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5285 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5286 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
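/*
 * Naming: vzext/vsext_vf<N> widen each source element by a factor of N,
 * i.e. the destination EEW is SEW and the source EEW is SEW/N.  Zero-
 * versus sign-extension falls out of the unsigned versus signed DTYPE
 * used for the element load in the macro above.
 */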
5287