xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 265ecd4c)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
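/*
 * Worked example (illustrative only, assuming VLEN = 128): a vtype
 * requesting SEW = 32 (vsew = 2) and LMUL = 2 (vlmul = 001) gives
 * vlmax = VLEN / SEW * LMUL = 128 / 32 * 2 = 8.  A requested AVL of
 * s1 = 10 exceeds vlmax and is clamped to vl = 8.  Had the request been
 * illegal (e.g. SEW > ELEN or reserved vtype bits set), only vill would
 * have been set and vl forced to 0, as above.
 */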
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that needs a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
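/*
 * Illustrative example of the fixup above: on a big-endian host,
 * H1(0) == 7, so byte element 0 of a vector register lives at byte
 * offset 7 of its 64-bit chunk (the chunk's least-significant byte),
 * element 1 at offset 6, and so on.  On little-endian hosts the H*()
 * macros are the identity and element i of size 2^n bytes simply lives
 * at byte offset i << n.
 */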
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
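/*
 * Example: a fractional LMUL of 1/4 is encoded as vlmul = 110, and
 * sextract32(6, 0, 3) yields lmul = -2, i.e. LMUL = 2^-2.  Integral
 * LMULs decode to their log2, e.g. vlmul = 011 gives lmul = 3 (LMUL = 8).
 */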
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
140 /*
141  * Get the maximum number of elements that can be operated on.
142  *
143  * log2_esz: log2 of element size in bytes.
144  */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147     /*
148      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
149      * so vlen in bytes (vlenb) is encoded as maxsz.
150      */
151     uint32_t vlenb = simd_maxsz(desc);
152 
153     /* Return VLMAX */
154     int scale = vext_lmul(desc) - log2_esz;
155     return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
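/*
 * Worked example (illustrative): with VLEN = 128 bits, vlenb = 16.
 * For SEW = 16 (log2_esz = 1) and LMUL = 2 (lmul = 1), scale = 0 and
 * VLMAX = 16 << 0 = 16.  For SEW = 32 (log2_esz = 2) and LMUL = 1/2
 * (lmul = -1), scale = -3 and VLMAX = 16 >> 3 = 2.
 */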
157 
158 /*
159  * Get the total number of elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
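/*
 * Illustrative example: with vlenb = 16, SEW = 32 (sew = 4 bytes),
 * esz = 4 and LMUL = 1/2, the raw EMUL is 2 - 2 - 1 = -1, which is
 * clamped to 0, so total_elems = (16 << 0) / 4 = 4.  VLMAX is only 2
 * here, so the two elements past VLMAX that share the register are
 * counted as tail elements, as noted above.
 */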
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks the watchpoint before the real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
182  * In user mode, there is no watchpoint support for now.
183  *
184  * It will trigger an exception if there is no mapping in the TLB
185  * and the page table walk can't fill the TLB entry. The guest software
186  * can then return here after processing the exception, or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
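/*
 * For example (assuming 4 KiB target pages), a 16-byte access starting
 * at offset 0xff8 of a page crosses a page boundary: the first probe
 * covers the 8 bytes up to the end of the page and the second probe
 * covers the remaining 8 bytes on the following page, so both mappings
 * are checked before any data is transferred.
 */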
204 
205 /* set agnostic elements to 1s */
206 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
207                               uint32_t tot)
208 {
209     if (is_agnostic == 0) {
210         /* policy undisturbed */
211         return;
212     }
213     if (tot - cnt == 0) {
214         return;
215     }
216     memset(base + cnt, -1, tot - cnt);
217 }
218 
219 static inline void vext_set_elem_mask(void *v0, int index,
220                                       uint8_t value)
221 {
222     int idx = index / 64;
223     int pos = index % 64;
224     uint64_t old = ((uint64_t *)v0)[idx];
225     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227 
228 /*
229  * Earlier designs (pre-0.9) had a varying number of bits
230  * per mask value (MLEN). In the 0.9 design, MLEN=1.
231  * (Section 4.5)
232  */
233 static inline int vext_elem_mask(void *v0, int index)
234 {
235     int idx = index / 64;
236     int pos = index % 64;
237     return (((uint64_t *)v0)[idx] >> pos) & 1;
238 }
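/*
 * Example: the mask bit for element 70 is bit 70 % 64 = 6 of the second
 * 64-bit word (70 / 64 = 1) of the mask register v0.
 */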
239 
240 /* element operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242                                uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 /*
271  *** stride: access vector elements from strided memory
272  */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275                  target_ulong stride, CPURISCVState *env,
276                  uint32_t desc, uint32_t vm,
277                  vext_ldst_elem_fn *ldst_elem,
278                  uint32_t log2_esz, uintptr_t ra)
279 {
280     uint32_t i, k;
281     uint32_t nf = vext_nf(desc);
282     uint32_t max_elems = vext_max_elems(desc, log2_esz);
283     uint32_t esz = 1 << log2_esz;
284     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285     uint32_t vta = vext_vta(desc);
286     uint32_t vma = vext_vma(desc);
287 
288     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
289         k = 0;
290         while (k < nf) {
291             if (!vm && !vext_elem_mask(v0, i)) {
292                 /* set masked-off elements to 1s */
293                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
294                                   (i + k * max_elems + 1) * esz);
295                 k++;
296                 continue;
297             }
298             target_ulong addr = base + stride * i + (k << log2_esz);
299             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
300             k++;
301         }
302     }
303     env->vstart = 0;
304     /* set tail elements to 1s */
305     for (k = 0; k < nf; ++k) {
306         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
307                           (k * max_elems + max_elems) * esz);
308     }
309     if (nf * max_elems % total_elems != 0) {
310         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
311         uint32_t registers_used =
312             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
313         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
314                           registers_used * vlenb);
315     }
316 }
317 
318 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
319 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
320                   target_ulong stride, CPURISCVState *env,              \
321                   uint32_t desc)                                        \
322 {                                                                       \
323     uint32_t vm = vext_vm(desc);                                        \
324     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
325                      ctzl(sizeof(ETYPE)), GETPC());                     \
326 }
327 
328 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
329 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
330 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
331 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
332 
333 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
334 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
335                   target_ulong stride, CPURISCVState *env,              \
336                   uint32_t desc)                                        \
337 {                                                                       \
338     uint32_t vm = vext_vm(desc);                                        \
339     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
340                      ctzl(sizeof(ETYPE)), GETPC());                     \
341 }
342 
343 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
344 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
345 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
346 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
347 
348 /*
349  *** unit-stride: access elements stored contiguously in memory
350  */
351 
352 /* unmasked unit-stride load and store operation */
353 static void
354 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
355              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
356              uintptr_t ra)
357 {
358     uint32_t i, k;
359     uint32_t nf = vext_nf(desc);
360     uint32_t max_elems = vext_max_elems(desc, log2_esz);
361     uint32_t esz = 1 << log2_esz;
362     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
363     uint32_t vta = vext_vta(desc);
364 
365     /* load bytes from guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375     /* set tail elements to 1s */
376     for (k = 0; k < nf; ++k) {
377         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
378                           (k * max_elems + max_elems) * esz);
379     }
380     if (nf * max_elems % total_elems != 0) {
381         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
382         uint32_t registers_used =
383             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
384         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
385                           registers_used * vlenb);
386     }
387 }
388 
389 /*
390  * A masked unit-stride load or store is handled as a special case of the
391  * strided operation, with stride = NF * sizeof(ETYPE).
392  */
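/*
 * E.g. a masked segment load vlseg2e32.v (NF = 2, ETYPE = int32_t) is
 * dispatched to vext_ldst_stride() with stride = 2 * 4 = 8 bytes, which
 * reproduces the contiguous NF-field layout while still honouring the
 * element mask.
 */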
393 
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
396                          CPURISCVState *env, uint32_t desc)             \
397 {                                                                       \
398     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
399     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
400                      ctzl(sizeof(ETYPE)), GETPC());                     \
401 }                                                                       \
402                                                                         \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
404                   CPURISCVState *env, uint32_t desc)                    \
405 {                                                                       \
406     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
407                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
408 }
409 
410 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414 
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
417                          CPURISCVState *env, uint32_t desc)              \
418 {                                                                        \
419     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
420     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
421                      ctzl(sizeof(ETYPE)), GETPC());                      \
422 }                                                                        \
423                                                                          \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
425                   CPURISCVState *env, uint32_t desc)                     \
426 {                                                                        \
427     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
428                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
429 }
430 
431 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435 
436 /*
437  *** unit-stride mask load and store, EEW = 1
438  */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, lde_b,
445                  0, evl, GETPC());
446 }
447 
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449                     CPURISCVState *env, uint32_t desc)
450 {
451     /* evl = ceil(vl/8) */
452     uint8_t evl = (env->vl + 7) >> 3;
453     vext_ldst_us(vd, base, env, desc, ste_b,
454                  0, evl, GETPC());
455 }
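/*
 * Example: with vl = 17, the mask occupies ceil(17 / 8) = 3 bytes, so
 * vlm.v/vsm.v transfer exactly 3 bytes regardless of SEW or LMUL.
 */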
456 
457 /*
458  *** index: access vector elements from indexed memory
459  */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461         uint32_t idx, void *vs2);
462 
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
464 static target_ulong NAME(target_ulong base,            \
465                          uint32_t idx, void *vs2)      \
466 {                                                      \
467     return (base + *((ETYPE *)vs2 + H(idx)));          \
468 }
469 
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
474 
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477                 void *vs2, CPURISCVState *env, uint32_t desc,
478                 vext_get_index_addr get_index_addr,
479                 vext_ldst_elem_fn *ldst_elem,
480                 uint32_t log2_esz, uintptr_t ra)
481 {
482     uint32_t i, k;
483     uint32_t nf = vext_nf(desc);
484     uint32_t vm = vext_vm(desc);
485     uint32_t max_elems = vext_max_elems(desc, log2_esz);
486     uint32_t esz = 1 << log2_esz;
487     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
488     uint32_t vta = vext_vta(desc);
489     uint32_t vma = vext_vma(desc);
490 
491     /* load bytes from guest memory */
492     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
493         k = 0;
494         while (k < nf) {
495             if (!vm && !vext_elem_mask(v0, i)) {
496                 /* set masked-off elements to 1s */
497                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
498                                   (i + k * max_elems + 1) * esz);
499                 k++;
500                 continue;
501             }
502             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
503             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
504             k++;
505         }
506     }
507     env->vstart = 0;
508     /* set tail elements to 1s */
509     for (k = 0; k < nf; ++k) {
510         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
511                           (k * max_elems + max_elems) * esz);
512     }
513     if (nf * max_elems % total_elems != 0) {
514         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
515         uint32_t registers_used =
516             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
517         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
518                           registers_used * vlenb);
519     }
520 }
521 
522 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
523 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
524                   void *vs2, CPURISCVState *env, uint32_t desc)            \
525 {                                                                          \
526     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
527                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
528 }
529 
530 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
538 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
539 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
540 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
541 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
542 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
543 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
544 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
545 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
546 
547 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
548 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
549                   void *vs2, CPURISCVState *env, uint32_t desc)  \
550 {                                                                \
551     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
552                     STORE_FN, ctzl(sizeof(ETYPE)),               \
553                     GETPC());                                    \
554 }
555 
556 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
564 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
565 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
566 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
567 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
568 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
569 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
570 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
571 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
572 
573 /*
574  *** unit-stride fault-only-first load instructions
575  */
576 static inline void
577 vext_ldff(void *vd, void *v0, target_ulong base,
578           CPURISCVState *env, uint32_t desc,
579           vext_ldst_elem_fn *ldst_elem,
580           uint32_t log2_esz, uintptr_t ra)
581 {
582     void *host;
583     uint32_t i, k, vl = 0;
584     uint32_t nf = vext_nf(desc);
585     uint32_t vm = vext_vm(desc);
586     uint32_t max_elems = vext_max_elems(desc, log2_esz);
587     uint32_t esz = 1 << log2_esz;
588     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
589     uint32_t vta = vext_vta(desc);
590     uint32_t vma = vext_vma(desc);
591     target_ulong addr, offset, remain;
592 
593     /* probe every access */
594     for (i = env->vstart; i < env->vl; i++) {
595         if (!vm && !vext_elem_mask(v0, i)) {
596             continue;
597         }
598         addr = adjust_addr(env, base + i * (nf << log2_esz));
599         if (i == 0) {
600             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
601         } else {
602             /* if it triggers an exception, no need to check watchpoint */
603             remain = nf << log2_esz;
604             while (remain > 0) {
605                 offset = -(addr | TARGET_PAGE_MASK);
606                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
607                                          cpu_mmu_index(env, false));
608                 if (host) {
609 #ifdef CONFIG_USER_ONLY
610                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
611                         vl = i;
612                         goto ProbeSuccess;
613                     }
614 #else
615                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
616 #endif
617                 } else {
618                     vl = i;
619                     goto ProbeSuccess;
620                 }
621                 if (remain <= offset) {
622                     break;
623                 }
624                 remain -= offset;
625                 addr = adjust_addr(env, addr + offset);
626             }
627         }
628     }
629 ProbeSuccess:
630     /* load bytes from guest memory */
631     if (vl != 0) {
632         env->vl = vl;
633     }
634     for (i = env->vstart; i < env->vl; i++) {
635         k = 0;
636         while (k < nf) {
637             if (!vm && !vext_elem_mask(v0, i)) {
638                 /* set masked-off elements to 1s */
639                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
640                                   (i + k * max_elems + 1) * esz);
641                 k++;
642                 continue;
643             }
644             target_ulong addr = base + ((i * nf + k) << log2_esz);
645             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
646             k++;
647         }
648     }
649     env->vstart = 0;
650     /* set tail elements to 1s */
651     for (k = 0; k < nf; ++k) {
652         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
653                           (k * max_elems + max_elems) * esz);
654     }
655     if (nf * max_elems % total_elems != 0) {
656         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
657         uint32_t registers_used =
658             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
659         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
660                           registers_used * vlenb);
661     }
662 }
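/*
 * Illustrative fault-only-first behaviour: if element 0 is readable but
 * the page holding element 3 cannot be accessed, the probe loop above
 * stops at i = 3 and shrinks vl to 3; elements 0..2 are then loaded
 * normally and no exception is raised.  A fault on element 0 itself
 * still traps, since the first element is probed with probe_pages().
 */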
663 
664 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
665 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
666                   CPURISCVState *env, uint32_t desc)      \
667 {                                                         \
668     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
669               ctzl(sizeof(ETYPE)), GETPC());              \
670 }
671 
672 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
673 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
674 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
675 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
676 
677 #define DO_SWAP(N, M) (M)
678 #define DO_AND(N, M)  (N & M)
679 #define DO_XOR(N, M)  (N ^ M)
680 #define DO_OR(N, M)   (N | M)
681 #define DO_ADD(N, M)  (N + M)
682 
683 /* Signed min/max */
684 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
685 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
686 
687 /* Unsigned min/max */
688 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
689 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
690 
691 /*
692  *** load and store whole register instructions
693  */
694 static void
695 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
696                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
697 {
698     uint32_t i, k, off, pos;
699     uint32_t nf = vext_nf(desc);
700     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
701     uint32_t max_elems = vlenb >> log2_esz;
702 
703     k = env->vstart / max_elems;
704     off = env->vstart % max_elems;
705 
706     if (off) {
707         /* load/store rest of elements of the segment pointed to by vstart */
708         for (pos = off; pos < max_elems; pos++, env->vstart++) {
709             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
710             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
711         }
712         k++;
713     }
714 
715     /* load/store elements for rest of segments */
716     for (; k < nf; k++) {
717         for (i = 0; i < max_elems; i++, env->vstart++) {
718             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
719             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
720         }
721     }
722 
723     env->vstart = 0;
724 }
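/*
 * Example of a restarted whole-register access: with vlenb = 16 and
 * log2_esz = 0 (max_elems = 16), a saved vstart of 21 gives k = 1 and
 * off = 5, so the helper first finishes elements 5..15 of the second
 * register of the group and then transfers any remaining registers
 * whole.
 */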
725 
726 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
727 void HELPER(NAME)(void *vd, target_ulong base,       \
728                   CPURISCVState *env, uint32_t desc) \
729 {                                                    \
730     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
731                     ctzl(sizeof(ETYPE)), GETPC());   \
732 }
733 
734 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
735 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
736 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
737 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
738 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
739 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
740 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
741 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
742 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
743 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
744 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
745 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
746 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
747 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
748 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
749 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
750 
751 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
752 void HELPER(NAME)(void *vd, target_ulong base,       \
753                   CPURISCVState *env, uint32_t desc) \
754 {                                                    \
755     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
756                     ctzl(sizeof(ETYPE)), GETPC());   \
757 }
758 
759 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
760 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
761 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
762 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
763 
764 /*
765  *** Vector Integer Arithmetic Instructions
766  */
767 
768 /* expand macro args before macro */
769 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
770 
771 /* (TD, T1, T2, TX1, TX2) */
772 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
773 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
774 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
775 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
776 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
777 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
778 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
779 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
780 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
781 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
782 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
783 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
784 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
785 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
786 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
787 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
788 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
789 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
790 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
791 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
792 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
793 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
794 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
795 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
796 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
797 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
798 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
799 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
800 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
801 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
802 
803 /* operation of two vector elements */
804 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
805 
806 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
807 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
808 {                                                               \
809     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
810     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
811     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
812 }
813 #define DO_SUB(N, M) (N - M)
814 #define DO_RSUB(N, M) (M - N)
815 
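/*
 * For reference, RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 * expands (roughly) to:
 *
 *     static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *     {
 *         int8_t s1 = *((int8_t *)vs1 + H1(i));
 *         int8_t s2 = *((int8_t *)vs2 + H1(i));
 *         *((int8_t *)vd + H1(i)) = s2 + s1;
 *     }
 */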
816 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
817 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
818 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
819 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
820 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
821 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
822 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
823 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
824 
825 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
826                        CPURISCVState *env, uint32_t desc,
827                        opivv2_fn *fn, uint32_t esz)
828 {
829     uint32_t vm = vext_vm(desc);
830     uint32_t vl = env->vl;
831     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
832     uint32_t vta = vext_vta(desc);
833     uint32_t vma = vext_vma(desc);
834     uint32_t i;
835 
836     for (i = env->vstart; i < vl; i++) {
837         if (!vm && !vext_elem_mask(v0, i)) {
838             /* set masked-off elements to 1s */
839             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
840             continue;
841         }
842         fn(vd, vs1, vs2, i);
843     }
844     env->vstart = 0;
845     /* set tail elements to 1s */
846     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
847 }
848 
849 /* generate the helpers for OPIVV */
850 #define GEN_VEXT_VV(NAME, ESZ)                            \
851 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
852                   void *vs2, CPURISCVState *env,          \
853                   uint32_t desc)                          \
854 {                                                         \
855     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
856                do_##NAME, ESZ);                           \
857 }
858 
859 GEN_VEXT_VV(vadd_vv_b, 1)
860 GEN_VEXT_VV(vadd_vv_h, 2)
861 GEN_VEXT_VV(vadd_vv_w, 4)
862 GEN_VEXT_VV(vadd_vv_d, 8)
863 GEN_VEXT_VV(vsub_vv_b, 1)
864 GEN_VEXT_VV(vsub_vv_h, 2)
865 GEN_VEXT_VV(vsub_vv_w, 4)
866 GEN_VEXT_VV(vsub_vv_d, 8)
867 
868 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
869 
870 /*
871  * (T1)s1 gives the real operand type.
872  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
873  */
874 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
875 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
876 {                                                                   \
877     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
878     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
879 }
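/*
 * Example of the cast chain: for vwadd_vx_b (instantiated below with
 * WOP_SSS_B), T1 is int8_t and TX1 is int16_t, so a scalar s1 = 0x1ff is
 * first truncated to the operand width as (int8_t)s1 = -1 and then
 * sign-extended to int16_t before the widening add; without the (T1)
 * cast the upper bits of the x register would leak into the result.
 */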
880 
881 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
882 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
883 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
884 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
885 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
886 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
887 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
888 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
889 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
890 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
891 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
892 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
893 
894 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
895                        CPURISCVState *env, uint32_t desc,
896                        opivx2_fn fn, uint32_t esz)
897 {
898     uint32_t vm = vext_vm(desc);
899     uint32_t vl = env->vl;
900     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
901     uint32_t vta = vext_vta(desc);
902     uint32_t i;
903 
904     for (i = env->vstart; i < vl; i++) {
905         if (!vm && !vext_elem_mask(v0, i)) {
906             continue;
907         }
908         fn(vd, s1, vs2, i);
909     }
910     env->vstart = 0;
911     /* set tail elements to 1s */
912     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
913 }
914 
915 /* generate the helpers for OPIVX */
916 #define GEN_VEXT_VX(NAME, ESZ)                            \
917 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
918                   void *vs2, CPURISCVState *env,          \
919                   uint32_t desc)                          \
920 {                                                         \
921     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
922                do_##NAME, ESZ);                           \
923 }
924 
925 GEN_VEXT_VX(vadd_vx_b, 1)
926 GEN_VEXT_VX(vadd_vx_h, 2)
927 GEN_VEXT_VX(vadd_vx_w, 4)
928 GEN_VEXT_VX(vadd_vx_d, 8)
929 GEN_VEXT_VX(vsub_vx_b, 1)
930 GEN_VEXT_VX(vsub_vx_h, 2)
931 GEN_VEXT_VX(vsub_vx_w, 4)
932 GEN_VEXT_VX(vsub_vx_d, 8)
933 GEN_VEXT_VX(vrsub_vx_b, 1)
934 GEN_VEXT_VX(vrsub_vx_h, 2)
935 GEN_VEXT_VX(vrsub_vx_w, 4)
936 GEN_VEXT_VX(vrsub_vx_d, 8)
937 
938 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
939 {
940     intptr_t oprsz = simd_oprsz(desc);
941     intptr_t i;
942 
943     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
944         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
945     }
946 }
947 
948 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
949 {
950     intptr_t oprsz = simd_oprsz(desc);
951     intptr_t i;
952 
953     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
954         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
955     }
956 }
957 
958 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
959 {
960     intptr_t oprsz = simd_oprsz(desc);
961     intptr_t i;
962 
963     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
964         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
965     }
966 }
967 
968 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
969 {
970     intptr_t oprsz = simd_oprsz(desc);
971     intptr_t i;
972 
973     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
974         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
975     }
976 }
977 
978 /* Vector Widening Integer Add/Subtract */
979 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
980 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
981 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
982 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
983 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
984 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
985 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
986 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
987 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
988 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
989 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
990 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
991 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
992 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
993 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
994 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
995 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
996 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
997 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
998 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
999 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1000 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1001 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1002 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1003 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1004 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1005 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1006 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1007 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1008 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1009 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1010 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1011 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1012 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1013 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1014 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1015 GEN_VEXT_VV(vwaddu_vv_b, 2)
1016 GEN_VEXT_VV(vwaddu_vv_h, 4)
1017 GEN_VEXT_VV(vwaddu_vv_w, 8)
1018 GEN_VEXT_VV(vwsubu_vv_b, 2)
1019 GEN_VEXT_VV(vwsubu_vv_h, 4)
1020 GEN_VEXT_VV(vwsubu_vv_w, 8)
1021 GEN_VEXT_VV(vwadd_vv_b, 2)
1022 GEN_VEXT_VV(vwadd_vv_h, 4)
1023 GEN_VEXT_VV(vwadd_vv_w, 8)
1024 GEN_VEXT_VV(vwsub_vv_b, 2)
1025 GEN_VEXT_VV(vwsub_vv_h, 4)
1026 GEN_VEXT_VV(vwsub_vv_w, 8)
1027 GEN_VEXT_VV(vwaddu_wv_b, 2)
1028 GEN_VEXT_VV(vwaddu_wv_h, 4)
1029 GEN_VEXT_VV(vwaddu_wv_w, 8)
1030 GEN_VEXT_VV(vwsubu_wv_b, 2)
1031 GEN_VEXT_VV(vwsubu_wv_h, 4)
1032 GEN_VEXT_VV(vwsubu_wv_w, 8)
1033 GEN_VEXT_VV(vwadd_wv_b, 2)
1034 GEN_VEXT_VV(vwadd_wv_h, 4)
1035 GEN_VEXT_VV(vwadd_wv_w, 8)
1036 GEN_VEXT_VV(vwsub_wv_b, 2)
1037 GEN_VEXT_VV(vwsub_wv_h, 4)
1038 GEN_VEXT_VV(vwsub_wv_w, 8)
1039 
1040 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1041 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1042 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1043 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1044 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1045 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1046 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1047 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1048 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1049 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1050 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1051 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1052 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1053 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1054 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1055 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1056 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1057 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1058 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1059 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1060 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1061 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1062 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1063 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1064 GEN_VEXT_VX(vwaddu_vx_b, 2)
1065 GEN_VEXT_VX(vwaddu_vx_h, 4)
1066 GEN_VEXT_VX(vwaddu_vx_w, 8)
1067 GEN_VEXT_VX(vwsubu_vx_b, 2)
1068 GEN_VEXT_VX(vwsubu_vx_h, 4)
1069 GEN_VEXT_VX(vwsubu_vx_w, 8)
1070 GEN_VEXT_VX(vwadd_vx_b, 2)
1071 GEN_VEXT_VX(vwadd_vx_h, 4)
1072 GEN_VEXT_VX(vwadd_vx_w, 8)
1073 GEN_VEXT_VX(vwsub_vx_b, 2)
1074 GEN_VEXT_VX(vwsub_vx_h, 4)
1075 GEN_VEXT_VX(vwsub_vx_w, 8)
1076 GEN_VEXT_VX(vwaddu_wx_b, 2)
1077 GEN_VEXT_VX(vwaddu_wx_h, 4)
1078 GEN_VEXT_VX(vwaddu_wx_w, 8)
1079 GEN_VEXT_VX(vwsubu_wx_b, 2)
1080 GEN_VEXT_VX(vwsubu_wx_h, 4)
1081 GEN_VEXT_VX(vwsubu_wx_w, 8)
1082 GEN_VEXT_VX(vwadd_wx_b, 2)
1083 GEN_VEXT_VX(vwadd_wx_h, 4)
1084 GEN_VEXT_VX(vwadd_wx_w, 8)
1085 GEN_VEXT_VX(vwsub_wx_b, 2)
1086 GEN_VEXT_VX(vwsub_wx_h, 4)
1087 GEN_VEXT_VX(vwsub_wx_w, 8)
1088 
1089 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1090 #define DO_VADC(N, M, C) (N + M + C)
1091 #define DO_VSBC(N, M, C) (N - M - C)
1092 
1093 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1094 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1095                   CPURISCVState *env, uint32_t desc)          \
1096 {                                                             \
1097     uint32_t vl = env->vl;                                    \
1098     uint32_t esz = sizeof(ETYPE);                             \
1099     uint32_t total_elems =                                    \
1100         vext_get_total_elems(env, desc, esz);                 \
1101     uint32_t vta = vext_vta(desc);                            \
1102     uint32_t i;                                               \
1103                                                               \
1104     for (i = env->vstart; i < vl; i++) {                      \
1105         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1106         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1107         ETYPE carry = vext_elem_mask(v0, i);                  \
1108                                                               \
1109         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1110     }                                                         \
1111     env->vstart = 0;                                          \
1112     /* set tail elements to 1s */                             \
1113     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1114 }
1115 
1116 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1117 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1118 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1119 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1120 
1121 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1122 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1123 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1124 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1125 
1126 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1127 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1128                   CPURISCVState *env, uint32_t desc)                     \
1129 {                                                                        \
1130     uint32_t vl = env->vl;                                               \
1131     uint32_t esz = sizeof(ETYPE);                                        \
1132     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1133     uint32_t vta = vext_vta(desc);                                       \
1134     uint32_t i;                                                          \
1135                                                                          \
1136     for (i = env->vstart; i < vl; i++) {                                 \
1137         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1138         ETYPE carry = vext_elem_mask(v0, i);                             \
1139                                                                          \
1140         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1141     }                                                                    \
1142     env->vstart = 0;                                                     \
1143     /* set tail elements to 1s */                                        \
1144     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1145 }
1146 
1147 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1148 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1149 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1150 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1151 
1152 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1153 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1154 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1155 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1156 
1157 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1158                           (__typeof(N))(N + M) < N)
1159 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
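/*
 * Carry-out example (8-bit, illustrative): N = 200, M = 55 with carry-in
 * C = 1 gives N + M + 1 = 256, which wraps to 0; since
 * (uint8_t)(N + M + 1) <= N, DO_MADC reports a carry-out of 1.  With
 * C = 0 the sum is 255, nothing wraps, and DO_MADC reports 0.  DO_MSBC
 * likewise reports a borrow when N < M (or N <= M with a borrow-in).
 */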
1160 
1161 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1162 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1163                   CPURISCVState *env, uint32_t desc)          \
1164 {                                                             \
1165     uint32_t vl = env->vl;                                    \
1166     uint32_t vm = vext_vm(desc);                              \
1167     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1168     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1169     uint32_t i;                                               \
1170                                                               \
1171     for (i = env->vstart; i < vl; i++) {                      \
1172         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1173         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1174         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1175         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1176     }                                                         \
1177     env->vstart = 0;                                          \
1178     /* mask destination register is always tail-agnostic */   \
1179     /* set tail elements to 1s */                             \
1180     if (vta_all_1s) {                                         \
1181         for (; i < total_elems; i++) {                        \
1182             vext_set_elem_mask(vd, i, 1);                     \
1183         }                                                     \
1184     }                                                         \
1185 }
1186 
1187 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1188 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1189 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1190 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1191 
1192 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1193 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1194 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1195 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1196 
1197 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1198 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1199                   void *vs2, CPURISCVState *env, uint32_t desc) \
1200 {                                                               \
1201     uint32_t vl = env->vl;                                      \
1202     uint32_t vm = vext_vm(desc);                                \
1203     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1204     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1205     uint32_t i;                                                 \
1206                                                                 \
1207     for (i = env->vstart; i < vl; i++) {                        \
1208         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1209         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1210         vext_set_elem_mask(vd, i,                               \
1211                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1212     }                                                           \
1213     env->vstart = 0;                                            \
1214     /* mask destination register is always tail-agnostic */     \
1215     /* set tail elements to 1s */                               \
1216     if (vta_all_1s) {                                           \
1217         for (; i < total_elems; i++) {                          \
1218             vext_set_elem_mask(vd, i, 1);                       \
1219         }                                                       \
1220     }                                                           \
1221 }
1222 
1223 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1224 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1225 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1227 
1228 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1229 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1230 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1232 
1233 /* Vector Bitwise Logical Instructions */
1234 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1235 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1236 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1237 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1238 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1239 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1240 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1241 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1242 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1243 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1244 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1245 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1246 GEN_VEXT_VV(vand_vv_b, 1)
1247 GEN_VEXT_VV(vand_vv_h, 2)
1248 GEN_VEXT_VV(vand_vv_w, 4)
1249 GEN_VEXT_VV(vand_vv_d, 8)
1250 GEN_VEXT_VV(vor_vv_b, 1)
1251 GEN_VEXT_VV(vor_vv_h, 2)
1252 GEN_VEXT_VV(vor_vv_w, 4)
1253 GEN_VEXT_VV(vor_vv_d, 8)
1254 GEN_VEXT_VV(vxor_vv_b, 1)
1255 GEN_VEXT_VV(vxor_vv_h, 2)
1256 GEN_VEXT_VV(vxor_vv_w, 4)
1257 GEN_VEXT_VV(vxor_vv_d, 8)
1258 
1259 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1260 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1261 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1262 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1263 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1264 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1265 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1266 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1267 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1268 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1269 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1270 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1271 GEN_VEXT_VX(vand_vx_b, 1)
1272 GEN_VEXT_VX(vand_vx_h, 2)
1273 GEN_VEXT_VX(vand_vx_w, 4)
1274 GEN_VEXT_VX(vand_vx_d, 8)
1275 GEN_VEXT_VX(vor_vx_b, 1)
1276 GEN_VEXT_VX(vor_vx_h, 2)
1277 GEN_VEXT_VX(vor_vx_w, 4)
1278 GEN_VEXT_VX(vor_vx_d, 8)
1279 GEN_VEXT_VX(vxor_vx_b, 1)
1280 GEN_VEXT_VX(vxor_vx_h, 2)
1281 GEN_VEXT_VX(vxor_vx_w, 4)
1282 GEN_VEXT_VX(vxor_vx_d, 8)
1283 
1284 /* Vector Single-Width Bit Shift Instructions */
1285 #define DO_SLL(N, M)  (N << (M))
1286 #define DO_SRL(N, M)  (N >> (M))
1287 
1288 /* generate the helpers for shift instructions with two vector operands */
1289 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1290 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1291                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1292 {                                                                         \
1293     uint32_t vm = vext_vm(desc);                                          \
1294     uint32_t vl = env->vl;                                                \
1295     uint32_t esz = sizeof(TS1);                                           \
1296     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1297     uint32_t vta = vext_vta(desc);                                        \
1298     uint32_t i;                                                           \
1299                                                                           \
1300     for (i = env->vstart; i < vl; i++) {                                  \
1301         if (!vm && !vext_elem_mask(v0, i)) {                              \
1302             continue;                                                     \
1303         }                                                                 \
1304         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1305         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1306         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1307     }                                                                     \
1308     env->vstart = 0;                                                      \
1309     /* set tail elements to 1s */                                         \
1310     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1311 }
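
/*
 * A note on the generated helpers: MASK keeps only the low log2(width)
 * bits of the shift amount, matching the RVV rule that a shift uses
 * only the low lg2(SEW) bits of its shift operand (lg2(2 * SEW) for
 * the narrowing shifts further below).  The arithmetic right shifts
 * reuse DO_SRL; TS2 is a signed type there, so ">>" is an arithmetic
 * shift of the source element.
 */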
1312 
1313 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1314 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1315 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1316 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1317 
1318 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1319 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1320 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1321 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1322 
1323 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1324 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1325 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1326 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1327 
1328 /* generate the helpers for shift instructions with one vector and one scalar */
1329 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1330 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1331         void *vs2, CPURISCVState *env, uint32_t desc)       \
1332 {                                                           \
1333     uint32_t vm = vext_vm(desc);                            \
1334     uint32_t vl = env->vl;                                  \
1335     uint32_t esz = sizeof(TD);                              \
1336     uint32_t total_elems =                                  \
1337         vext_get_total_elems(env, desc, esz);               \
1338     uint32_t vta = vext_vta(desc);                          \
1339     uint32_t i;                                             \
1340                                                             \
1341     for (i = env->vstart; i < vl; i++) {                    \
1342         if (!vm && !vext_elem_mask(v0, i)) {                \
1343             continue;                                       \
1344         }                                                   \
1345         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1346         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1347     }                                                       \
1348     env->vstart = 0;                                        \
1349     /* set tail elements to 1s */                           \
1350     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1351 }
1352 
1353 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1354 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1355 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1356 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1357 
1358 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1359 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1360 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1361 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1362 
1363 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1364 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1367 
1368 /* Vector Narrowing Integer Right Shift Instructions */
1369 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1370 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1371 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1372 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1373 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1374 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1375 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1376 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1377 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1378 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1379 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1380 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1381 
1382 /* Vector Integer Comparison Instructions */
1383 #define DO_MSEQ(N, M) (N == M)
1384 #define DO_MSNE(N, M) (N != M)
1385 #define DO_MSLT(N, M) (N < M)
1386 #define DO_MSLE(N, M) (N <= M)
1387 #define DO_MSGT(N, M) (N > M)
1388 
1389 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1390 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1391                   CPURISCVState *env, uint32_t desc)          \
1392 {                                                             \
1393     uint32_t vm = vext_vm(desc);                              \
1394     uint32_t vl = env->vl;                                    \
1395     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1396     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1397     uint32_t i;                                               \
1398                                                               \
1399     for (i = env->vstart; i < vl; i++) {                      \
1400         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1401         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1402         if (!vm && !vext_elem_mask(v0, i)) {                  \
1403             continue;                                         \
1404         }                                                     \
1405         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1406     }                                                         \
1407     env->vstart = 0;                                          \
1408     /* mask destination registers are always tail-agnostic */ \
1409     /* set tail elements to 1s */                             \
1410     if (vta_all_1s) {                                         \
1411         for (; i < total_elems; i++) {                        \
1412             vext_set_elem_mask(vd, i, 1);                     \
1413         }                                                     \
1414     }                                                         \
1415 }
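
/*
 * A mask result always lives in a single vector register, one bit per
 * element, so the tail that may need to be filled with 1s extends to
 * VLEN (cfg.vlen) bits rather than to the vlmax of the element type.
 */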
1416 
1417 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1418 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1419 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1420 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1421 
1422 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1423 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1424 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1425 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1426 
1427 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1428 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1429 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1430 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1431 
1432 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1433 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1434 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1435 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1436 
1437 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1438 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1439 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1440 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1441 
1442 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1443 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1444 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1445 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1446 
1447 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1448 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1449                   CPURISCVState *env, uint32_t desc)                \
1450 {                                                                   \
1451     uint32_t vm = vext_vm(desc);                                    \
1452     uint32_t vl = env->vl;                                          \
1453     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1454     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1455     uint32_t i;                                                     \
1456                                                                     \
1457     for (i = env->vstart; i < vl; i++) {                            \
1458         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1459         if (!vm && !vext_elem_mask(v0, i)) {                        \
1460             continue;                                               \
1461         }                                                           \
1462         vext_set_elem_mask(vd, i,                                   \
1463                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1464     }                                                               \
1465     env->vstart = 0;                                                \
1466     /* mask destination registers are always tail-agnostic */       \
1467     /* set tail elements to 1s */                                   \
1468     if (vta_all_1s) {                                               \
1469         for (; i < total_elems; i++) {                              \
1470             vext_set_elem_mask(vd, i, 1);                           \
1471         }                                                           \
1472     }                                                               \
1473 }
1474 
1475 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1476 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1477 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1478 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1479 
1480 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1481 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1482 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1483 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1484 
1485 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1486 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1487 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1488 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1489 
1490 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1491 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1492 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1493 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1494 
1495 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1496 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1497 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1498 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1499 
1500 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1501 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1502 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1503 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1504 
1505 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1506 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1507 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1508 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1509 
1510 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1511 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1512 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1513 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1514 
1515 /* Vector Integer Min/Max Instructions */
1516 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1517 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1518 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1519 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1520 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1521 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1522 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1523 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1524 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1525 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1526 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1527 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1528 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1529 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1530 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1531 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1532 GEN_VEXT_VV(vminu_vv_b, 1)
1533 GEN_VEXT_VV(vminu_vv_h, 2)
1534 GEN_VEXT_VV(vminu_vv_w, 4)
1535 GEN_VEXT_VV(vminu_vv_d, 8)
1536 GEN_VEXT_VV(vmin_vv_b, 1)
1537 GEN_VEXT_VV(vmin_vv_h, 2)
1538 GEN_VEXT_VV(vmin_vv_w, 4)
1539 GEN_VEXT_VV(vmin_vv_d, 8)
1540 GEN_VEXT_VV(vmaxu_vv_b, 1)
1541 GEN_VEXT_VV(vmaxu_vv_h, 2)
1542 GEN_VEXT_VV(vmaxu_vv_w, 4)
1543 GEN_VEXT_VV(vmaxu_vv_d, 8)
1544 GEN_VEXT_VV(vmax_vv_b, 1)
1545 GEN_VEXT_VV(vmax_vv_h, 2)
1546 GEN_VEXT_VV(vmax_vv_w, 4)
1547 GEN_VEXT_VV(vmax_vv_d, 8)
1548 
1549 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1550 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1551 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1552 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1553 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1554 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1555 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1556 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1557 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1558 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1559 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1560 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1561 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1562 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1563 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1564 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1565 GEN_VEXT_VX(vminu_vx_b, 1)
1566 GEN_VEXT_VX(vminu_vx_h, 2)
1567 GEN_VEXT_VX(vminu_vx_w, 4)
1568 GEN_VEXT_VX(vminu_vx_d, 8)
1569 GEN_VEXT_VX(vmin_vx_b, 1)
1570 GEN_VEXT_VX(vmin_vx_h, 2)
1571 GEN_VEXT_VX(vmin_vx_w, 4)
1572 GEN_VEXT_VX(vmin_vx_d, 8)
1573 GEN_VEXT_VX(vmaxu_vx_b, 1)
1574 GEN_VEXT_VX(vmaxu_vx_h, 2)
1575 GEN_VEXT_VX(vmaxu_vx_w, 4)
1576 GEN_VEXT_VX(vmaxu_vx_d, 8)
1577 GEN_VEXT_VX(vmax_vx_b, 1)
1578 GEN_VEXT_VX(vmax_vx_h, 2)
1579 GEN_VEXT_VX(vmax_vx_w, 4)
1580 GEN_VEXT_VX(vmax_vx_d, 8)
1581 
1582 /* Vector Single-Width Integer Multiply Instructions */
1583 #define DO_MUL(N, M) (N * M)
1584 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1585 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1586 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1587 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1588 GEN_VEXT_VV(vmul_vv_b, 1)
1589 GEN_VEXT_VV(vmul_vv_h, 2)
1590 GEN_VEXT_VV(vmul_vv_w, 4)
1591 GEN_VEXT_VV(vmul_vv_d, 8)
1592 
1593 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1594 {
1595     return (int16_t)s2 * (int16_t)s1 >> 8;
1596 }
1597 
1598 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1599 {
1600     return (int32_t)s2 * (int32_t)s1 >> 16;
1601 }
1602 
1603 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1604 {
1605     return (int64_t)s2 * (int64_t)s1 >> 32;
1606 }
1607 
1608 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1609 {
1610     uint64_t hi_64, lo_64;
1611 
1612     muls64(&lo_64, &hi_64, s1, s2);
1613     return hi_64;
1614 }
1615 
1616 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1617 {
1618     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1619 }
1620 
1621 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1622 {
1623     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1624 }
1625 
1626 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1627 {
1628     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1629 }
1630 
1631 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1632 {
1633     uint64_t hi_64, lo_64;
1634 
1635     mulu64(&lo_64, &hi_64, s2, s1);
1636     return hi_64;
1637 }
1638 
1639 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1640 {
1641     return (int16_t)s2 * (uint16_t)s1 >> 8;
1642 }
1643 
1644 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1645 {
1646     return (int32_t)s2 * (uint32_t)s1 >> 16;
1647 }
1648 
1649 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1650 {
1651     return (int64_t)s2 * (uint64_t)s1 >> 32;
1652 }
1653 
1654 /*
1655  * Signed * unsigned high-part multiply via mulu64:
1656  * Let  A = signed operand,
1657  *      B = unsigned operand,
1658  *      P = mulu64(A, B), the product of A's bit pattern and B,
1659  *      SP = the desired signed product A * B.
1660  *
1661  * IF A < 0, the unsigned value of A's bit pattern is A + 2 ** 64, so
1662  *      P  = (A + 2 ** 64) * B
1663  *         = A * B + 2 ** 64 * B
1664  *      SP = A * B
1665  *         = P - 2 ** 64 * B
1666  * ELSE
1667  *      SP = P
1668  *
1669  * Taking only the high 64 bits:
1670  *      HI_P -= (A < 0 ? B : 0)
1671  */
1672 
1673 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1674 {
1675     uint64_t hi_64, lo_64;
1676 
1677     mulu64(&lo_64, &hi_64, s2, s1);
1678 
1679     hi_64 -= s2 < 0 ? s1 : 0;
1680     return hi_64;
1681 }
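
/*
 * Worked example of the adjustment above, with small values: s2 = -2,
 * s1 = 3.  mulu64 sees s2 as 2**64 - 2, so
 *     P = (2**64 - 2) * 3 = 2 * 2**64 + (2**64 - 6),
 * i.e. hi_64 = 2 and lo_64 = 2**64 - 6.  The signed product is -6,
 * whose high 64 bits are -1, and indeed hi_64 - s1 = 2 - 3 = -1.
 */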
1682 
1683 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1684 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1685 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1686 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1687 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1688 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1689 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1690 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1691 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1692 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1693 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1694 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1695 GEN_VEXT_VV(vmulh_vv_b, 1)
1696 GEN_VEXT_VV(vmulh_vv_h, 2)
1697 GEN_VEXT_VV(vmulh_vv_w, 4)
1698 GEN_VEXT_VV(vmulh_vv_d, 8)
1699 GEN_VEXT_VV(vmulhu_vv_b, 1)
1700 GEN_VEXT_VV(vmulhu_vv_h, 2)
1701 GEN_VEXT_VV(vmulhu_vv_w, 4)
1702 GEN_VEXT_VV(vmulhu_vv_d, 8)
1703 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1704 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1705 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1706 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1707 
1708 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1709 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1710 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1711 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1712 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1713 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1714 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1715 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1716 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1717 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1718 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1719 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1720 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1721 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1722 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1723 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1724 GEN_VEXT_VX(vmul_vx_b, 1)
1725 GEN_VEXT_VX(vmul_vx_h, 2)
1726 GEN_VEXT_VX(vmul_vx_w, 4)
1727 GEN_VEXT_VX(vmul_vx_d, 8)
1728 GEN_VEXT_VX(vmulh_vx_b, 1)
1729 GEN_VEXT_VX(vmulh_vx_h, 2)
1730 GEN_VEXT_VX(vmulh_vx_w, 4)
1731 GEN_VEXT_VX(vmulh_vx_d, 8)
1732 GEN_VEXT_VX(vmulhu_vx_b, 1)
1733 GEN_VEXT_VX(vmulhu_vx_h, 2)
1734 GEN_VEXT_VX(vmulhu_vx_w, 4)
1735 GEN_VEXT_VX(vmulhu_vx_d, 8)
1736 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1737 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1738 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1739 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1740 
1741 /* Vector Integer Divide Instructions */
1742 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1743 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1744 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1745         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1746 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1747         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1748 
1749 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1750 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1751 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1752 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1753 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1754 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1755 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1756 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1757 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1758 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1759 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1760 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1761 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1762 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1763 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1764 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1765 GEN_VEXT_VV(vdivu_vv_b, 1)
1766 GEN_VEXT_VV(vdivu_vv_h, 2)
1767 GEN_VEXT_VV(vdivu_vv_w, 4)
1768 GEN_VEXT_VV(vdivu_vv_d, 8)
1769 GEN_VEXT_VV(vdiv_vv_b, 1)
1770 GEN_VEXT_VV(vdiv_vv_h, 2)
1771 GEN_VEXT_VV(vdiv_vv_w, 4)
1772 GEN_VEXT_VV(vdiv_vv_d, 8)
1773 GEN_VEXT_VV(vremu_vv_b, 1)
1774 GEN_VEXT_VV(vremu_vv_h, 2)
1775 GEN_VEXT_VV(vremu_vv_w, 4)
1776 GEN_VEXT_VV(vremu_vv_d, 8)
1777 GEN_VEXT_VV(vrem_vv_b, 1)
1778 GEN_VEXT_VV(vrem_vv_h, 2)
1779 GEN_VEXT_VV(vrem_vv_w, 4)
1780 GEN_VEXT_VV(vrem_vv_d, 8)
1781 
1782 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1783 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1784 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1785 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1786 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1787 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1788 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1789 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1790 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1791 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1792 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1793 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1794 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1795 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1796 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1797 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1798 GEN_VEXT_VX(vdivu_vx_b, 1)
1799 GEN_VEXT_VX(vdivu_vx_h, 2)
1800 GEN_VEXT_VX(vdivu_vx_w, 4)
1801 GEN_VEXT_VX(vdivu_vx_d, 8)
1802 GEN_VEXT_VX(vdiv_vx_b, 1)
1803 GEN_VEXT_VX(vdiv_vx_h, 2)
1804 GEN_VEXT_VX(vdiv_vx_w, 4)
1805 GEN_VEXT_VX(vdiv_vx_d, 8)
1806 GEN_VEXT_VX(vremu_vx_b, 1)
1807 GEN_VEXT_VX(vremu_vx_h, 2)
1808 GEN_VEXT_VX(vremu_vx_w, 4)
1809 GEN_VEXT_VX(vremu_vx_d, 8)
1810 GEN_VEXT_VX(vrem_vx_b, 1)
1811 GEN_VEXT_VX(vrem_vx_h, 2)
1812 GEN_VEXT_VX(vrem_vx_w, 4)
1813 GEN_VEXT_VX(vrem_vx_d, 8)
1814 
1815 /* Vector Widening Integer Multiply Instructions */
1816 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1817 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1818 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1819 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1820 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1821 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1822 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1823 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1824 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1825 GEN_VEXT_VV(vwmul_vv_b, 2)
1826 GEN_VEXT_VV(vwmul_vv_h, 4)
1827 GEN_VEXT_VV(vwmul_vv_w, 8)
1828 GEN_VEXT_VV(vwmulu_vv_b, 2)
1829 GEN_VEXT_VV(vwmulu_vv_h, 4)
1830 GEN_VEXT_VV(vwmulu_vv_w, 8)
1831 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1832 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1833 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1834 
1835 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1836 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1837 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1838 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1839 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1840 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1841 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1842 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1843 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1844 GEN_VEXT_VX(vwmul_vx_b, 2)
1845 GEN_VEXT_VX(vwmul_vx_h, 4)
1846 GEN_VEXT_VX(vwmul_vx_w, 8)
1847 GEN_VEXT_VX(vwmulu_vx_b, 2)
1848 GEN_VEXT_VX(vwmulu_vx_h, 4)
1849 GEN_VEXT_VX(vwmulu_vx_w, 8)
1850 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1851 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1852 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1853 
1854 /* Vector Single-Width Integer Multiply-Add Instructions */
1855 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1856 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1857 {                                                                  \
1858     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1859     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1860     TD d = *((TD *)vd + HD(i));                                    \
1861     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1862 }
1863 
1864 #define DO_MACC(N, M, D) (M * N + D)
1865 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1866 #define DO_MADD(N, M, D) (M * D + N)
1867 #define DO_NMSUB(N, M, D) (-(M * D) + N)
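
/*
 * With OPIVV3/OPIVX3 invoking OP(s2, s1, d), these macros expand to the
 * RVV multiply-add forms (vs1 is rs1 for the _vx variants):
 *   DO_MACC:  vd[i] =  (vs1[i] * vs2[i]) + vd[i]
 *   DO_NMSAC: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
 *   DO_MADD:  vd[i] =  (vs1[i] * vd[i])  + vs2[i]
 *   DO_NMSUB: vd[i] = -(vs1[i] * vd[i])  + vs2[i]
 */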
1868 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1869 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1870 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1871 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1872 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1873 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1874 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1875 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1876 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1877 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1878 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1879 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1880 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1881 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1882 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1883 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1884 GEN_VEXT_VV(vmacc_vv_b, 1)
1885 GEN_VEXT_VV(vmacc_vv_h, 2)
1886 GEN_VEXT_VV(vmacc_vv_w, 4)
1887 GEN_VEXT_VV(vmacc_vv_d, 8)
1888 GEN_VEXT_VV(vnmsac_vv_b, 1)
1889 GEN_VEXT_VV(vnmsac_vv_h, 2)
1890 GEN_VEXT_VV(vnmsac_vv_w, 4)
1891 GEN_VEXT_VV(vnmsac_vv_d, 8)
1892 GEN_VEXT_VV(vmadd_vv_b, 1)
1893 GEN_VEXT_VV(vmadd_vv_h, 2)
1894 GEN_VEXT_VV(vmadd_vv_w, 4)
1895 GEN_VEXT_VV(vmadd_vv_d, 8)
1896 GEN_VEXT_VV(vnmsub_vv_b, 1)
1897 GEN_VEXT_VV(vnmsub_vv_h, 2)
1898 GEN_VEXT_VV(vnmsub_vv_w, 4)
1899 GEN_VEXT_VV(vnmsub_vv_d, 8)
1900 
1901 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1902 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1903 {                                                                   \
1904     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1905     TD d = *((TD *)vd + HD(i));                                     \
1906     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1907 }
1908 
1909 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1910 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1911 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1912 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1913 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1914 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1915 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1916 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1917 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1918 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1919 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1920 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1921 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1922 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1923 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1924 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1925 GEN_VEXT_VX(vmacc_vx_b, 1)
1926 GEN_VEXT_VX(vmacc_vx_h, 2)
1927 GEN_VEXT_VX(vmacc_vx_w, 4)
1928 GEN_VEXT_VX(vmacc_vx_d, 8)
1929 GEN_VEXT_VX(vnmsac_vx_b, 1)
1930 GEN_VEXT_VX(vnmsac_vx_h, 2)
1931 GEN_VEXT_VX(vnmsac_vx_w, 4)
1932 GEN_VEXT_VX(vnmsac_vx_d, 8)
1933 GEN_VEXT_VX(vmadd_vx_b, 1)
1934 GEN_VEXT_VX(vmadd_vx_h, 2)
1935 GEN_VEXT_VX(vmadd_vx_w, 4)
1936 GEN_VEXT_VX(vmadd_vx_d, 8)
1937 GEN_VEXT_VX(vnmsub_vx_b, 1)
1938 GEN_VEXT_VX(vnmsub_vx_h, 2)
1939 GEN_VEXT_VX(vnmsub_vx_w, 4)
1940 GEN_VEXT_VX(vnmsub_vx_d, 8)
1941 
1942 /* Vector Widening Integer Multiply-Add Instructions */
1943 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1944 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1945 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1946 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1947 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1948 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1949 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1950 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1951 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1952 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1953 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1954 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1955 GEN_VEXT_VV(vwmacc_vv_b, 2)
1956 GEN_VEXT_VV(vwmacc_vv_h, 4)
1957 GEN_VEXT_VV(vwmacc_vv_w, 8)
1958 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1959 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1960 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1961 
1962 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1963 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1964 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1965 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1968 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1969 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1970 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1971 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1972 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1973 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1974 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1975 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1976 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1977 GEN_VEXT_VX(vwmacc_vx_b, 2)
1978 GEN_VEXT_VX(vwmacc_vx_h, 4)
1979 GEN_VEXT_VX(vwmacc_vx_w, 8)
1980 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1981 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1982 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1983 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1984 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1985 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1986 
1987 /* Vector Integer Merge and Move Instructions */
1988 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1989 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1990                   uint32_t desc)                                     \
1991 {                                                                    \
1992     uint32_t vl = env->vl;                                           \
1993     uint32_t esz = sizeof(ETYPE);                                    \
1994     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1995     uint32_t vta = vext_vta(desc);                                   \
1996     uint32_t i;                                                      \
1997                                                                      \
1998     for (i = env->vstart; i < vl; i++) {                             \
1999         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2000         *((ETYPE *)vd + H(i)) = s1;                                  \
2001     }                                                                \
2002     env->vstart = 0;                                                 \
2003     /* set tail elements to 1s */                                    \
2004     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2005 }
2006 
2007 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2008 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2009 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2010 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2011 
2012 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2013 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2014                   uint32_t desc)                                     \
2015 {                                                                    \
2016     uint32_t vl = env->vl;                                           \
2017     uint32_t esz = sizeof(ETYPE);                                    \
2018     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2019     uint32_t vta = vext_vta(desc);                                   \
2020     uint32_t i;                                                      \
2021                                                                      \
2022     for (i = env->vstart; i < vl; i++) {                             \
2023         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2024     }                                                                \
2025     env->vstart = 0;                                                 \
2026     /* set tail elements to 1s */                                    \
2027     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2028 }
2029 
2030 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2031 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2032 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2033 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2034 
2035 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2036 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2037                   CPURISCVState *env, uint32_t desc)                 \
2038 {                                                                    \
2039     uint32_t vl = env->vl;                                           \
2040     uint32_t esz = sizeof(ETYPE);                                    \
2041     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2042     uint32_t vta = vext_vta(desc);                                   \
2043     uint32_t i;                                                      \
2044                                                                      \
2045     for (i = env->vstart; i < vl; i++) {                             \
2046         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2047         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2048     }                                                                \
2049     env->vstart = 0;                                                 \
2050     /* set tail elements to 1s */                                    \
2051     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2052 }
2053 
2054 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2055 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2056 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2057 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2058 
2059 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2060 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2061                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2062 {                                                                    \
2063     uint32_t vl = env->vl;                                           \
2064     uint32_t esz = sizeof(ETYPE);                                    \
2065     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2066     uint32_t vta = vext_vta(desc);                                   \
2067     uint32_t i;                                                      \
2068                                                                      \
2069     for (i = env->vstart; i < vl; i++) {                             \
2070         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2071         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2072                    (ETYPE)(target_long)s1);                          \
2073         *((ETYPE *)vd + H(i)) = d;                                   \
2074     }                                                                \
2075     env->vstart = 0;                                                 \
2076     /* set tail elements to 1s */                                    \
2077     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2078 }
2079 
2080 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2081 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2082 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2083 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2084 
2085 /*
2086  *** Vector Fixed-Point Arithmetic Instructions
2087  */
2088 
2089 /* Vector Single-Width Saturating Add and Subtract */
2090 
2091 /*
2092  * Fixed-point instructions take a rounding mode and may saturate, so
2093  * define the common fixed-point macros here.
2094  */
2095 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2096                           CPURISCVState *env, int vxrm);
2097 
2098 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2099 static inline void                                                  \
2100 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2101           CPURISCVState *env, int vxrm)                             \
2102 {                                                                   \
2103     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2104     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2105     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2106 }
2107 
2108 static inline void
2109 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2110              CPURISCVState *env,
2111              uint32_t vl, uint32_t vm, int vxrm,
2112              opivv2_rm_fn *fn)
2113 {
2114     for (uint32_t i = env->vstart; i < vl; i++) {
2115         if (!vm && !vext_elem_mask(v0, i)) {
2116             continue;
2117         }
2118         fn(vd, vs1, vs2, i, env, vxrm);
2119     }
2120     env->vstart = 0;
2121 }
2122 
2123 static inline void
2124 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2125              CPURISCVState *env,
2126              uint32_t desc,
2127              opivv2_rm_fn *fn, uint32_t esz)
2128 {
2129     uint32_t vm = vext_vm(desc);
2130     uint32_t vl = env->vl;
2131     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2132     uint32_t vta = vext_vta(desc);
2133 
2134     switch (env->vxrm) {
2135     case 0: /* rnu */
2136         vext_vv_rm_1(vd, v0, vs1, vs2,
2137                      env, vl, vm, 0, fn);
2138         break;
2139     case 1: /* rne */
2140         vext_vv_rm_1(vd, v0, vs1, vs2,
2141                      env, vl, vm, 1, fn);
2142         break;
2143     case 2: /* rdn */
2144         vext_vv_rm_1(vd, v0, vs1, vs2,
2145                      env, vl, vm, 2, fn);
2146         break;
2147     default: /* rod */
2148         vext_vv_rm_1(vd, v0, vs1, vs2,
2149                      env, vl, vm, 3, fn);
2150         break;
2151     }
2152     /* set tail elements to 1s */
2153     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2154 }
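
/*
 * The switch above hands the rounding mode to vext_vv_rm_1 as a
 * literal constant rather than as env->vxrm, presumably so that after
 * inlining the mode checks inside the rounding helpers can be folded
 * away.
 */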
2155 
2156 /* generate helpers for fixed point instructions with OPIVV format */
2157 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2158 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2159                   CPURISCVState *env, uint32_t desc)            \
2160 {                                                               \
2161     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2162                  do_##NAME, ESZ);                               \
2163 }
2164 
2165 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2166 {
2167     uint8_t res = a + b;
2168     if (res < a) {
2169         res = UINT8_MAX;
2170         env->vxsat = 0x1;
2171     }
2172     return res;
2173 }
2174 
2175 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2176                                uint16_t b)
2177 {
2178     uint16_t res = a + b;
2179     if (res < a) {
2180         res = UINT16_MAX;
2181         env->vxsat = 0x1;
2182     }
2183     return res;
2184 }
2185 
2186 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2187                                uint32_t b)
2188 {
2189     uint32_t res = a + b;
2190     if (res < a) {
2191         res = UINT32_MAX;
2192         env->vxsat = 0x1;
2193     }
2194     return res;
2195 }
2196 
2197 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2198                                uint64_t b)
2199 {
2200     uint64_t res = a + b;
2201     if (res < a) {
2202         res = UINT64_MAX;
2203         env->vxsat = 0x1;
2204     }
2205     return res;
2206 }
2207 
2208 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2209 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2210 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2211 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2212 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2213 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2214 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2215 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2216 
2217 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2218                           CPURISCVState *env, int vxrm);
2219 
2220 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2221 static inline void                                                  \
2222 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2223           CPURISCVState *env, int vxrm)                             \
2224 {                                                                   \
2225     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2226     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2227 }
2228 
2229 static inline void
2230 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2231              CPURISCVState *env,
2232              uint32_t vl, uint32_t vm, int vxrm,
2233              opivx2_rm_fn *fn)
2234 {
2235     for (uint32_t i = env->vstart; i < vl; i++) {
2236         if (!vm && !vext_elem_mask(v0, i)) {
2237             continue;
2238         }
2239         fn(vd, s1, vs2, i, env, vxrm);
2240     }
2241     env->vstart = 0;
2242 }
2243 
2244 static inline void
2245 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2246              CPURISCVState *env,
2247              uint32_t desc,
2248              opivx2_rm_fn *fn, uint32_t esz)
2249 {
2250     uint32_t vm = vext_vm(desc);
2251     uint32_t vl = env->vl;
2252     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2253     uint32_t vta = vext_vta(desc);
2254 
2255     switch (env->vxrm) {
2256     case 0: /* rnu */
2257         vext_vx_rm_1(vd, v0, s1, vs2,
2258                      env, vl, vm, 0, fn);
2259         break;
2260     case 1: /* rne */
2261         vext_vx_rm_1(vd, v0, s1, vs2,
2262                      env, vl, vm, 1, fn);
2263         break;
2264     case 2: /* rdn */
2265         vext_vx_rm_1(vd, v0, s1, vs2,
2266                      env, vl, vm, 2, fn);
2267         break;
2268     default: /* rod */
2269         vext_vx_rm_1(vd, v0, s1, vs2,
2270                      env, vl, vm, 3, fn);
2271         break;
2272     }
2273     /* set tail elements to 1s */
2274     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2275 }
2276 
2277 /* generate helpers for fixed point instructions with OPIVX format */
2278 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2279 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2280         void *vs2, CPURISCVState *env, uint32_t desc)     \
2281 {                                                         \
2282     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2283                  do_##NAME, ESZ);                         \
2284 }
2285 
2286 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2287 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2288 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2289 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2290 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2291 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2292 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2293 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2294 
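/*
 * Signed saturating add: (res ^ a) & (res ^ b) & INT_MIN is non-zero
 * exactly when a and b share a sign but the wrapped sum res has the
 * opposite sign, i.e. when the addition overflowed.  The saturating
 * subtract helpers further below use the analogous test
 * (res ^ a) & (a ^ b) & INT_MIN, which fires when the operands differ
 * in sign and the result's sign differs from a's.
 */
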
2295 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2296 {
2297     int8_t res = a + b;
2298     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2299         res = a > 0 ? INT8_MAX : INT8_MIN;
2300         env->vxsat = 0x1;
2301     }
2302     return res;
2303 }
2304 
2305 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2306 {
2307     int16_t res = a + b;
2308     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2309         res = a > 0 ? INT16_MAX : INT16_MIN;
2310         env->vxsat = 0x1;
2311     }
2312     return res;
2313 }
2314 
2315 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2316 {
2317     int32_t res = a + b;
2318     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2319         res = a > 0 ? INT32_MAX : INT32_MIN;
2320         env->vxsat = 0x1;
2321     }
2322     return res;
2323 }
2324 
2325 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2326 {
2327     int64_t res = a + b;
2328     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2329         res = a > 0 ? INT64_MAX : INT64_MIN;
2330         env->vxsat = 0x1;
2331     }
2332     return res;
2333 }
2334 
2335 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2336 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2337 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2338 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2339 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2340 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2341 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2342 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2343 
2344 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2345 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2346 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2347 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2348 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2349 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2350 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2351 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2352 
2353 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2354 {
2355     uint8_t res = a - b;
2356     if (res > a) {
2357         res = 0;
2358         env->vxsat = 0x1;
2359     }
2360     return res;
2361 }
2362 
2363 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2364                                uint16_t b)
2365 {
2366     uint16_t res = a - b;
2367     if (res > a) {
2368         res = 0;
2369         env->vxsat = 0x1;
2370     }
2371     return res;
2372 }
2373 
2374 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2375                                uint32_t b)
2376 {
2377     uint32_t res = a - b;
2378     if (res > a) {
2379         res = 0;
2380         env->vxsat = 0x1;
2381     }
2382     return res;
2383 }
2384 
2385 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2386                                uint64_t b)
2387 {
2388     uint64_t res = a - b;
2389     if (res > a) {
2390         res = 0;
2391         env->vxsat = 0x1;
2392     }
2393     return res;
2394 }
2395 
2396 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2397 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2398 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2399 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2400 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2401 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2402 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2403 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2404 
2405 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2406 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2407 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2408 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2409 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2410 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2411 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2412 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2413 
2414 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2415 {
2416     int8_t res = a - b;
2417     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2418         res = a >= 0 ? INT8_MAX : INT8_MIN;
2419         env->vxsat = 0x1;
2420     }
2421     return res;
2422 }
2423 
2424 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2425 {
2426     int16_t res = a - b;
2427     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2428         res = a >= 0 ? INT16_MAX : INT16_MIN;
2429         env->vxsat = 0x1;
2430     }
2431     return res;
2432 }
2433 
2434 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2435 {
2436     int32_t res = a - b;
2437     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2438         res = a >= 0 ? INT32_MAX : INT32_MIN;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2445 {
2446     int64_t res = a - b;
2447     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2448         res = a >= 0 ? INT64_MAX : INT64_MIN;
2449         env->vxsat = 0x1;
2450     }
2451     return res;
2452 }
2453 
2454 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2455 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2456 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2457 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2458 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2459 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2460 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2461 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2462 
2463 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2464 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2465 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2466 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2467 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2468 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2469 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2470 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2471 
2472 /* Vector Single-Width Averaging Add and Subtract */
2473 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2474 {
2475     uint8_t d = extract64(v, shift, 1);
2476     uint8_t d1;
2477     uint64_t D1, D2;
2478 
2479     if (shift == 0 || shift > 64) {
2480         return 0;
2481     }
2482 
2483     d1 = extract64(v, shift - 1, 1);
2484     D1 = extract64(v, 0, shift);
2485     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2486         return d1;
2487     } else if (vxrm == 1) { /* round-to-nearest-even */
2488         if (shift > 1) {
2489             D2 = extract64(v, 0, shift - 1);
2490             return d1 & ((D2 != 0) | d);
2491         } else {
2492             return d1 & d;
2493         }
2494     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2495         return !d & (D1 != 0);
2496     }
2497     return 0; /* round-down (truncate) */
2498 }
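
/*
 * Example: discarding two fraction bits (shift = 2) of v = 10, i.e.
 * binary 10.10 or 2.5 after the shift:
 *   rnu (0): returns d1 = 1                   -> (10 >> 2) + 1 = 3
 *   rne (1): returns d1 & ((D2 != 0) | d) = 0 -> 2  (tie goes to even)
 *   rdn (2): returns 0                        -> 2  (truncate)
 *   rod (3): returns !d & (D1 != 0) = 1       -> 3  (make the result
 *                                                    odd when any
 *                                                    discarded bit set)
 */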
2499 
2500 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2501 {
2502     int64_t res = (int64_t)a + b;
2503     uint8_t round = get_round(vxrm, res, 1);
2504 
2505     return (res >> 1) + round;
2506 }
2507 
2508 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2509 {
2510     int64_t res = a + b;
2511     uint8_t round = get_round(vxrm, res, 1);
2512     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2513 
2514     /* With signed overflow, bit 64 is inverse of bit 63. */
2515     return ((res >> 1) ^ over) + round;
2516 }
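
/*
 * Example of the overflow fixup: a = b = INT64_MAX with vxrm = rnu.
 * The 64-bit sum wraps to res = -2, so res >> 1 = -1 and
 * over = INT64_MIN; ((res >> 1) ^ over) = INT64_MAX, which is the true
 * average (INT64_MAX + INT64_MAX) / 2, and round = 0 since bit 0 of
 * res is 0.
 */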
2517 
2518 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2519 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2520 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2521 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2522 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2523 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2524 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2525 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2526 
2527 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2528 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2529 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2530 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2531 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2532 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2533 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2534 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2535 
2536 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2537                                uint32_t a, uint32_t b)
2538 {
2539     uint64_t res = (uint64_t)a + b;
2540     uint8_t round = get_round(vxrm, res, 1);
2541 
2542     return (res >> 1) + round;
2543 }
2544 
2545 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2546                                uint64_t a, uint64_t b)
2547 {
2548     uint64_t res = a + b;
2549     uint8_t round = get_round(vxrm, res, 1);
2550     uint64_t over = (uint64_t)(res < a) << 63;
2551 
2552     return ((res >> 1) | over) + round;
2553 }
2554 
2555 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2556 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2557 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2558 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2559 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2560 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2561 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2562 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2563 
2564 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2565 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2566 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2567 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2568 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2569 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2570 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2571 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2572 
2573 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2574 {
2575     int64_t res = (int64_t)a - b;
2576     uint8_t round = get_round(vxrm, res, 1);
2577 
2578     return (res >> 1) + round;
2579 }
2580 
2581 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2582 {
2583     int64_t res = (int64_t)a - b;
2584     uint8_t round = get_round(vxrm, res, 1);
2585     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2586 
2587     /* With signed overflow, bit 64 is inverse of bit 63. */
2588     return ((res >> 1) ^ over) + round;
2589 }
2590 
2591 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2592 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2593 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2594 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2595 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2596 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2597 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2598 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2599 
2600 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2601 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2602 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2603 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2604 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2605 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2606 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2607 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2608 
2609 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2610                                uint32_t a, uint32_t b)
2611 {
2612     int64_t res = (int64_t)a - b;
2613     uint8_t round = get_round(vxrm, res, 1);
2614 
2615     return (res >> 1) + round;
2616 }
2617 
2618 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2619                                uint64_t a, uint64_t b)
2620 {
2621     uint64_t res = (uint64_t)a - b;
2622     uint8_t round = get_round(vxrm, res, 1);
2623     uint64_t over = (uint64_t)(res > a) << 63;
2624 
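    /*
     * Mirror of aaddu64: 'res > a' detects the borrow of the unsigned
     * subtraction, and OR-ing it in at bit 63 supplies the sign bit that
     * the exact (possibly negative) difference has after the halving shift.
     */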
2625     return ((res >> 1) | over) + round;
2626 }
2627 
2628 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2629 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2630 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2631 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2632 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2633 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2634 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2635 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2636 
2637 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2638 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2639 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2640 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2641 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2642 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2643 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2644 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2645 
2646 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2647 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2648 {
2649     uint8_t round;
2650     int16_t res;
2651 
2652     res = (int16_t)a * (int16_t)b;
2653     round = get_round(vxrm, res, 7);
2654     res   = (res >> 7) + round;
2655 
2656     if (res > INT8_MAX) {
2657         env->vxsat = 0x1;
2658         return INT8_MAX;
2659     } else if (res < INT8_MIN) {
2660         env->vxsat = 0x1;
2661         return INT8_MIN;
2662     } else {
2663         return res;
2664     }
2665 }
2666 
2667 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2668 {
2669     uint8_t round;
2670     int32_t res;
2671 
2672     res = (int32_t)a * (int32_t)b;
2673     round = get_round(vxrm, res, 15);
2674     res   = (res >> 15) + round;
2675 
2676     if (res > INT16_MAX) {
2677         env->vxsat = 0x1;
2678         return INT16_MAX;
2679     } else if (res < INT16_MIN) {
2680         env->vxsat = 0x1;
2681         return INT16_MIN;
2682     } else {
2683         return res;
2684     }
2685 }
2686 
2687 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2688 {
2689     uint8_t round;
2690     int64_t res;
2691 
2692     res = (int64_t)a * (int64_t)b;
2693     round = get_round(vxrm, res, 31);
2694     res   = (res >> 31) + round;
2695 
2696     if (res > INT32_MAX) {
2697         env->vxsat = 0x1;
2698         return INT32_MAX;
2699     } else if (res < INT32_MIN) {
2700         env->vxsat = 0x1;
2701         return INT32_MIN;
2702     } else {
2703         return res;
2704     }
2705 }
2706 
2707 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2708 {
2709     uint8_t round;
2710     uint64_t hi_64, lo_64;
2711     int64_t res;
2712 
2713     if (a == INT64_MIN && b == INT64_MIN) {
2714         env->vxsat = 1;
2715         return INT64_MAX;
2716     }
2717 
2718     muls64(&lo_64, &hi_64, a, b);
2719     round = get_round(vxrm, lo_64, 63);
2720     /*
2721      * Cannot overflow, as there are always
2722      * 2 sign bits after multiply.
2723      */
2724     res = (hi_64 << 1) | (lo_64 >> 63);
2725     if (round) {
2726         if (res == INT64_MAX) {
2727             env->vxsat = 1;
2728         } else {
2729             res += 1;
2730         }
2731     }
2732     return res;
2733 }
2734 
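/*
 * Worked example, for illustration: vsmul treats its SEW-bit operands as
 * Q(SEW-1) fixed-point fractions in [-1, 1).  For SEW = 8, 0.5 is encoded as
 * 0x40, and vsmul8(env, 0, 0x40, 0x40) with vxrm = 0 (rnu) returns 0x20,
 * i.e. 0.25.  The only product that does not fit after the >> (SEW - 1)
 * shift is (-1) * (-1) = +1, which is why vsmul64 above special-cases
 * INT64_MIN * INT64_MIN and saturates to INT64_MAX.
 */
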
2735 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2736 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2737 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2738 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2739 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2740 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2741 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2742 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2743 
2744 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2745 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2746 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2747 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2748 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2749 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2750 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2751 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2752 
2753 /* Vector Single-Width Scaling Shift Instructions */
2754 static inline uint8_t
2755 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2756 {
2757     uint8_t round, shift = b & 0x7;
2758     uint8_t res;
2759 
2760     round = get_round(vxrm, a, shift);
2761     res   = (a >> shift)  + round;
2762     return res;
2763 }
2764 static inline uint16_t
2765 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2766 {
2767     uint8_t round, shift = b & 0xf;
2768     uint16_t res;
2769 
2770     round = get_round(vxrm, a, shift);
2771     res   = (a >> shift)  + round;
2772     return res;
2773 }
2774 static inline uint32_t
2775 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2776 {
2777     uint8_t round, shift = b & 0x1f;
2778     uint32_t res;
2779 
2780     round = get_round(vxrm, a, shift);
2781     res   = (a >> shift)  + round;
2782     return res;
2783 }
2784 static inline uint64_t
2785 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2786 {
2787     uint8_t round, shift = b & 0x3f;
2788     uint64_t res;
2789 
2790     round = get_round(vxrm, a, shift);
2791     res   = (a >> shift)  + round;
2792     return res;
2793 }
2794 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2795 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2796 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2797 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2798 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2799 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2800 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2801 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2802 
2803 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2804 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2805 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2806 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2807 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2808 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2809 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2810 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2811 
2812 static inline int8_t
2813 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2814 {
2815     uint8_t round, shift = b & 0x7;
2816     int8_t res;
2817 
2818     round = get_round(vxrm, a, shift);
2819     res   = (a >> shift)  + round;
2820     return res;
2821 }
2822 static inline int16_t
2823 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2824 {
2825     uint8_t round, shift = b & 0xf;
2826     int16_t res;
2827 
2828     round = get_round(vxrm, a, shift);
2829     res   = (a >> shift)  + round;
2830     return res;
2831 }
2832 static inline int32_t
2833 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2834 {
2835     uint8_t round, shift = b & 0x1f;
2836     int32_t res;
2837 
2838     round = get_round(vxrm, a, shift);
2839     res   = (a >> shift)  + round;
2840     return res;
2841 }
2842 static inline int64_t
2843 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2844 {
2845     uint8_t round, shift = b & 0x3f;
2846     int64_t res;
2847 
2848     round = get_round(vxrm, a, shift);
2849     res   = (a >> shift)  + round;
2850     return res;
2851 }
2852 
2853 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2854 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2855 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2856 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2857 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2858 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2859 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2860 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2861 
2862 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2863 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2864 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2865 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2866 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2867 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2868 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2869 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2870 
2871 /* Vector Narrowing Fixed-Point Clip Instructions */
2872 static inline int8_t
2873 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2874 {
2875     uint8_t round, shift = b & 0xf;
2876     int16_t res;
2877 
2878     round = get_round(vxrm, a, shift);
2879     res   = (a >> shift)  + round;
2880     if (res > INT8_MAX) {
2881         env->vxsat = 0x1;
2882         return INT8_MAX;
2883     } else if (res < INT8_MIN) {
2884         env->vxsat = 0x1;
2885         return INT8_MIN;
2886     } else {
2887         return res;
2888     }
2889 }
2890 
2891 static inline int16_t
2892 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2893 {
2894     uint8_t round, shift = b & 0x1f;
2895     int32_t res;
2896 
2897     round = get_round(vxrm, a, shift);
2898     res   = (a >> shift)  + round;
2899     if (res > INT16_MAX) {
2900         env->vxsat = 0x1;
2901         return INT16_MAX;
2902     } else if (res < INT16_MIN) {
2903         env->vxsat = 0x1;
2904         return INT16_MIN;
2905     } else {
2906         return res;
2907     }
2908 }
2909 
2910 static inline int32_t
2911 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2912 {
2913     uint8_t round, shift = b & 0x3f;
2914     int64_t res;
2915 
2916     round = get_round(vxrm, a, shift);
2917     res   = (a >> shift)  + round;
2918     if (res > INT32_MAX) {
2919         env->vxsat = 0x1;
2920         return INT32_MAX;
2921     } else if (res < INT32_MIN) {
2922         env->vxsat = 0x1;
2923         return INT32_MIN;
2924     } else {
2925         return res;
2926     }
2927 }
2928 
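/*
 * Worked example, for illustration: vnclip8 narrows a 16-bit source to
 * 8 bits.  With a = 0x1234 (4660), b = 4 and vxrm = 0 (rnu), the rounded
 * shift gives 4660 >> 4 = 291; that exceeds INT8_MAX, so the result
 * saturates to 127 and vxsat is set.
 */
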
2929 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2930 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2931 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2932 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2933 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2934 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2935 
2936 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2937 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2938 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2939 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2940 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2941 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2942 
2943 static inline uint8_t
2944 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2945 {
2946     uint8_t round, shift = b & 0xf;
2947     uint16_t res;
2948 
2949     round = get_round(vxrm, a, shift);
2950     res   = (a >> shift)  + round;
2951     if (res > UINT8_MAX) {
2952         env->vxsat = 0x1;
2953         return UINT8_MAX;
2954     } else {
2955         return res;
2956     }
2957 }
2958 
2959 static inline uint16_t
2960 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2961 {
2962     uint8_t round, shift = b & 0x1f;
2963     uint32_t res;
2964 
2965     round = get_round(vxrm, a, shift);
2966     res   = (a >> shift)  + round;
2967     if (res > UINT16_MAX) {
2968         env->vxsat = 0x1;
2969         return UINT16_MAX;
2970     } else {
2971         return res;
2972     }
2973 }
2974 
2975 static inline uint32_t
2976 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2977 {
2978     uint8_t round, shift = b & 0x3f;
2979     uint64_t res;
2980 
2981     round = get_round(vxrm, a, shift);
2982     res   = (a >> shift)  + round;
2983     if (res > UINT32_MAX) {
2984         env->vxsat = 0x1;
2985         return UINT32_MAX;
2986     } else {
2987         return res;
2988     }
2989 }
2990 
2991 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2992 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2993 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2994 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2995 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2996 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2997 
2998 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2999 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3000 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3001 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3002 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3003 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3004 
3005 /*
3006  *** Vector Floating-Point Arithmetic Instructions
3007  */
3008 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3009 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3010 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3011                       CPURISCVState *env)                      \
3012 {                                                              \
3013     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3014     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3015     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3016 }
3017 
3018 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3019 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3020                   void *vs2, CPURISCVState *env,          \
3021                   uint32_t desc)                          \
3022 {                                                         \
3023     uint32_t vm = vext_vm(desc);                          \
3024     uint32_t vl = env->vl;                                \
3025     uint32_t total_elems =                                \
3026         vext_get_total_elems(env, desc, ESZ);             \
3027     uint32_t vta = vext_vta(desc);                        \
3028     uint32_t i;                                           \
3029                                                           \
3030     for (i = env->vstart; i < vl; i++) {                  \
3031         if (!vm && !vext_elem_mask(v0, i)) {              \
3032             continue;                                     \
3033         }                                                 \
3034         do_##NAME(vd, vs1, vs2, i, env);                  \
3035     }                                                     \
3036     env->vstart = 0;                                      \
3037     /* set tail elements to 1s */                         \
3038     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3039                       total_elems * ESZ);                 \
3040 }
3041 
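/*
 * For illustration, a hand expansion of the two macros above for one
 * instruction (assuming OP_UUU_H maps TD/T1/T2 to uint16_t, as for the
 * integer helpers); the real code is produced by the RVVCALL and
 * GEN_VEXT_VV_ENV lines that follow:
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * The generated HELPER(vfadd_vv_h) then walks i from vstart to vl, leaves
 * masked-off elements undisturbed, and fills the tail [vl, total_elems)
 * with all-ones when the tail-agnostic bit is set.
 */
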
3042 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3043 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3044 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3045 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3046 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3047 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3048 
3049 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3050 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3051                       CPURISCVState *env)                      \
3052 {                                                              \
3053     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3054     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3055 }
3056 
3057 #define GEN_VEXT_VF(NAME, ESZ)                            \
3058 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3059                   void *vs2, CPURISCVState *env,          \
3060                   uint32_t desc)                          \
3061 {                                                         \
3062     uint32_t vm = vext_vm(desc);                          \
3063     uint32_t vl = env->vl;                                \
3064     uint32_t total_elems =                                \
3065         vext_get_total_elems(env, desc, ESZ);             \
3066     uint32_t vta = vext_vta(desc);                        \
3067     uint32_t i;                                           \
3068                                                           \
3069     for (i = env->vstart; i < vl; i++) {                  \
3070         if (!vm && !vext_elem_mask(v0, i)) {              \
3071             continue;                                     \
3072         }                                                 \
3073         do_##NAME(vd, s1, vs2, i, env);                   \
3074     }                                                     \
3075     env->vstart = 0;                                      \
3076     /* set tail elements to 1s */                         \
3077     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3078                       total_elems * ESZ);                 \
3079 }
3080 
3081 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3082 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3083 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3084 GEN_VEXT_VF(vfadd_vf_h, 2)
3085 GEN_VEXT_VF(vfadd_vf_w, 4)
3086 GEN_VEXT_VF(vfadd_vf_d, 8)
3087 
3088 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3089 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3090 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3091 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3092 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3093 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3094 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3095 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3096 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3097 GEN_VEXT_VF(vfsub_vf_h, 2)
3098 GEN_VEXT_VF(vfsub_vf_w, 4)
3099 GEN_VEXT_VF(vfsub_vf_d, 8)
3100 
3101 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3102 {
3103     return float16_sub(b, a, s);
3104 }
3105 
3106 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3107 {
3108     return float32_sub(b, a, s);
3109 }
3110 
3111 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3112 {
3113     return float64_sub(b, a, s);
3114 }
3115 
3116 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3117 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3118 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3119 GEN_VEXT_VF(vfrsub_vf_h, 2)
3120 GEN_VEXT_VF(vfrsub_vf_w, 4)
3121 GEN_VEXT_VF(vfrsub_vf_d, 8)
3122 
3123 /* Vector Widening Floating-Point Add/Subtract Instructions */
3124 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3125 {
3126     return float32_add(float16_to_float32(a, true, s),
3127             float16_to_float32(b, true, s), s);
3128 }
3129 
3130 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3131 {
3132     return float64_add(float32_to_float64(a, s),
3133             float32_to_float64(b, s), s);
3135 }
3136 
3137 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3138 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3139 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3140 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3141 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3142 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3143 GEN_VEXT_VF(vfwadd_vf_h, 4)
3144 GEN_VEXT_VF(vfwadd_vf_w, 8)
3145 
3146 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3147 {
3148     return float32_sub(float16_to_float32(a, true, s),
3149             float16_to_float32(b, true, s), s);
3150 }
3151 
3152 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3153 {
3154     return float64_sub(float32_to_float64(a, s),
3155             float32_to_float64(b, s), s);
3157 }
3158 
3159 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3160 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3161 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3162 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3163 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3164 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3165 GEN_VEXT_VF(vfwsub_vf_h, 4)
3166 GEN_VEXT_VF(vfwsub_vf_w, 8)
3167 
3168 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3169 {
3170     return float32_add(a, float16_to_float32(b, true, s), s);
3171 }
3172 
3173 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3174 {
3175     return float64_add(a, float32_to_float64(b, s), s);
3176 }
3177 
3178 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3179 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3180 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3181 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3182 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3183 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3184 GEN_VEXT_VF(vfwadd_wf_h, 4)
3185 GEN_VEXT_VF(vfwadd_wf_w, 8)
3186 
3187 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3188 {
3189     return float32_sub(a, float16_to_float32(b, true, s), s);
3190 }
3191 
3192 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3193 {
3194     return float64_sub(a, float32_to_float64(b, s), s);
3195 }
3196 
3197 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3198 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3199 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3200 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3201 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3202 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3203 GEN_VEXT_VF(vfwsub_wf_h, 4)
3204 GEN_VEXT_VF(vfwsub_wf_w, 8)
3205 
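/*
 * Note on the widening forms above: the .vv/.vf variants promote both
 * SEW-wide inputs (float16_to_float32(x, true, s), float32_to_float64())
 * and do the arithmetic at 2*SEW precision, while the .wv/.wf variants
 * (WOP_WUUU_*) already have a 2*SEW-wide vs2 operand and only convert the
 * other source.
 */
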
3206 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3207 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3208 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3209 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3210 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3211 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3212 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3213 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3214 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3215 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3216 GEN_VEXT_VF(vfmul_vf_h, 2)
3217 GEN_VEXT_VF(vfmul_vf_w, 4)
3218 GEN_VEXT_VF(vfmul_vf_d, 8)
3219 
3220 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3221 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3222 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3223 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3224 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3225 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3226 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3227 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3228 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3229 GEN_VEXT_VF(vfdiv_vf_h, 2)
3230 GEN_VEXT_VF(vfdiv_vf_w, 4)
3231 GEN_VEXT_VF(vfdiv_vf_d, 8)
3232 
3233 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3234 {
3235     return float16_div(b, a, s);
3236 }
3237 
3238 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3239 {
3240     return float32_div(b, a, s);
3241 }
3242 
3243 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3244 {
3245     return float64_div(b, a, s);
3246 }
3247 
3248 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3249 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3250 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3251 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3252 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3253 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3254 
3255 /* Vector Widening Floating-Point Multiply */
3256 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3257 {
3258     return float32_mul(float16_to_float32(a, true, s),
3259             float16_to_float32(b, true, s), s);
3260 }
3261 
3262 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3263 {
3264     return float64_mul(float32_to_float64(a, s),
3265             float32_to_float64(b, s), s);
3267 }
3268 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3269 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3270 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3271 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3272 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3273 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3274 GEN_VEXT_VF(vfwmul_vf_h, 4)
3275 GEN_VEXT_VF(vfwmul_vf_w, 8)
3276 
3277 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3278 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3279 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3280         CPURISCVState *env)                                        \
3281 {                                                                  \
3282     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3283     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3284     TD d = *((TD *)vd + HD(i));                                    \
3285     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3286 }
3287 
3288 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3289 {
3290     return float16_muladd(a, b, d, 0, s);
3291 }
3292 
3293 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3294 {
3295     return float32_muladd(a, b, d, 0, s);
3296 }
3297 
3298 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3299 {
3300     return float64_muladd(a, b, d, 0, s);
3301 }
3302 
3303 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3304 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3305 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3306 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3307 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3308 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3309 
3310 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3311 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3312         CPURISCVState *env)                                       \
3313 {                                                                 \
3314     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3315     TD d = *((TD *)vd + HD(i));                                   \
3316     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3317 }
3318 
3319 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3320 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3321 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3322 GEN_VEXT_VF(vfmacc_vf_h, 2)
3323 GEN_VEXT_VF(vfmacc_vf_w, 4)
3324 GEN_VEXT_VF(vfmacc_vf_d, 8)
3325 
3326 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3327 {
3328     return float16_muladd(a, b, d,
3329             float_muladd_negate_c | float_muladd_negate_product, s);
3330 }
3331 
3332 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3333 {
3334     return float32_muladd(a, b, d,
3335             float_muladd_negate_c | float_muladd_negate_product, s);
3336 }
3337 
3338 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3339 {
3340     return float64_muladd(a, b, d,
3341             float_muladd_negate_c | float_muladd_negate_product, s);
3342 }
3343 
3344 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3345 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3346 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3347 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3348 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3349 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3350 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3351 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3352 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3353 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3354 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3355 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3356 
3357 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3358 {
3359     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3360 }
3361 
3362 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3363 {
3364     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3365 }
3366 
3367 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3368 {
3369     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3370 }
3371 
3372 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3373 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3374 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3375 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3376 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3377 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3378 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3379 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3380 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3381 GEN_VEXT_VF(vfmsac_vf_h, 2)
3382 GEN_VEXT_VF(vfmsac_vf_w, 4)
3383 GEN_VEXT_VF(vfmsac_vf_d, 8)
3384 
3385 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3386 {
3387     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3388 }
3389 
3390 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3391 {
3392     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3393 }
3394 
3395 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3396 {
3397     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3398 }
3399 
3400 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3401 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3402 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3403 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3404 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3405 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3406 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3407 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3408 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3409 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3410 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3411 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3412 
3413 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3414 {
3415     return float16_muladd(d, b, a, 0, s);
3416 }
3417 
3418 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3419 {
3420     return float32_muladd(d, b, a, 0, s);
3421 }
3422 
3423 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3424 {
3425     return float64_muladd(d, b, a, 0, s);
3426 }
3427 
3428 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3429 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3430 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3431 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3432 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3433 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3434 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3435 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3436 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3437 GEN_VEXT_VF(vfmadd_vf_h, 2)
3438 GEN_VEXT_VF(vfmadd_vf_w, 4)
3439 GEN_VEXT_VF(vfmadd_vf_d, 8)
3440 
3441 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3442 {
3443     return float16_muladd(d, b, a,
3444             float_muladd_negate_c | float_muladd_negate_product, s);
3445 }
3446 
3447 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3448 {
3449     return float32_muladd(d, b, a,
3450             float_muladd_negate_c | float_muladd_negate_product, s);
3451 }
3452 
3453 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3454 {
3455     return float64_muladd(d, b, a,
3456             float_muladd_negate_c | float_muladd_negate_product, s);
3457 }
3458 
3459 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3460 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3461 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3462 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3463 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3464 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3465 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3466 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3467 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3468 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3469 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3470 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3471 
3472 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3473 {
3474     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3475 }
3476 
3477 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3478 {
3479     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3480 }
3481 
3482 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3483 {
3484     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3485 }
3486 
3487 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3488 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3489 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3490 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3491 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3492 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3493 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3494 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3495 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3496 GEN_VEXT_VF(vfmsub_vf_h, 2)
3497 GEN_VEXT_VF(vfmsub_vf_w, 4)
3498 GEN_VEXT_VF(vfmsub_vf_d, 8)
3499 
3500 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3501 {
3502     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3503 }
3504 
3505 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3506 {
3507     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3508 }
3509 
3510 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3511 {
3512     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3513 }
3514 
3515 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3516 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3517 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3518 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3519 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3520 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3521 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3522 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3523 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3524 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3525 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3526 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3527 
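/*
 * Summary of the eight single-width FMA forms above (vs1 is the fs1 scalar
 * for the _vf variants):
 *
 *   vfmacc:  vd = +(vs1 * vs2) + vd      vfmadd:  vd = +(vs1 * vd) + vs2
 *   vfnmacc: vd = -(vs1 * vs2) - vd      vfnmadd: vd = -(vs1 * vd) - vs2
 *   vfmsac:  vd = +(vs1 * vs2) - vd      vfmsub:  vd = +(vs1 * vd) - vs2
 *   vfnmsac: vd = -(vs1 * vs2) + vd      vfnmsub: vd = -(vs1 * vd) + vs2
 *
 * The sign changes are applied through the float_muladd_negate_product and
 * float_muladd_negate_c flags rather than by negating the inputs, so the
 * operations stay fused with respect to rounding, NaNs and signed zeros.
 */
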
3528 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3529 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3530 {
3531     return float32_muladd(float16_to_float32(a, true, s),
3532                         float16_to_float32(b, true, s), d, 0, s);
3533 }
3534 
3535 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3536 {
3537     return float64_muladd(float32_to_float64(a, s),
3538                         float32_to_float64(b, s), d, 0, s);
3539 }
3540 
3541 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3542 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3543 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3544 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3545 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3546 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3547 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3548 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3549 
3550 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3551 {
3552     return float32_muladd(float16_to_float32(a, true, s),
3553                         float16_to_float32(b, true, s), d,
3554                         float_muladd_negate_c | float_muladd_negate_product, s);
3555 }
3556 
3557 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3558 {
3559     return float64_muladd(float32_to_float64(a, s),
3560                         float32_to_float64(b, s), d,
3561                         float_muladd_negate_c | float_muladd_negate_product, s);
3562 }
3563 
3564 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3565 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3566 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3567 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3568 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3569 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3570 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3571 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3572 
3573 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3574 {
3575     return float32_muladd(float16_to_float32(a, true, s),
3576                         float16_to_float32(b, true, s), d,
3577                         float_muladd_negate_c, s);
3578 }
3579 
3580 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3581 {
3582     return float64_muladd(float32_to_float64(a, s),
3583                         float32_to_float64(b, s), d,
3584                         float_muladd_negate_c, s);
3585 }
3586 
3587 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3588 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3589 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3590 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3591 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3592 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3593 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3594 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3595 
3596 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3597 {
3598     return float32_muladd(float16_to_float32(a, true, s),
3599                         float16_to_float32(b, true, s), d,
3600                         float_muladd_negate_product, s);
3601 }
3602 
3603 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3604 {
3605     return float64_muladd(float32_to_float64(a, s),
3606                         float32_to_float64(b, s), d,
3607                         float_muladd_negate_product, s);
3608 }
3609 
3610 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3611 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3612 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3613 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3614 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3615 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3616 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3617 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3618 
3619 /* Vector Floating-Point Square-Root Instruction */
3620 /* (TD, T2, TX2) */
3621 #define OP_UU_H uint16_t, uint16_t, uint16_t
3622 #define OP_UU_W uint32_t, uint32_t, uint32_t
3623 #define OP_UU_D uint64_t, uint64_t, uint64_t
3624 
3625 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3626 static void do_##NAME(void *vd, void *vs2, int i,      \
3627         CPURISCVState *env)                            \
3628 {                                                      \
3629     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3630     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3631 }
3632 
3633 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3634 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3635         CPURISCVState *env, uint32_t desc)             \
3636 {                                                      \
3637     uint32_t vm = vext_vm(desc);                       \
3638     uint32_t vl = env->vl;                             \
3639     uint32_t total_elems =                             \
3640         vext_get_total_elems(env, desc, ESZ);          \
3641     uint32_t vta = vext_vta(desc);                     \
3642     uint32_t i;                                        \
3643                                                        \
3644     if (vl == 0) {                                     \
3645         return;                                        \
3646     }                                                  \
3647     for (i = env->vstart; i < vl; i++) {               \
3648         if (!vm && !vext_elem_mask(v0, i)) {           \
3649             continue;                                  \
3650         }                                              \
3651         do_##NAME(vd, vs2, i, env);                    \
3652     }                                                  \
3653     env->vstart = 0;                                   \
3654     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3655                       total_elems * ESZ);              \
3656 }
3657 
3658 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3659 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3660 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3661 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3662 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3663 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3664 
3665 /*
3666  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3667  *
3668  * Adapted from riscv-v-spec recip.c:
3669  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3670  */
3671 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3672 {
3673     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3674     uint64_t exp = extract64(f, frac_size, exp_size);
3675     uint64_t frac = extract64(f, 0, frac_size);
3676 
3677     const uint8_t lookup_table[] = {
3678         52, 51, 50, 48, 47, 46, 44, 43,
3679         42, 41, 40, 39, 38, 36, 35, 34,
3680         33, 32, 31, 30, 30, 29, 28, 27,
3681         26, 25, 24, 23, 23, 22, 21, 20,
3682         19, 19, 18, 17, 16, 16, 15, 14,
3683         14, 13, 12, 12, 11, 10, 10, 9,
3684         9, 8, 7, 7, 6, 6, 5, 4,
3685         4, 3, 3, 2, 2, 1, 1, 0,
3686         127, 125, 123, 121, 119, 118, 116, 114,
3687         113, 111, 109, 108, 106, 105, 103, 102,
3688         100, 99, 97, 96, 95, 93, 92, 91,
3689         90, 88, 87, 86, 85, 84, 83, 82,
3690         80, 79, 78, 77, 76, 75, 74, 73,
3691         72, 71, 70, 70, 69, 68, 67, 66,
3692         65, 64, 63, 63, 62, 61, 60, 59,
3693         59, 58, 57, 56, 56, 55, 54, 53
3694     };
3695     const int precision = 7;
3696 
3697     if (exp == 0 && frac != 0) { /* subnormal */
3698         /* Normalize the subnormal. */
3699         while (extract64(frac, frac_size - 1, 1) == 0) {
3700             exp--;
3701             frac <<= 1;
3702         }
3703 
3704         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3705     }
3706 
3707     int idx = ((exp & 1) << (precision - 1)) |
3708                 (frac >> (frac_size - precision + 1));
3709     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3710                             (frac_size - precision);
3711     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3712 
3713     uint64_t val = 0;
3714     val = deposit64(val, 0, frac_size, out_frac);
3715     val = deposit64(val, frac_size, exp_size, out_exp);
3716     val = deposit64(val, frac_size + exp_size, 1, sign);
3717     return val;
3718 }
3719 
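/*
 * Shape of the estimate, with a worked single-precision example: the 7-bit
 * table index combines the exponent's parity with the top fraction bits,
 * and the output exponent works out to (3 * bias - 1 - exp) / 2.  For
 * f = 4.0f (sign 0, exp 129, frac 0) the index is 0x40,
 * lookup_table[0x40] = 127, and out_exp = (381 - 130) / 2 = 125, giving
 * 0.25 * (1 + 127/128) = 0.498..., a 7-bit approximation of
 * 1/sqrt(4) = 0.5.
 */
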
3720 static float16 frsqrt7_h(float16 f, float_status *s)
3721 {
3722     int exp_size = 5, frac_size = 10;
3723     bool sign = float16_is_neg(f);
3724 
3725     /*
3726      * frsqrt7(sNaN) = canonical NaN
3727      * frsqrt7(-inf) = canonical NaN
3728      * frsqrt7(-normal) = canonical NaN
3729      * frsqrt7(-subnormal) = canonical NaN
3730      */
3731     if (float16_is_signaling_nan(f, s) ||
3732             (float16_is_infinity(f) && sign) ||
3733             (float16_is_normal(f) && sign) ||
3734             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3735         s->float_exception_flags |= float_flag_invalid;
3736         return float16_default_nan(s);
3737     }
3738 
3739     /* frsqrt7(qNaN) = canonical NaN */
3740     if (float16_is_quiet_nan(f, s)) {
3741         return float16_default_nan(s);
3742     }
3743 
3744     /* frsqrt7(+-0) = +-inf */
3745     if (float16_is_zero(f)) {
3746         s->float_exception_flags |= float_flag_divbyzero;
3747         return float16_set_sign(float16_infinity, sign);
3748     }
3749 
3750     /* frsqrt7(+inf) = +0 */
3751     if (float16_is_infinity(f) && !sign) {
3752         return float16_set_sign(float16_zero, sign);
3753     }
3754 
3755     /* +normal, +subnormal */
3756     uint64_t val = frsqrt7(f, exp_size, frac_size);
3757     return make_float16(val);
3758 }
3759 
3760 static float32 frsqrt7_s(float32 f, float_status *s)
3761 {
3762     int exp_size = 8, frac_size = 23;
3763     bool sign = float32_is_neg(f);
3764 
3765     /*
3766      * frsqrt7(sNaN) = canonical NaN
3767      * frsqrt7(-inf) = canonical NaN
3768      * frsqrt7(-normal) = canonical NaN
3769      * frsqrt7(-subnormal) = canonical NaN
3770      */
3771     if (float32_is_signaling_nan(f, s) ||
3772             (float32_is_infinity(f) && sign) ||
3773             (float32_is_normal(f) && sign) ||
3774             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3775         s->float_exception_flags |= float_flag_invalid;
3776         return float32_default_nan(s);
3777     }
3778 
3779     /* frsqrt7(qNaN) = canonical NaN */
3780     if (float32_is_quiet_nan(f, s)) {
3781         return float32_default_nan(s);
3782     }
3783 
3784     /* frsqrt7(+-0) = +-inf */
3785     if (float32_is_zero(f)) {
3786         s->float_exception_flags |= float_flag_divbyzero;
3787         return float32_set_sign(float32_infinity, sign);
3788     }
3789 
3790     /* frsqrt7(+inf) = +0 */
3791     if (float32_is_infinity(f) && !sign) {
3792         return float32_set_sign(float32_zero, sign);
3793     }
3794 
3795     /* +normal, +subnormal */
3796     uint64_t val = frsqrt7(f, exp_size, frac_size);
3797     return make_float32(val);
3798 }
3799 
3800 static float64 frsqrt7_d(float64 f, float_status *s)
3801 {
3802     int exp_size = 11, frac_size = 52;
3803     bool sign = float64_is_neg(f);
3804 
3805     /*
3806      * frsqrt7(sNaN) = canonical NaN
3807      * frsqrt7(-inf) = canonical NaN
3808      * frsqrt7(-normal) = canonical NaN
3809      * frsqrt7(-subnormal) = canonical NaN
3810      */
3811     if (float64_is_signaling_nan(f, s) ||
3812             (float64_is_infinity(f) && sign) ||
3813             (float64_is_normal(f) && sign) ||
3814             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3815         s->float_exception_flags |= float_flag_invalid;
3816         return float64_default_nan(s);
3817     }
3818 
3819     /* frsqrt7(qNaN) = canonical NaN */
3820     if (float64_is_quiet_nan(f, s)) {
3821         return float64_default_nan(s);
3822     }
3823 
3824     /* frsqrt7(+-0) = +-inf */
3825     if (float64_is_zero(f)) {
3826         s->float_exception_flags |= float_flag_divbyzero;
3827         return float64_set_sign(float64_infinity, sign);
3828     }
3829 
3830     /* frsqrt7(+inf) = +0 */
3831     if (float64_is_infinity(f) && !sign) {
3832         return float64_set_sign(float64_zero, sign);
3833     }
3834 
3835     /* +normal, +subnormal */
3836     uint64_t val = frsqrt7(f, exp_size, frac_size);
3837     return make_float64(val);
3838 }
3839 
3840 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3841 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3842 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3843 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3844 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3845 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3846 
3847 /*
3848  * Vector Floating-Point Reciprocal Estimate Instruction
3849  *
3850  * Adapted from riscv-v-spec recip.c:
3851  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3852  */
3853 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3854                       float_status *s)
3855 {
3856     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3857     uint64_t exp = extract64(f, frac_size, exp_size);
3858     uint64_t frac = extract64(f, 0, frac_size);
3859 
3860     const uint8_t lookup_table[] = {
3861         127, 125, 123, 121, 119, 117, 116, 114,
3862         112, 110, 109, 107, 105, 104, 102, 100,
3863         99, 97, 96, 94, 93, 91, 90, 88,
3864         87, 85, 84, 83, 81, 80, 79, 77,
3865         76, 75, 74, 72, 71, 70, 69, 68,
3866         66, 65, 64, 63, 62, 61, 60, 59,
3867         58, 57, 56, 55, 54, 53, 52, 51,
3868         50, 49, 48, 47, 46, 45, 44, 43,
3869         42, 41, 40, 40, 39, 38, 37, 36,
3870         35, 35, 34, 33, 32, 31, 31, 30,
3871         29, 28, 28, 27, 26, 25, 25, 24,
3872         23, 23, 22, 21, 21, 20, 19, 19,
3873         18, 17, 17, 16, 15, 15, 14, 14,
3874         13, 12, 12, 11, 11, 10, 9, 9,
3875         8, 8, 7, 7, 6, 5, 5, 4,
3876         4, 3, 3, 2, 2, 1, 1, 0
3877     };
3878     const int precision = 7;
3879 
3880     if (exp == 0 && frac != 0) { /* subnormal */
3881         /* Normalize the subnormal. */
3882         while (extract64(frac, frac_size - 1, 1) == 0) {
3883             exp--;
3884             frac <<= 1;
3885         }
3886 
3887         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3888 
3889         if (exp != 0 && exp != UINT64_MAX) {
3890             /*
3891              * Overflow to infinity or to the largest finite value
3892              * of the same sign, depending on the sign and rounding mode.
3893              */
3894             s->float_exception_flags |= (float_flag_inexact |
3895                                          float_flag_overflow);
3896 
3897             if ((s->float_rounding_mode == float_round_to_zero) ||
3898                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3899                 ((s->float_rounding_mode == float_round_up) && sign)) {
3900                 /* Return the largest finite value with the input's sign. */
3901                 return (sign << (exp_size + frac_size)) |
3902                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3903             } else {
3904                 /* Return +-inf. */
3905                 return (sign << (exp_size + frac_size)) |
3906                     MAKE_64BIT_MASK(frac_size, exp_size);
3907             }
3908         }
3909     }
3910 
3911     int idx = frac >> (frac_size - precision);
3912     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3913                             (frac_size - precision);
3914     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3915 
3916     if (out_exp == 0 || out_exp == UINT64_MAX) {
3917         /*
3918          * The result is subnormal, but don't raise the underflow exception,
3919          * because there's no additional loss of precision.
3920          */
3921         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3922         if (out_exp == UINT64_MAX) {
3923             out_frac >>= 1;
3924             out_exp = 0;
3925         }
3926     }
3927 
3928     uint64_t val = 0;
3929     val = deposit64(val, 0, frac_size, out_frac);
3930     val = deposit64(val, frac_size, exp_size, out_exp);
3931     val = deposit64(val, frac_size + exp_size, 1, sign);
3932     return val;
3933 }
3934 
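/*
 * Worked single-precision example of the estimate above: the table index is
 * the top 7 fraction bits and the output exponent works out to
 * 2 * bias - 1 - exp.  For f = 2.0f (sign 0, exp 128, frac 0) the index is
 * 0, lookup_table[0] = 127 and out_exp = 254 - 129 = 125, giving
 * 0.25 * (1 + 127/128) = 0.498..., a 7-bit approximation of 1/2.  The
 * out_exp == 0 / UINT64_MAX branch handles inputs so large that the
 * reciprocal becomes subnormal.
 */
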
3935 static float16 frec7_h(float16 f, float_status *s)
3936 {
3937     int exp_size = 5, frac_size = 10;
3938     bool sign = float16_is_neg(f);
3939 
3940     /* frec7(+-inf) = +-0 */
3941     if (float16_is_infinity(f)) {
3942         return float16_set_sign(float16_zero, sign);
3943     }
3944 
3945     /* frec7(+-0) = +-inf */
3946     if (float16_is_zero(f)) {
3947         s->float_exception_flags |= float_flag_divbyzero;
3948         return float16_set_sign(float16_infinity, sign);
3949     }
3950 
3951     /* frec7(sNaN) = canonical NaN */
3952     if (float16_is_signaling_nan(f, s)) {
3953         s->float_exception_flags |= float_flag_invalid;
3954         return float16_default_nan(s);
3955     }
3956 
3957     /* frec7(qNaN) = canonical NaN */
3958     if (float16_is_quiet_nan(f, s)) {
3959         return float16_default_nan(s);
3960     }
3961 
3962     /* +-normal, +-subnormal */
3963     uint64_t val = frec7(f, exp_size, frac_size, s);
3964     return make_float16(val);
3965 }
3966 
3967 static float32 frec7_s(float32 f, float_status *s)
3968 {
3969     int exp_size = 8, frac_size = 23;
3970     bool sign = float32_is_neg(f);
3971 
3972     /* frec7(+-inf) = +-0 */
3973     if (float32_is_infinity(f)) {
3974         return float32_set_sign(float32_zero, sign);
3975     }
3976 
3977     /* frec7(+-0) = +-inf */
3978     if (float32_is_zero(f)) {
3979         s->float_exception_flags |= float_flag_divbyzero;
3980         return float32_set_sign(float32_infinity, sign);
3981     }
3982 
3983     /* frec7(sNaN) = canonical NaN */
3984     if (float32_is_signaling_nan(f, s)) {
3985         s->float_exception_flags |= float_flag_invalid;
3986         return float32_default_nan(s);
3987     }
3988 
3989     /* frec7(qNaN) = canonical NaN */
3990     if (float32_is_quiet_nan(f, s)) {
3991         return float32_default_nan(s);
3992     }
3993 
3994     /* +-normal, +-subnormal */
3995     uint64_t val = frec7(f, exp_size, frac_size, s);
3996     return make_float32(val);
3997 }
3998 
3999 static float64 frec7_d(float64 f, float_status *s)
4000 {
4001     int exp_size = 11, frac_size = 52;
4002     bool sign = float64_is_neg(f);
4003 
4004     /* frec7(+-inf) = +-0 */
4005     if (float64_is_infinity(f)) {
4006         return float64_set_sign(float64_zero, sign);
4007     }
4008 
4009     /* frec7(+-0) = +-inf */
4010     if (float64_is_zero(f)) {
4011         s->float_exception_flags |= float_flag_divbyzero;
4012         return float64_set_sign(float64_infinity, sign);
4013     }
4014 
4015     /* frec7(sNaN) = canonical NaN */
4016     if (float64_is_signaling_nan(f, s)) {
4017         s->float_exception_flags |= float_flag_invalid;
4018         return float64_default_nan(s);
4019     }
4020 
4021     /* frec7(qNaN) = canonical NaN */
4022     if (float64_is_quiet_nan(f, s)) {
4023         return float64_default_nan(s);
4024     }
4025 
4026     /* +-normal, +-subnormal */
4027     uint64_t val = frec7(f, exp_size, frac_size, s);
4028     return make_float64(val);
4029 }
4030 
4031 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4032 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4033 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4034 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4035 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4036 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4037 
4038 /* Vector Floating-Point MIN/MAX Instructions */
4039 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4040 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4041 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4042 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4043 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4044 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4045 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4046 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4047 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4048 GEN_VEXT_VF(vfmin_vf_h, 2)
4049 GEN_VEXT_VF(vfmin_vf_w, 4)
4050 GEN_VEXT_VF(vfmin_vf_d, 8)
4051 
4052 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4053 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4054 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4055 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4056 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4057 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4058 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4059 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4060 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4061 GEN_VEXT_VF(vfmax_vf_h, 2)
4062 GEN_VEXT_VF(vfmax_vf_w, 4)
4063 GEN_VEXT_VF(vfmax_vf_d, 8)
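/*
 * Worked example (illustrative annotation): the minimum_number /
 * maximum_number operations used above return the non-NaN operand when
 * exactly one input is NaN, so vfmin.vv over the pair {NaN, 1.0}
 * produces 1.0; only when both inputs are NaN is the canonical NaN
 * written to the destination element.
 */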
4064 
4065 /* Vector Floating-Point Sign-Injection Instructions */
4066 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4067 {
4068     return deposit64(b, 0, 15, a);
4069 }
4070 
4071 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4072 {
4073     return deposit64(b, 0, 31, a);
4074 }
4075 
4076 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4077 {
4078     return deposit64(b, 0, 63, a);
4079 }
4080 
4081 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4082 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4083 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4084 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4085 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4086 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4087 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4088 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4089 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4090 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4091 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4092 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4093 
4094 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4095 {
4096     return deposit64(~b, 0, 15, a);
4097 }
4098 
4099 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4100 {
4101     return deposit64(~b, 0, 31, a);
4102 }
4103 
4104 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4105 {
4106     return deposit64(~b, 0, 63, a);
4107 }
4108 
4109 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4110 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4111 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4112 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4113 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4114 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4115 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4116 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4117 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4118 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4119 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4120 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4121 
4122 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4123 {
4124     return deposit64(b ^ a, 0, 15, a);
4125 }
4126 
4127 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4128 {
4129     return deposit64(b ^ a, 0, 31, a);
4130 }
4131 
4132 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4133 {
4134     return deposit64(b ^ a, 0, 63, a);
4135 }
4136 
4137 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4138 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4139 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4140 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4141 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4142 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4143 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4144 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4145 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4146 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4147 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4148 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
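/*
 * Worked example (illustrative annotation): with a = 0x3f800000 (1.0f)
 * and b = 0x80000000 (-0.0f):
 *   fsgnj32(a, b)  = 0xbf800000  (-1.0, sign taken from b)
 *   fsgnjn32(a, b) = 0x3f800000  (+1.0, inverted sign of b)
 *   fsgnjx32(a, b) = 0xbf800000  (-1.0, sign is sign(a) XOR sign(b))
 * The deposit64() calls keep the exponent/fraction bits of 'a' and only
 * the sign bit is supplied by 'b' (or its inverse/XOR).
 */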
4149 
4150 /* Vector Floating-Point Compare Instructions */
4151 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4152 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4153                   CPURISCVState *env, uint32_t desc)          \
4154 {                                                             \
4155     uint32_t vm = vext_vm(desc);                              \
4156     uint32_t vl = env->vl;                                    \
4157     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
4158     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4159     uint32_t i;                                               \
4160                                                               \
4161     for (i = env->vstart; i < vl; i++) {                      \
4162         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4163         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4164         if (!vm && !vext_elem_mask(v0, i)) {                  \
4165             continue;                                         \
4166         }                                                     \
4167         vext_set_elem_mask(vd, i,                             \
4168                            DO_OP(s2, s1, &env->fp_status));   \
4169     }                                                         \
4170     env->vstart = 0;                                          \
4171     /* mask destination register is always tail-agnostic */   \
4172     /* set tail elements to 1s */                             \
4173     if (vta_all_1s) {                                         \
4174         for (; i < total_elems; i++) {                        \
4175             vext_set_elem_mask(vd, i, 1);                     \
4176         }                                                     \
4177     }                                                         \
4178 }
4179 
4180 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4181 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4182 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4183 
4184 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4185 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4186                   CPURISCVState *env, uint32_t desc)                \
4187 {                                                                   \
4188     uint32_t vm = vext_vm(desc);                                    \
4189     uint32_t vl = env->vl;                                          \
4190     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
4191     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4192     uint32_t i;                                                     \
4193                                                                     \
4194     for (i = env->vstart; i < vl; i++) {                            \
4195         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4196         if (!vm && !vext_elem_mask(v0, i)) {                        \
4197             continue;                                               \
4198         }                                                           \
4199         vext_set_elem_mask(vd, i,                                   \
4200                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4201     }                                                               \
4202     env->vstart = 0;                                                \
4203     /* mask destination register is always tail-agnostic */         \
4204     /* set tail elements to 1s */                                   \
4205     if (vta_all_1s) {                                               \
4206         for (; i < total_elems; i++) {                              \
4207             vext_set_elem_mask(vd, i, 1);                           \
4208         }                                                           \
4209     }                                                               \
4210 }
4211 
4212 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4213 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4214 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4215 
4216 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4217 {
4218     FloatRelation compare = float16_compare_quiet(a, b, s);
4219     return compare != float_relation_equal;
4220 }
4221 
4222 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4223 {
4224     FloatRelation compare = float32_compare_quiet(a, b, s);
4225     return compare != float_relation_equal;
4226 }
4227 
4228 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4229 {
4230     FloatRelation compare = float64_compare_quiet(a, b, s);
4231     return compare != float_relation_equal;
4232 }
4233 
4234 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4235 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4236 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4237 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4238 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4239 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4240 
4241 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4242 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4243 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4244 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4245 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4246 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4247 
4248 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4249 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4250 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4251 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4252 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4253 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4254 
4255 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4256 {
4257     FloatRelation compare = float16_compare(a, b, s);
4258     return compare == float_relation_greater;
4259 }
4260 
4261 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4262 {
4263     FloatRelation compare = float32_compare(a, b, s);
4264     return compare == float_relation_greater;
4265 }
4266 
4267 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4268 {
4269     FloatRelation compare = float64_compare(a, b, s);
4270     return compare == float_relation_greater;
4271 }
4272 
4273 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4274 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4275 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4276 
4277 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4278 {
4279     FloatRelation compare = float16_compare(a, b, s);
4280     return compare == float_relation_greater ||
4281            compare == float_relation_equal;
4282 }
4283 
4284 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4285 {
4286     FloatRelation compare = float32_compare(a, b, s);
4287     return compare == float_relation_greater ||
4288            compare == float_relation_equal;
4289 }
4290 
4291 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4292 {
4293     FloatRelation compare = float64_compare(a, b, s);
4294     return compare == float_relation_greater ||
4295            compare == float_relation_equal;
4296 }
4297 
4298 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4299 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4300 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
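/*
 * Illustrative summary (added annotation): vmfeq/vmfne use the quiet
 * compare predicates, so a quiet NaN operand simply yields "not equal"
 * without setting any flag, while vmflt/vmfle/vmfgt/vmfge use the
 * signaling float*_lt/float*_le/float*_compare helpers, which raise the
 * invalid flag when either operand is NaN.
 */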
4301 
4302 /* Vector Floating-Point Classify Instruction */
4303 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4304 static void do_##NAME(void *vd, void *vs2, int i)      \
4305 {                                                      \
4306     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4307     *((TD *)vd + HD(i)) = OP(s2);                      \
4308 }
4309 
4310 #define GEN_VEXT_V(NAME, ESZ)                          \
4311 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4312                   CPURISCVState *env, uint32_t desc)   \
4313 {                                                      \
4314     uint32_t vm = vext_vm(desc);                       \
4315     uint32_t vl = env->vl;                             \
4316     uint32_t total_elems =                             \
4317         vext_get_total_elems(env, desc, ESZ);          \
4318     uint32_t vta = vext_vta(desc);                     \
4319     uint32_t i;                                        \
4320                                                        \
4321     for (i = env->vstart; i < vl; i++) {               \
4322         if (!vm && !vext_elem_mask(v0, i)) {           \
4323             continue;                                  \
4324         }                                              \
4325         do_##NAME(vd, vs2, i);                         \
4326     }                                                  \
4327     env->vstart = 0;                                   \
4328     /* set tail elements to 1s */                      \
4329     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4330                       total_elems * ESZ);              \
4331 }
4332 
4333 target_ulong fclass_h(uint64_t frs1)
4334 {
4335     float16 f = frs1;
4336     bool sign = float16_is_neg(f);
4337 
4338     if (float16_is_infinity(f)) {
4339         return sign ? 1 << 0 : 1 << 7;
4340     } else if (float16_is_zero(f)) {
4341         return sign ? 1 << 3 : 1 << 4;
4342     } else if (float16_is_zero_or_denormal(f)) {
4343         return sign ? 1 << 2 : 1 << 5;
4344     } else if (float16_is_any_nan(f)) {
4345         float_status s = { }; /* for snan_bit_is_one */
4346         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4347     } else {
4348         return sign ? 1 << 1 : 1 << 6;
4349     }
4350 }
4351 
4352 target_ulong fclass_s(uint64_t frs1)
4353 {
4354     float32 f = frs1;
4355     bool sign = float32_is_neg(f);
4356 
4357     if (float32_is_infinity(f)) {
4358         return sign ? 1 << 0 : 1 << 7;
4359     } else if (float32_is_zero(f)) {
4360         return sign ? 1 << 3 : 1 << 4;
4361     } else if (float32_is_zero_or_denormal(f)) {
4362         return sign ? 1 << 2 : 1 << 5;
4363     } else if (float32_is_any_nan(f)) {
4364         float_status s = { }; /* for snan_bit_is_one */
4365         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4366     } else {
4367         return sign ? 1 << 1 : 1 << 6;
4368     }
4369 }
4370 
4371 target_ulong fclass_d(uint64_t frs1)
4372 {
4373     float64 f = frs1;
4374     bool sign = float64_is_neg(f);
4375 
4376     if (float64_is_infinity(f)) {
4377         return sign ? 1 << 0 : 1 << 7;
4378     } else if (float64_is_zero(f)) {
4379         return sign ? 1 << 3 : 1 << 4;
4380     } else if (float64_is_zero_or_denormal(f)) {
4381         return sign ? 1 << 2 : 1 << 5;
4382     } else if (float64_is_any_nan(f)) {
4383         float_status s = { }; /* for snan_bit_is_one */
4384         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4385     } else {
4386         return sign ? 1 << 1 : 1 << 6;
4387     }
4388 }
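/*
 * Summary of the result bits produced by the fclass helpers above
 * (added annotation; same encoding as the scalar fclass instruction):
 *   bit 0: -inf        bit 5: +subnormal
 *   bit 1: -normal     bit 6: +normal
 *   bit 2: -subnormal  bit 7: +inf
 *   bit 3: -0          bit 8: signaling NaN
 *   bit 4: +0          bit 9: quiet NaN
 */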
4389 
4390 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4391 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4392 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4393 GEN_VEXT_V(vfclass_v_h, 2)
4394 GEN_VEXT_V(vfclass_v_w, 4)
4395 GEN_VEXT_V(vfclass_v_d, 8)
4396 
4397 /* Vector Floating-Point Merge Instruction */
4398 
4399 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4400 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4401                   CPURISCVState *env, uint32_t desc)          \
4402 {                                                             \
4403     uint32_t vm = vext_vm(desc);                              \
4404     uint32_t vl = env->vl;                                    \
4405     uint32_t esz = sizeof(ETYPE);                             \
4406     uint32_t total_elems =                                    \
4407         vext_get_total_elems(env, desc, esz);                 \
4408     uint32_t vta = vext_vta(desc);                            \
4409     uint32_t i;                                               \
4410                                                               \
4411     for (i = env->vstart; i < vl; i++) {                      \
4412         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4413         *((ETYPE *)vd + H(i))                                 \
4414           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4415     }                                                         \
4416     env->vstart = 0;                                          \
4417     /* set tail elements to 1s */                             \
4418     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4419 }
4420 
4421 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4422 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4423 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
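/*
 * Illustrative note (added annotation): vfmerge.vfm is always encoded
 * with vm = 0 and uses v0 as the mask, so the loop above reduces to
 *   vd[i] = v0.mask[i] ? f[rs1] : vs2[i]
 * i.e. masked-off elements keep the vs2 value and active elements take
 * the scalar.
 */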
4424 
4425 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4426 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4427 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4428 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4429 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4430 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4431 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4432 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4433 
4434 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4435 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4436 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4437 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4438 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4439 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4440 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4441 
4442 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4443 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4444 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4445 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4446 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4447 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4448 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4449 
4450 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4451 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4452 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4453 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4454 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4455 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4456 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4457 
4458 /* Widening Floating-Point/Integer Type-Convert Instructions */
4459 /* (TD, T2, TX2) */
4460 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4461 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4462 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4463 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4464 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4465 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4466 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4467 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4468 
4469 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4470 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4471 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4472 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4473 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4474 
4475 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4476 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4477 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4478 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4479 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4480 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4481 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4482 
4483 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4484 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4485 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4486 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4487 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4488 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4489 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4490 
4491 /*
4492  * vfwcvt.f.f.v vd, vs2, vm
4493  * Convert single-width float to double-width float.
4494  */
4495 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4496 {
4497     return float16_to_float32(a, true, s);
4498 }
4499 
4500 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4501 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4502 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4503 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4504 
4505 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4506 /* (TD, T2, TX2) */
4507 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4508 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4509 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4510 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4511 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4512 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4513 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4514 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4515 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4516 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4517 
4518 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4519 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4520 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4521 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4522 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4523 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4524 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4525 
4526 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4527 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4528 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4529 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4530 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4531 
4532 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4533 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4534 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4535 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4536 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4537 
4538 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4539 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4540 {
4541     return float32_to_float16(a, true, s);
4542 }
4543 
4544 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4545 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4546 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4547 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4548 
4549 /*
4550  *** Vector Reduction Operations
4551  */
4552 /* Vector Single-Width Integer Reduction Instructions */
4553 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4554 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4555         void *vs2, CPURISCVState *env, uint32_t desc)     \
4556 {                                                         \
4557     uint32_t vm = vext_vm(desc);                          \
4558     uint32_t vl = env->vl;                                \
4559     uint32_t esz = sizeof(TD);                            \
4560     uint32_t vlenb = simd_maxsz(desc);                    \
4561     uint32_t vta = vext_vta(desc);                        \
4562     uint32_t i;                                           \
4563     TD s1 =  *((TD *)vs1 + HD(0));                        \
4564                                                           \
4565     for (i = env->vstart; i < vl; i++) {                  \
4566         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4567         if (!vm && !vext_elem_mask(v0, i)) {              \
4568             continue;                                     \
4569         }                                                 \
4570         s1 = OP(s1, (TD)s2);                              \
4571     }                                                     \
4572     *((TD *)vd + HD(0)) = s1;                             \
4573     env->vstart = 0;                                      \
4574     /* set tail elements to 1s */                         \
4575     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4576 }
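/*
 * Worked example (illustrative annotation): for vredsum.vs with
 * vs1[0] = 10 and active vs2 elements {1, 2, 3}, the loop accumulates
 * 10 + 1 + 2 + 3 = 16 into s1 and writes it to vd[0]; masked-off
 * elements are simply skipped and do not affect the result.
 */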
4577 
4578 /* vd[0] = sum(vs1[0], vs2[*]) */
4579 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4580 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4581 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4582 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4583 
4584 /* vd[0] = maxu(vs1[0], vs2[*]) */
4585 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4586 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4587 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4588 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4589 
4590 /* vd[0] = max(vs1[0], vs2[*]) */
4591 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4592 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4593 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4594 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4595 
4596 /* vd[0] = minu(vs1[0], vs2[*]) */
4597 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4598 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4599 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4600 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4601 
4602 /* vd[0] = min(vs1[0], vs2[*]) */
4603 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4604 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4605 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4606 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4607 
4608 /* vd[0] = and(vs1[0], vs2[*]) */
4609 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4610 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4611 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4612 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4613 
4614 /* vd[0] = or(vs1[0], vs2[*]) */
4615 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4616 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4617 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4618 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4619 
4620 /* vd[0] = xor(vs1[0], vs2[*]) */
4621 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4622 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4623 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4624 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4625 
4626 /* Vector Widening Integer Reduction Instructions */
4627 /* signed sum reduction into double-width accumulator */
4628 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4629 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4630 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4631 
4632 /* Unsigned sum reduction into double-width accumulator */
4633 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4634 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4635 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4636 
4637 /* Vector Single-Width Floating-Point Reduction Instructions */
4638 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4639 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4640                   void *vs2, CPURISCVState *env,           \
4641                   uint32_t desc)                           \
4642 {                                                          \
4643     uint32_t vm = vext_vm(desc);                           \
4644     uint32_t vl = env->vl;                                 \
4645     uint32_t esz = sizeof(TD);                             \
4646     uint32_t vlenb = simd_maxsz(desc);                     \
4647     uint32_t vta = vext_vta(desc);                         \
4648     uint32_t i;                                            \
4649     TD s1 =  *((TD *)vs1 + HD(0));                         \
4650                                                            \
4651     for (i = env->vstart; i < vl; i++) {                   \
4652         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4653         if (!vm && !vext_elem_mask(v0, i)) {               \
4654             continue;                                      \
4655         }                                                  \
4656         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4657     }                                                      \
4658     *((TD *)vd + HD(0)) = s1;                              \
4659     env->vstart = 0;                                       \
4660     /* set tail elements to 1s */                          \
4661     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4662 }
4663 
4664 /* Unordered sum */
4665 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4666 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4667 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4668 
4669 /* Maximum value */
4670 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4671 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4672 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4673 
4674 /* Minimum value */
4675 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4676 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4677 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4678 
4679 /* Vector Widening Floating-Point Reduction Instructions */
4680 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4681 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4682                             void *vs2, CPURISCVState *env, uint32_t desc)
4683 {
4684     uint32_t vm = vext_vm(desc);
4685     uint32_t vl = env->vl;
4686     uint32_t esz = sizeof(uint32_t);
4687     uint32_t vlenb = simd_maxsz(desc);
4688     uint32_t vta = vext_vta(desc);
4689     uint32_t i;
4690     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4691 
4692     for (i = env->vstart; i < vl; i++) {
4693         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4694         if (!vm && !vext_elem_mask(v0, i)) {
4695             continue;
4696         }
4697         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4698                          &env->fp_status);
4699     }
4700     *((uint32_t *)vd + H4(0)) = s1;
4701     env->vstart = 0;
4702     /* set tail elements to 1s */
4703     vext_set_elems_1s(vd, vta, esz, vlenb);
4704 }
4705 
4706 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4707                             void *vs2, CPURISCVState *env, uint32_t desc)
4708 {
4709     uint32_t vm = vext_vm(desc);
4710     uint32_t vl = env->vl;
4711     uint32_t esz = sizeof(uint64_t);
4712     uint32_t vlenb = simd_maxsz(desc);
4713     uint32_t vta = vext_vta(desc);
4714     uint32_t i;
4715     uint64_t s1 =  *((uint64_t *)vs1);
4716 
4717     for (i = env->vstart; i < vl; i++) {
4718         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4719         if (!vm && !vext_elem_mask(v0, i)) {
4720             continue;
4721         }
4722         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4723                          &env->fp_status);
4724     }
4725     *((uint64_t *)vd) = s1;
4726     env->vstart = 0;
4727     /* set tail elements to 1s */
4728     vext_set_elems_1s(vd, vta, esz, vlenb);
4729 }
4730 
4731 /*
4732  *** Vector Mask Operations
4733  */
4734 /* Vector Mask-Register Logical Instructions */
4735 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4736 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4737                   void *vs2, CPURISCVState *env,          \
4738                   uint32_t desc)                          \
4739 {                                                         \
4740     uint32_t vl = env->vl;                                \
4741     uint32_t total_elems = env_archcpu(env)->cfg.vlen;    \
4742     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4743     uint32_t i;                                           \
4744     int a, b;                                             \
4745                                                           \
4746     for (i = env->vstart; i < vl; i++) {                  \
4747         a = vext_elem_mask(vs1, i);                       \
4748         b = vext_elem_mask(vs2, i);                       \
4749         vext_set_elem_mask(vd, i, OP(b, a));              \
4750     }                                                     \
4751     env->vstart = 0;                                      \
4752     /* mask destination register is always tail-          \
4753      * agnostic                                           \
4754      */                                                   \
4755     /* set tail elements to 1s */                         \
4756     if (vta_all_1s) {                                     \
4757         for (; i < total_elems; i++) {                    \
4758             vext_set_elem_mask(vd, i, 1);                 \
4759         }                                                 \
4760     }                                                     \
4761 }
4762 
4763 #define DO_NAND(N, M)  (!(N & M))
4764 #define DO_ANDNOT(N, M)  (N & !M)
4765 #define DO_NOR(N, M)  (!(N | M))
4766 #define DO_ORNOT(N, M)  (N | !M)
4767 #define DO_XNOR(N, M)  (!(N ^ M))
4768 
4769 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4770 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4771 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4772 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4773 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4774 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4775 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4776 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
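/*
 * Illustrative mapping (added annotation; note OP is applied as
 * OP(vs2_bit, vs1_bit)):
 *   vmandn.mm: vd = vs2 & ~vs1     vmorn.mm: vd = vs2 | ~vs1
 *   vmnand.mm: vd = ~(vs2 & vs1)   vmnor.mm: vd = ~(vs2 | vs1)
 *   vmxnor.mm: vd = ~(vs2 ^ vs1)
 * The '!' in the DO_* macros is safe because mask bits are only 0 or 1.
 */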
4777 
4778 /* Vector count population in mask vcpop */
4779 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4780                              uint32_t desc)
4781 {
4782     target_ulong cnt = 0;
4783     uint32_t vm = vext_vm(desc);
4784     uint32_t vl = env->vl;
4785     int i;
4786 
4787     for (i = env->vstart; i < vl; i++) {
4788         if (vm || vext_elem_mask(v0, i)) {
4789             if (vext_elem_mask(vs2, i)) {
4790                 cnt++;
4791             }
4792         }
4793     }
4794     env->vstart = 0;
4795     return cnt;
4796 }
4797 
4798 /* vfirst find-first-set mask bit */
4799 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4800                               uint32_t desc)
4801 {
4802     uint32_t vm = vext_vm(desc);
4803     uint32_t vl = env->vl;
4804     int i;
4805 
4806     for (i = env->vstart; i < vl; i++) {
4807         if (vm || vext_elem_mask(v0, i)) {
4808             if (vext_elem_mask(vs2, i)) {
4809                 return i;
4810             }
4811         }
4812     }
4813     env->vstart = 0;
4814     return -1LL;
4815 }
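/*
 * Illustrative note (added annotation): vcpop.m counts the set bits of
 * vs2 among the active elements in [vstart, vl), and vfirst.m returns
 * the index of the first such set bit, or -1 (all ones in rd) when none
 * is found.
 */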
4816 
4817 enum set_mask_type {
4818     ONLY_FIRST = 1,
4819     INCLUDE_FIRST,
4820     BEFORE_FIRST,
4821 };
4822 
4823 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4824                    uint32_t desc, enum set_mask_type type)
4825 {
4826     uint32_t vm = vext_vm(desc);
4827     uint32_t vl = env->vl;
4828     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4829     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4830     int i;
4831     bool first_mask_bit = false;
4832 
4833     for (i = env->vstart; i < vl; i++) {
4834         if (!vm && !vext_elem_mask(v0, i)) {
4835             continue;
4836         }
4837         /* write a zero to all following active elements */
4838         if (first_mask_bit) {
4839             vext_set_elem_mask(vd, i, 0);
4840             continue;
4841         }
4842         if (vext_elem_mask(vs2, i)) {
4843             first_mask_bit = true;
4844             if (type == BEFORE_FIRST) {
4845                 vext_set_elem_mask(vd, i, 0);
4846             } else {
4847                 vext_set_elem_mask(vd, i, 1);
4848             }
4849         } else {
4850             if (type == ONLY_FIRST) {
4851                 vext_set_elem_mask(vd, i, 0);
4852             } else {
4853                 vext_set_elem_mask(vd, i, 1);
4854             }
4855         }
4856     }
4857     env->vstart = 0;
4858     /* mask destination register is always tail-agnostic */
4859     /* set tail elements to 1s */
4860     if (vta_all_1s) {
4861         for (; i < total_elems; i++) {
4862             vext_set_elem_mask(vd, i, 1);
4863         }
4864     }
4865 }
4866 
4867 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4868                      uint32_t desc)
4869 {
4870     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4871 }
4872 
4873 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4874                      uint32_t desc)
4875 {
4876     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4877 }
4878 
4879 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4880                      uint32_t desc)
4881 {
4882     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4883 }
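/*
 * Worked example (illustrative annotation): with vs2 mask bits
 * {0,0,1,0,1,...} the first set bit is at element 2, so for the active
 * elements:
 *   vmsbf.m (BEFORE_FIRST)  -> {1,1,0,0,0,...}
 *   vmsif.m (INCLUDE_FIRST) -> {1,1,1,0,0,...}
 *   vmsof.m (ONLY_FIRST)    -> {0,0,1,0,0,...}
 */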
4884 
4885 /* Vector Iota Instruction */
4886 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4887 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4888                   uint32_t desc)                                          \
4889 {                                                                         \
4890     uint32_t vm = vext_vm(desc);                                          \
4891     uint32_t vl = env->vl;                                                \
4892     uint32_t esz = sizeof(ETYPE);                                         \
4893     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4894     uint32_t vta = vext_vta(desc);                                        \
4895     uint32_t sum = 0;                                                     \
4896     int i;                                                                \
4897                                                                           \
4898     for (i = env->vstart; i < vl; i++) {                                  \
4899         if (!vm && !vext_elem_mask(v0, i)) {                              \
4900             continue;                                                     \
4901         }                                                                 \
4902         *((ETYPE *)vd + H(i)) = sum;                                      \
4903         if (vext_elem_mask(vs2, i)) {                                     \
4904             sum++;                                                        \
4905         }                                                                 \
4906     }                                                                     \
4907     env->vstart = 0;                                                      \
4908     /* set tail elements to 1s */                                         \
4909     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4910 }
4911 
4912 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4913 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4914 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4915 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
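/*
 * Worked example (illustrative annotation): viota.m over vs2 mask bits
 * {1,0,1,0,1} writes the running count of preceding set bits, i.e.
 * vd = {0,1,1,2,2} for the active elements.
 */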
4916 
4917 /* Vector Element Index Instruction */
4918 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4919 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4920 {                                                                         \
4921     uint32_t vm = vext_vm(desc);                                          \
4922     uint32_t vl = env->vl;                                                \
4923     uint32_t esz = sizeof(ETYPE);                                         \
4924     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4925     uint32_t vta = vext_vta(desc);                                        \
4926     int i;                                                                \
4927                                                                           \
4928     for (i = env->vstart; i < vl; i++) {                                  \
4929         if (!vm && !vext_elem_mask(v0, i)) {                              \
4930             continue;                                                     \
4931         }                                                                 \
4932         *((ETYPE *)vd + H(i)) = i;                                        \
4933     }                                                                     \
4934     env->vstart = 0;                                                      \
4935     /* set tail elements to 1s */                                         \
4936     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4937 }
4938 
4939 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4940 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4941 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4942 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4943 
4944 /*
4945  *** Vector Permutation Instructions
4946  */
4947 
4948 /* Vector Slide Instructions */
4949 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4950 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4951                   CPURISCVState *env, uint32_t desc)                      \
4952 {                                                                         \
4953     uint32_t vm = vext_vm(desc);                                          \
4954     uint32_t vl = env->vl;                                                \
4955     uint32_t esz = sizeof(ETYPE);                                         \
4956     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4957     uint32_t vta = vext_vta(desc);                                        \
4958     target_ulong offset = s1, i_min, i;                                   \
4959                                                                           \
4960     i_min = MAX(env->vstart, offset);                                     \
4961     for (i = i_min; i < vl; i++) {                                        \
4962         if (!vm && !vext_elem_mask(v0, i)) {                              \
4963             continue;                                                     \
4964         }                                                                 \
4965         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4966     }                                                                     \
    env->vstart = 0;                                                      \
4967     /* set tail elements to 1s */                                         \
4968     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4969 }
4970 
4971 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4972 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4973 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4974 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4975 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4976 
4977 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4978 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4979                   CPURISCVState *env, uint32_t desc)                      \
4980 {                                                                         \
4981     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4982     uint32_t vm = vext_vm(desc);                                          \
4983     uint32_t vl = env->vl;                                                \
4984     uint32_t esz = sizeof(ETYPE);                                         \
4985     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4986     uint32_t vta = vext_vta(desc);                                        \
4987     target_ulong i_max, i;                                                \
4988                                                                           \
4989     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4990     for (i = env->vstart; i < i_max; ++i) {                               \
4991         if (vm || vext_elem_mask(v0, i)) {                                \
4992             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4993         }                                                                 \
4994     }                                                                     \
4995                                                                           \
4996     for (i = i_max; i < vl; ++i) {                                        \
4997         if (vm || vext_elem_mask(v0, i)) {                                \
4998             *((ETYPE *)vd + H(i)) = 0;                                    \
4999         }                                                                 \
5000     }                                                                     \
5001                                                                           \
5002     env->vstart = 0;                                                      \
5003     /* set tail elements to 1s */                                         \
5004     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5005 }
5006 
5007 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5008 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5009 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5010 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5011 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
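/*
 * Illustrative example (added annotation): with rs1 = 2, vl = 6 and
 * vlmax = 6, i_max = min(vlmax - 2, vl) = 4, so the first loop copies
 * vs2[2..5] into vd[0..3] and the second loop zeroes the active
 * elements vd[4..5]; elements past vlmax are never read.
 */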
5012 
5013 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5014 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5015                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5016 {                                                                           \
5017     typedef uint##BITWIDTH##_t ETYPE;                                       \
5018     uint32_t vm = vext_vm(desc);                                            \
5019     uint32_t vl = env->vl;                                                  \
5020     uint32_t esz = sizeof(ETYPE);                                           \
5021     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5022     uint32_t vta = vext_vta(desc);                                          \
5023     uint32_t i;                                                             \
5024                                                                             \
5025     for (i = env->vstart; i < vl; i++) {                                    \
5026         if (!vm && !vext_elem_mask(v0, i)) {                                \
5027             continue;                                                       \
5028         }                                                                   \
5029         if (i == 0) {                                                       \
5030             *((ETYPE *)vd + H(i)) = s1;                                     \
5031         } else {                                                            \
5032             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5033         }                                                                   \
5034     }                                                                       \
5035     env->vstart = 0;                                                        \
5036     /* set tail elements to 1s */                                           \
5037     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5038 }
5039 
5040 GEN_VEXT_VSLIE1UP(8,  H1)
5041 GEN_VEXT_VSLIE1UP(16, H2)
5042 GEN_VEXT_VSLIE1UP(32, H4)
5043 GEN_VEXT_VSLIE1UP(64, H8)
5044 
5045 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5046 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5047                   CPURISCVState *env, uint32_t desc)              \
5048 {                                                                 \
5049     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5050 }
5051 
5052 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5053 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5054 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5055 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5056 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5057 
5058 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5059 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5060                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5061 {                                                                             \
5062     typedef uint##BITWIDTH##_t ETYPE;                                         \
5063     uint32_t vm = vext_vm(desc);                                              \
5064     uint32_t vl = env->vl;                                                    \
5065     uint32_t esz = sizeof(ETYPE);                                             \
5066     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5067     uint32_t vta = vext_vta(desc);                                            \
5068     uint32_t i;                                                               \
5069                                                                               \
5070     for (i = env->vstart; i < vl; i++) {                                      \
5071         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5072             continue;                                                         \
5073         }                                                                     \
5074         if (i == vl - 1) {                                                    \
5075             *((ETYPE *)vd + H(i)) = s1;                                       \
5076         } else {                                                              \
5077             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5078         }                                                                     \
5079     }                                                                         \
5080     env->vstart = 0;                                                          \
5081     /* set tail elements to 1s */                                             \
5082     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5083 }
5084 
5085 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5086 GEN_VEXT_VSLIDE1DOWN(16, H2)
5087 GEN_VEXT_VSLIDE1DOWN(32, H4)
5088 GEN_VEXT_VSLIDE1DOWN(64, H8)
5089 
5090 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5091 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5092                   CPURISCVState *env, uint32_t desc)              \
5093 {                                                                 \
5094     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5095 }
5096 
5097 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5098 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5099 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5100 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5101 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5102 
5103 /* Vector Floating-Point Slide Instructions */
5104 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5105 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5106                   CPURISCVState *env, uint32_t desc)          \
5107 {                                                             \
5108     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5109 }
5110 
5111 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5112 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5113 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5114 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5115 
5116 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5117 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5118                   CPURISCVState *env, uint32_t desc)          \
5119 {                                                             \
5120     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5121 }
5122 
5123 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5124 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5125 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5126 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5127 
5128 /* Vector Register Gather Instruction */
5129 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5130 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5131                   CPURISCVState *env, uint32_t desc)                      \
5132 {                                                                         \
5133     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5134     uint32_t vm = vext_vm(desc);                                          \
5135     uint32_t vl = env->vl;                                                \
5136     uint32_t esz = sizeof(TS2);                                           \
5137     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5138     uint32_t vta = vext_vta(desc);                                        \
5139     uint64_t index;                                                       \
5140     uint32_t i;                                                           \
5141                                                                           \
5142     for (i = env->vstart; i < vl; i++) {                                  \
5143         if (!vm && !vext_elem_mask(v0, i)) {                              \
5144             continue;                                                     \
5145         }                                                                 \
5146         index = *((TS1 *)vs1 + HS1(i));                                   \
5147         if (index >= vlmax) {                                             \
5148             *((TS2 *)vd + HS2(i)) = 0;                                    \
5149         } else {                                                          \
5150             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5151         }                                                                 \
5152     }                                                                     \
5153     env->vstart = 0;                                                      \
5154     /* set tail elements to 1s */                                         \
5155     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5156 }
5157 
5158 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5159 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5160 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5161 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5162 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5163 
5164 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5165 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5166 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5167 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
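/*
 * Worked example (illustrative annotation): with vs1 = {3, 0, 1, 8},
 * vs2 = {a, b, c, d} and vlmax = 4, vrgather.vv yields vd = {d, a, b, 0}:
 * each element is an indexed read of vs2, and any out-of-range index
 * (here 8 >= vlmax) produces 0.
 */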
5168 
5169 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5170 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5171                   CPURISCVState *env, uint32_t desc)                      \
5172 {                                                                         \
5173     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5174     uint32_t vm = vext_vm(desc);                                          \
5175     uint32_t vl = env->vl;                                                \
5176     uint32_t esz = sizeof(ETYPE);                                         \
5177     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5178     uint32_t vta = vext_vta(desc);                                        \
5179     uint64_t index = s1;                                                  \
5180     uint32_t i;                                                           \
5181                                                                           \
5182     for (i = env->vstart; i < vl; i++) {                                  \
5183         if (!vm && !vext_elem_mask(v0, i)) {                              \
5184             continue;                                                     \
5185         }                                                                 \
5186         if (index >= vlmax) {                                             \
5187             *((ETYPE *)vd + H(i)) = 0;                                    \
5188         } else {                                                          \
5189             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5190         }                                                                 \
5191     }                                                                     \
5192     env->vstart = 0;                                                      \
5193     /* set tail elements to 1s */                                         \
5194     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5195 }
5196 
5197 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5198 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5199 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5200 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5201 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
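/*
 * Editor's illustrative sketch, not part of the upstream helpers: the
 * scalar semantics that both gather forms above implement, written out
 * on plain C arrays.  The name ref_vrgather_u32 and the fixed 32-bit
 * element type are hypothetical and chosen only for the example.
 */
static inline void ref_vrgather_u32(uint32_t *vd, const uint32_t *index,
                                    const uint32_t *vs2, uint32_t vl,
                                    uint32_t vlmax)
{
    for (uint32_t i = 0; i < vl; i++) {
        /* An out-of-range index yields zero instead of trapping. */
        vd[i] = (index[i] >= vlmax) ? 0 : vs2[index[i]];
    }
}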
5202 
5203 /* Vector Compress Instruction */
5204 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5205 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5206                   CPURISCVState *env, uint32_t desc)                      \
5207 {                                                                         \
5208     uint32_t vl = env->vl;                                                \
5209     uint32_t esz = sizeof(ETYPE);                                         \
5210     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5211     uint32_t vta = vext_vta(desc);                                        \
5212     uint32_t num = 0, i;                                                  \
5213                                                                           \
5214     for (i = env->vstart; i < vl; i++) {                                  \
5215         if (!vext_elem_mask(vs1, i)) {                                    \
5216             continue;                                                     \
5217         }                                                                 \
5218         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5219         num++;                                                            \
5220     }                                                                     \
5221     env->vstart = 0;                                                      \
5222     /* set tail elements to 1s */                                         \
5223     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5224 }
5225 
5226 /* Pack the elements of vs2 selected by the vs1 mask into consecutive elements of vd */
5227 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5228 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5229 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5230 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
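/*
 * Worked example: with vl = 8 and mask vs1 = 0b01010101, the loop above
 * packs vs2[0], vs2[2], vs2[4] and vs2[6] into vd[0..3].  Destination
 * elements from num up to vl - 1 are left untouched by the loop, and the
 * tail elements at vl and beyond are handled by vext_set_elems_1s()
 * according to the tail policy.
 */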
5231 
5232 /* Vector Whole Register Move */
5233 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5234 {
5235     /* EEW = SEW */
5236     uint32_t maxsz = simd_maxsz(desc);
5237     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5238     uint32_t startb = env->vstart * sewb;
5239     uint32_t i = startb;
5240 
    /*
     * Vector data is stored in host-endian 64-bit chunks, so a raw byte
     * copy is only safe from an 8-byte-aligned offset.  On a big-endian
     * host, copy any leading partial chunk through the H1() byte fixup
     * first, then bulk-copy the remaining whole chunks.
     */
    if (HOST_BIG_ENDIAN && i % 8 != 0) {
        uint32_t j = ROUND_UP(i, 8);

        memcpy((uint8_t *)vd + H1(j - 1),
               (uint8_t *)vs2 + H1(j - 1),
               j - i);
        i = j;
    }

    memcpy((uint8_t *)vd + i,
           (uint8_t *)vs2 + i,
           maxsz - i);
5244 
5245     env->vstart = 0;
5246 }
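/*
 * vmvr_v backs the whole-register move instructions vmv1r.v, vmv2r.v,
 * vmv4r.v and vmv8r.v; maxsz as passed in the descriptor is expected to
 * cover the full <nr>-register group being copied.
 */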
5247 
5248 /* Vector Integer Extension */
5249 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5250 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5251                   CPURISCVState *env, uint32_t desc)             \
5252 {                                                                \
5253     uint32_t vl = env->vl;                                       \
5254     uint32_t vm = vext_vm(desc);                                 \
5255     uint32_t esz = sizeof(ETYPE);                                \
5256     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5257     uint32_t vta = vext_vta(desc);                               \
5258     uint32_t i;                                                  \
5259                                                                  \
5260     for (i = env->vstart; i < vl; i++) {                         \
5261         if (!vm && !vext_elem_mask(v0, i)) {                     \
5262             continue;                                            \
5263         }                                                        \
5264         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5265     }                                                            \
5266     env->vstart = 0;                                             \
5267     /* set tail elements to 1s */                                \
5268     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5269 }
5270 
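/*
 * The extension itself comes from the plain C assignment in the macro
 * above: loading through a DTYPE pointer and storing into an ETYPE
 * element zero-extends for the unsigned vzext variants and sign-extends
 * for the signed vsext variants.
 */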
5271 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5272 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5273 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5274 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5275 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5276 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5277 
5278 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5279 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5280 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5281 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5282 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5283 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
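/*
 * The vfN suffix is the ratio between destination and source element
 * widths: e.g. vzext_vf4_w widens 8-bit source elements to the 32-bit
 * destination SEW (a factor of 4).
 */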
5284