xref: /openbmc/qemu/target/riscv/vector_helper.c (revision fd93045e)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
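/*
 * Worked example, assuming VLEN = 128: for a vtype with SEW = 32 and
 * LMUL = 1, vext_get_vlmax() yields VLMAX = 128 / 32 = 4, so a request
 * of s1 = 10 is clamped to vl = 4 while s1 = 3 is granted as-is.  Any
 * reserved vtype bit, non-zero ediv or SEW > ELEN instead sets vill
 * and zeroes vtype and vl.
 */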
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that need a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
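/*
 * For example, on a big-endian host the byte element with guest index 0
 * lives at host byte offset H1(0) = 0 ^ 7 = 7 of the first 64-bit chunk,
 * and 16-bit element 1 sits at uint16_t slot H2(1) = 1 ^ 3 = 2, which
 * reproduces the little-endian element order inside each chunk.
 */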
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
140 /*
141  * Get the maximum number of elements that can be operated on.
142  *
143  * log2_esz: log2 of element size in bytes.
144  */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147     /*
148      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
149      * so vlen in bytes (vlenb) is encoded as maxsz.
150      */
151     uint32_t vlenb = simd_maxsz(desc);
152 
153     /* Return VLMAX */
154     int scale = vext_lmul(desc) - log2_esz;
155     return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
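/*
 * Worked example: with VLEN = 128 (vlenb = 16), LMUL = 1/2 (lmul = -1)
 * and 16-bit elements (log2_esz = 1), scale = -1 - 1 = -2, so
 * VLMAX = 16 >> 2 = 4, matching LMUL * VLEN / SEW = 0.5 * 128 / 16.
 */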
157 
158 /*
159  * Get the total number of elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
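/*
 * For example, a byte load (esz = 1) executed under SEW = 32, LMUL = 1
 * on a VLEN = 128 machine gives emul = 0 - 2 + 0 = -2, which is clamped
 * to 0, so the tail is accounted over one whole register:
 * (16 << 0) / 1 = 16 total byte elements.
 */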
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks watchpoints before the real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
182  * In user mode, there is no watchpoint support now.
183  *
184  * It will trigger an exception if there is no mapping in the TLB
185  * and the page table walk can't fill the TLB entry. The guest software
186  * can then return here after processing the exception, or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
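/*
 * Example, assuming 4 KiB target pages: probing 16 bytes starting at an
 * address whose low bits are 0xffc first probes the 4 bytes up to the
 * page boundary and then the remaining 12 bytes on the next page, so
 * both translations are checked before any data is transferred.
 */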
204 
205 /* set agnostic elements to 1s */
206 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
207                               uint32_t tot)
208 {
209     if (is_agnostic == 0) {
210         /* policy undisturbed */
211         return;
212     }
213     if (tot - cnt == 0) {
214         return;
215     }
216     memset(base + cnt, -1, tot - cnt);
217 }
218 
219 static inline void vext_set_elem_mask(void *v0, int index,
220                                       uint8_t value)
221 {
222     int idx = index / 64;
223     int pos = index % 64;
224     uint64_t old = ((uint64_t *)v0)[idx];
225     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227 
228 /*
229  * Earlier designs (pre-0.9) had a varying number of bits
230  * per mask value (MLEN). In the 0.9 design, MLEN=1.
231  * (Section 4.5)
232  */
233 static inline int vext_elem_mask(void *v0, int index)
234 {
235     int idx = index / 64;
236     int pos = index  % 64;
237     return (((uint64_t *)v0)[idx] >> pos) & 1;
238 }
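/*
 * With MLEN = 1 the mask bit for element 70, for instance, is bit
 * 70 % 64 = 6 of the second 64-bit word (index 70 / 64 = 1) of v0.
 */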
239 
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242                                uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }                                                          \
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 /*
271  *** stride: access vector elements from strided memory
272  */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275                  target_ulong stride, CPURISCVState *env,
276                  uint32_t desc, uint32_t vm,
277                  vext_ldst_elem_fn *ldst_elem,
278                  uint32_t log2_esz, uintptr_t ra)
279 {
280     uint32_t i, k;
281     uint32_t nf = vext_nf(desc);
282     uint32_t max_elems = vext_max_elems(desc, log2_esz);
283     uint32_t esz = 1 << log2_esz;
284     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285     uint32_t vta = vext_vta(desc);
286     uint32_t vma = vext_vma(desc);
287 
288     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
289         k = 0;
290         while (k < nf) {
291             if (!vm && !vext_elem_mask(v0, i)) {
292                 /* set masked-off elements to 1s */
293                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
294                                   (i + k * max_elems + 1) * esz);
295                 k++;
296                 continue;
297             }
298             target_ulong addr = base + stride * i + (k << log2_esz);
299             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
300             k++;
301         }
302     }
303     env->vstart = 0;
304     /* set tail elements to 1s */
305     for (k = 0; k < nf; ++k) {
306         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
307                           (k * max_elems + max_elems) * esz);
308     }
309     if (nf * max_elems % total_elems != 0) {
310         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
311         uint32_t registers_used =
312             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
313         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
314                           registers_used * vlenb);
315     }
316 }
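/*
 * For example, vlse32.v with rs2 = 12 reads field k of element i from
 * base + 12 * i + 4 * k, i.e. every third 32-bit word for a single-field
 * load; zero and negative strides are also legal.
 */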
317 
318 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
319 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
320                   target_ulong stride, CPURISCVState *env,              \
321                   uint32_t desc)                                        \
322 {                                                                       \
323     uint32_t vm = vext_vm(desc);                                        \
324     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
325                      ctzl(sizeof(ETYPE)), GETPC());                     \
326 }
327 
328 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
329 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
330 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
331 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
332 
333 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
334 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
335                   target_ulong stride, CPURISCVState *env,              \
336                   uint32_t desc)                                        \
337 {                                                                       \
338     uint32_t vm = vext_vm(desc);                                        \
339     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
340                      ctzl(sizeof(ETYPE)), GETPC());                     \
341 }
342 
343 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
344 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
345 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
346 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
347 
348 /*
349  *** unit-stride: access elements stored contiguously in memory
350  */
351 
352 /* unmasked unit-stride load and store operation */
353 static void
354 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
355              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
356              uintptr_t ra)
357 {
358     uint32_t i, k;
359     uint32_t nf = vext_nf(desc);
360     uint32_t max_elems = vext_max_elems(desc, log2_esz);
361     uint32_t esz = 1 << log2_esz;
362     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
363     uint32_t vta = vext_vta(desc);
364 
365     /* load bytes from guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375     /* set tail elements to 1s */
376     for (k = 0; k < nf; ++k) {
377         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
378                           (k * max_elems + max_elems) * esz);
379     }
380     if (nf * max_elems % total_elems != 0) {
381         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
382         uint32_t registers_used =
383             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
384         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
385                           registers_used * vlenb);
386     }
387 }
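/*
 * For a segment load such as vlseg3e32.v (nf = 3, 32-bit elements),
 * field k of element i is read from base + (i * 3 + k) * 4 and written
 * to element i of the k-th field's register group of the destination.
 */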
388 
389 /*
390  * A masked unit-stride load or store operation is a special case of a strided
391  * operation with stride = NF * sizeof(MTYPE).
392  */
393 
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
396                          CPURISCVState *env, uint32_t desc)             \
397 {                                                                       \
398     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
399     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
400                      ctzl(sizeof(ETYPE)), GETPC());                     \
401 }                                                                       \
402                                                                         \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
404                   CPURISCVState *env, uint32_t desc)                    \
405 {                                                                       \
406     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
407                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
408 }
409 
410 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414 
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
417                          CPURISCVState *env, uint32_t desc)              \
418 {                                                                        \
419     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
420     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
421                      ctzl(sizeof(ETYPE)), GETPC());                      \
422 }                                                                        \
423                                                                          \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
425                   CPURISCVState *env, uint32_t desc)                     \
426 {                                                                        \
427     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
428                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
429 }
430 
431 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435 
436 /*
437  *** unit stride mask load and store, EEW = 1
438  */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, lde_b,
445                  0, evl, GETPC());
446 }
447 
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449                     CPURISCVState *env, uint32_t desc)
450 {
451     /* evl = ceil(vl/8) */
452     uint8_t evl = (env->vl + 7) >> 3;
453     vext_ldst_us(vd, base, env, desc, ste_b,
454                  0, evl, GETPC());
455 }
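/*
 * For example, with vl = 17 the mask load/store above transfers
 * evl = (17 + 7) / 8 = 3 bytes, i.e. just enough bytes to cover the
 * 17 active mask bits.
 */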
456 
457 /*
458  *** index: access vector elements from indexed memory
459  */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461         uint32_t idx, void *vs2);
462 
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
464 static target_ulong NAME(target_ulong base,            \
465                          uint32_t idx, void *vs2)      \
466 {                                                      \
467     return (base + *((ETYPE *)vs2 + H(idx)));          \
468 }
469 
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
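/*
 * The index elements are read as unsigned values, so e.g. a vluxei16
 * element address is base plus the zero-extended 16-bit index taken
 * from vs2 at the same element position.
 */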
474 
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477                 void *vs2, CPURISCVState *env, uint32_t desc,
478                 vext_get_index_addr get_index_addr,
479                 vext_ldst_elem_fn *ldst_elem,
480                 uint32_t log2_esz, uintptr_t ra)
481 {
482     uint32_t i, k;
483     uint32_t nf = vext_nf(desc);
484     uint32_t vm = vext_vm(desc);
485     uint32_t max_elems = vext_max_elems(desc, log2_esz);
486     uint32_t esz = 1 << log2_esz;
487     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
488     uint32_t vta = vext_vta(desc);
489     uint32_t vma = vext_vma(desc);
490 
491     /* load bytes from guest memory */
492     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
493         k = 0;
494         while (k < nf) {
495             if (!vm && !vext_elem_mask(v0, i)) {
496                 /* set masked-off elements to 1s */
497                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
498                                   (i + k * max_elems + 1) * esz);
499                 k++;
500                 continue;
501             }
502             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
503             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
504             k++;
505         }
506     }
507     env->vstart = 0;
508     /* set tail elements to 1s */
509     for (k = 0; k < nf; ++k) {
510         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
511                           (k * max_elems + max_elems) * esz);
512     }
513     if (nf * max_elems % total_elems != 0) {
514         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
515         uint32_t registers_used =
516             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
517         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
518                           registers_used * vlenb);
519     }
520 }
521 
522 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
523 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
524                   void *vs2, CPURISCVState *env, uint32_t desc)            \
525 {                                                                          \
526     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
527                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
528 }
529 
530 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
538 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
539 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
540 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
541 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
542 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
543 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
544 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
545 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
546 
547 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
548 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
549                   void *vs2, CPURISCVState *env, uint32_t desc)  \
550 {                                                                \
551     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
552                     STORE_FN, ctzl(sizeof(ETYPE)),               \
553                     GETPC());                                    \
554 }
555 
556 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
564 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
565 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
566 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
567 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
568 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
569 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
570 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
571 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
572 
573 /*
574  *** unit-stride fault-only-first load instructions
575  */
576 static inline void
577 vext_ldff(void *vd, void *v0, target_ulong base,
578           CPURISCVState *env, uint32_t desc,
579           vext_ldst_elem_fn *ldst_elem,
580           uint32_t log2_esz, uintptr_t ra)
581 {
582     void *host;
583     uint32_t i, k, vl = 0;
584     uint32_t nf = vext_nf(desc);
585     uint32_t vm = vext_vm(desc);
586     uint32_t max_elems = vext_max_elems(desc, log2_esz);
587     uint32_t esz = 1 << log2_esz;
588     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
589     uint32_t vta = vext_vta(desc);
590     uint32_t vma = vext_vma(desc);
591     target_ulong addr, offset, remain;
592 
593     /* probe every access */
594     for (i = env->vstart; i < env->vl; i++) {
595         if (!vm && !vext_elem_mask(v0, i)) {
596             continue;
597         }
598         addr = adjust_addr(env, base + i * (nf << log2_esz));
599         if (i == 0) {
600             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
601         } else {
602             /* if it triggers an exception, no need to check watchpoint */
603             remain = nf << log2_esz;
604             while (remain > 0) {
605                 offset = -(addr | TARGET_PAGE_MASK);
606                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
607                                          cpu_mmu_index(env, false));
608                 if (host) {
609 #ifdef CONFIG_USER_ONLY
610                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
611                         vl = i;
612                         goto ProbeSuccess;
613                     }
614 #else
615                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
616 #endif
617                 } else {
618                     vl = i;
619                     goto ProbeSuccess;
620                 }
621                 if (remain <= offset) {
622                     break;
623                 }
624                 remain -= offset;
625                 addr = adjust_addr(env, addr + offset);
626             }
627         }
628     }
629 ProbeSuccess:
630     /* load bytes from guest memory */
631     if (vl != 0) {
632         env->vl = vl;
633     }
634     for (i = env->vstart; i < env->vl; i++) {
635         k = 0;
636         while (k < nf) {
637             if (!vm && !vext_elem_mask(v0, i)) {
638                 /* set masked-off elements to 1s */
639                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
640                                   (i + k * max_elems + 1) * esz);
641                 k++;
642                 continue;
643             }
644             target_ulong addr = base + ((i * nf + k) << log2_esz);
645             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
646             k++;
647         }
648     }
649     env->vstart = 0;
650     /* set tail elements to 1s */
651     for (k = 0; k < nf; ++k) {
652         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
653                           (k * max_elems + max_elems) * esz);
654     }
655     if (nf * max_elems % total_elems != 0) {
656         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
657         uint32_t registers_used =
658             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
659         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
660                           registers_used * vlenb);
661     }
662 }
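/*
 * Fault-only-first example: for vle32ff.v with vl = 8 where the page
 * containing element 3 is unmapped, element 0 is still probed (and
 * would trap if it faulted), but the probe loop above silently
 * truncates vl to 3, so only elements 0..2 are loaded and no exception
 * is raised.
 */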
663 
664 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
665 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
666                   CPURISCVState *env, uint32_t desc)      \
667 {                                                         \
668     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
669               ctzl(sizeof(ETYPE)), GETPC());              \
670 }
671 
672 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
673 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
674 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
675 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
676 
677 #define DO_SWAP(N, M) (M)
678 #define DO_AND(N, M)  (N & M)
679 #define DO_XOR(N, M)  (N ^ M)
680 #define DO_OR(N, M)   (N | M)
681 #define DO_ADD(N, M)  (N + M)
682 
683 /* Signed min/max */
684 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
685 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
686 
687 /* Unsigned min/max */
688 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
689 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
690 
691 /*
692  *** load and store whole register instructions
693  */
694 static void
695 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
696                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
697 {
698     uint32_t i, k, off, pos;
699     uint32_t nf = vext_nf(desc);
700     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
701     uint32_t max_elems = vlenb >> log2_esz;
702 
703     k = env->vstart / max_elems;
704     off = env->vstart % max_elems;
705 
706     if (off) {
707         /* load/store rest of elements of current segment pointed to by vstart */
708         for (pos = off; pos < max_elems; pos++, env->vstart++) {
709             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
710             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
711         }
712         k++;
713     }
714 
715     /* load/store elements for rest of segments */
716     for (; k < nf; k++) {
717         for (i = 0; i < max_elems; i++, env->vstart++) {
718             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
719             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
720         }
721     }
722 
723     env->vstart = 0;
724 }
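/*
 * For example, vl2re32.v (nf = 2) with VLEN = 128 always transfers
 * 2 * vlenb = 32 bytes, independent of vl; a non-zero vstart resumes
 * inside the register group, finishing the partial register first and
 * then the remaining whole registers.
 */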
725 
726 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
727 void HELPER(NAME)(void *vd, target_ulong base,       \
728                   CPURISCVState *env, uint32_t desc) \
729 {                                                    \
730     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
731                     ctzl(sizeof(ETYPE)), GETPC());   \
732 }
733 
734 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
735 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
736 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
737 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
738 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
739 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
740 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
741 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
742 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
743 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
744 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
745 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
746 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
747 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
748 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
749 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
750 
751 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
752 void HELPER(NAME)(void *vd, target_ulong base,       \
753                   CPURISCVState *env, uint32_t desc) \
754 {                                                    \
755     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
756                     ctzl(sizeof(ETYPE)), GETPC());   \
757 }
758 
759 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
760 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
761 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
762 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
763 
764 /*
765  *** Vector Integer Arithmetic Instructions
766  */
767 
768 /* expand macro args before macro */
769 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
770 
771 /* (TD, T1, T2, TX1, TX2) */
772 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
773 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
774 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
775 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
776 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
777 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
778 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
779 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
780 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
781 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
782 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
783 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
784 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
785 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
786 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
787 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
788 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
789 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
790 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
791 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
792 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
793 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
794 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
795 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
796 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
797 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
798 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
799 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
800 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
801 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
802 
803 /* operation of two vector elements */
804 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
805 
806 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
807 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
808 {                                                               \
809     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
810     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
811     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
812 }
813 #define DO_SUB(N, M) (N - M)
814 #define DO_RSUB(N, M) (M - N)
815 
816 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
817 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
818 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
819 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
820 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
821 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
822 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
823 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
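/*
 * For illustration, RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 * expands to:
 *
 *     static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *     {
 *         int8_t s1 = *((int8_t *)vs1 + H1(i));
 *         int8_t s2 = *((int8_t *)vs2 + H1(i));
 *         *((int8_t *)vd + H1(i)) = s2 + s1;
 *     }
 */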
824 
825 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
826                        CPURISCVState *env, uint32_t desc,
827                        opivv2_fn *fn, uint32_t esz)
828 {
829     uint32_t vm = vext_vm(desc);
830     uint32_t vl = env->vl;
831     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
832     uint32_t vta = vext_vta(desc);
833     uint32_t vma = vext_vma(desc);
834     uint32_t i;
835 
836     for (i = env->vstart; i < vl; i++) {
837         if (!vm && !vext_elem_mask(v0, i)) {
838             /* set masked-off elements to 1s */
839             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
840             continue;
841         }
842         fn(vd, vs1, vs2, i);
843     }
844     env->vstart = 0;
845     /* set tail elements to 1s */
846     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
847 }
848 
849 /* generate the helpers for OPIVV */
850 #define GEN_VEXT_VV(NAME, ESZ)                            \
851 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
852                   void *vs2, CPURISCVState *env,          \
853                   uint32_t desc)                          \
854 {                                                         \
855     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
856                do_##NAME, ESZ);                           \
857 }
858 
859 GEN_VEXT_VV(vadd_vv_b, 1)
860 GEN_VEXT_VV(vadd_vv_h, 2)
861 GEN_VEXT_VV(vadd_vv_w, 4)
862 GEN_VEXT_VV(vadd_vv_d, 8)
863 GEN_VEXT_VV(vsub_vv_b, 1)
864 GEN_VEXT_VV(vsub_vv_h, 2)
865 GEN_VEXT_VV(vsub_vv_w, 4)
866 GEN_VEXT_VV(vsub_vv_d, 8)
867 
868 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
869 
870 /*
871  * (T1)s1 gives the real operand type.
872  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
873  */
874 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
875 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
876 {                                                                   \
877     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
878     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
879 }
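/*
 * E.g. for vwadd_vx_h (instantiated with WOP_SSS_H below), T1 is int16_t
 * and TX1 is int32_t, so the 64-bit scalar is first truncated to the
 * source element width and then sign-extended to the widened type before
 * the operation.
 */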
880 
881 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
882 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
883 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
884 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
885 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
886 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
887 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
888 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
889 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
890 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
891 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
892 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
893 
894 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
895                        CPURISCVState *env, uint32_t desc,
896                        opivx2_fn fn, uint32_t esz)
897 {
898     uint32_t vm = vext_vm(desc);
899     uint32_t vl = env->vl;
900     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
901     uint32_t vta = vext_vta(desc);
902     uint32_t vma = vext_vma(desc);
903     uint32_t i;
904 
905     for (i = env->vstart; i < vl; i++) {
906         if (!vm && !vext_elem_mask(v0, i)) {
907             /* set masked-off elements to 1s */
908             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
909             continue;
910         }
911         fn(vd, s1, vs2, i);
912     }
913     env->vstart = 0;
914     /* set tail elements to 1s */
915     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
916 }
917 
918 /* generate the helpers for OPIVX */
919 #define GEN_VEXT_VX(NAME, ESZ)                            \
920 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
921                   void *vs2, CPURISCVState *env,          \
922                   uint32_t desc)                          \
923 {                                                         \
924     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
925                do_##NAME, ESZ);                           \
926 }
927 
928 GEN_VEXT_VX(vadd_vx_b, 1)
929 GEN_VEXT_VX(vadd_vx_h, 2)
930 GEN_VEXT_VX(vadd_vx_w, 4)
931 GEN_VEXT_VX(vadd_vx_d, 8)
932 GEN_VEXT_VX(vsub_vx_b, 1)
933 GEN_VEXT_VX(vsub_vx_h, 2)
934 GEN_VEXT_VX(vsub_vx_w, 4)
935 GEN_VEXT_VX(vsub_vx_d, 8)
936 GEN_VEXT_VX(vrsub_vx_b, 1)
937 GEN_VEXT_VX(vrsub_vx_h, 2)
938 GEN_VEXT_VX(vrsub_vx_w, 4)
939 GEN_VEXT_VX(vrsub_vx_d, 8)
940 
941 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
942 {
943     intptr_t oprsz = simd_oprsz(desc);
944     intptr_t i;
945 
946     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
947         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
948     }
949 }
950 
951 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
952 {
953     intptr_t oprsz = simd_oprsz(desc);
954     intptr_t i;
955 
956     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
957         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
958     }
959 }
960 
961 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
962 {
963     intptr_t oprsz = simd_oprsz(desc);
964     intptr_t i;
965 
966     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
967         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
968     }
969 }
970 
971 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
972 {
973     intptr_t oprsz = simd_oprsz(desc);
974     intptr_t i;
975 
976     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
977         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
978     }
979 }
980 
981 /* Vector Widening Integer Add/Subtract */
982 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
983 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
984 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
985 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
986 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
987 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
988 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
989 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
990 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
991 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
992 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
993 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
994 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
995 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
996 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
997 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
998 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
999 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1000 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1001 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1002 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1003 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1004 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1005 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1006 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1008 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1009 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1011 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1012 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1014 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1015 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1017 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1018 GEN_VEXT_VV(vwaddu_vv_b, 2)
1019 GEN_VEXT_VV(vwaddu_vv_h, 4)
1020 GEN_VEXT_VV(vwaddu_vv_w, 8)
1021 GEN_VEXT_VV(vwsubu_vv_b, 2)
1022 GEN_VEXT_VV(vwsubu_vv_h, 4)
1023 GEN_VEXT_VV(vwsubu_vv_w, 8)
1024 GEN_VEXT_VV(vwadd_vv_b, 2)
1025 GEN_VEXT_VV(vwadd_vv_h, 4)
1026 GEN_VEXT_VV(vwadd_vv_w, 8)
1027 GEN_VEXT_VV(vwsub_vv_b, 2)
1028 GEN_VEXT_VV(vwsub_vv_h, 4)
1029 GEN_VEXT_VV(vwsub_vv_w, 8)
1030 GEN_VEXT_VV(vwaddu_wv_b, 2)
1031 GEN_VEXT_VV(vwaddu_wv_h, 4)
1032 GEN_VEXT_VV(vwaddu_wv_w, 8)
1033 GEN_VEXT_VV(vwsubu_wv_b, 2)
1034 GEN_VEXT_VV(vwsubu_wv_h, 4)
1035 GEN_VEXT_VV(vwsubu_wv_w, 8)
1036 GEN_VEXT_VV(vwadd_wv_b, 2)
1037 GEN_VEXT_VV(vwadd_wv_h, 4)
1038 GEN_VEXT_VV(vwadd_wv_w, 8)
1039 GEN_VEXT_VV(vwsub_wv_b, 2)
1040 GEN_VEXT_VV(vwsub_wv_h, 4)
1041 GEN_VEXT_VV(vwsub_wv_w, 8)
1042 
1043 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1044 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1045 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1046 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1047 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1048 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1049 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1050 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1051 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1052 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1053 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1054 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1055 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1057 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1058 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1060 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1061 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1063 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1064 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1066 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1067 GEN_VEXT_VX(vwaddu_vx_b, 2)
1068 GEN_VEXT_VX(vwaddu_vx_h, 4)
1069 GEN_VEXT_VX(vwaddu_vx_w, 8)
1070 GEN_VEXT_VX(vwsubu_vx_b, 2)
1071 GEN_VEXT_VX(vwsubu_vx_h, 4)
1072 GEN_VEXT_VX(vwsubu_vx_w, 8)
1073 GEN_VEXT_VX(vwadd_vx_b, 2)
1074 GEN_VEXT_VX(vwadd_vx_h, 4)
1075 GEN_VEXT_VX(vwadd_vx_w, 8)
1076 GEN_VEXT_VX(vwsub_vx_b, 2)
1077 GEN_VEXT_VX(vwsub_vx_h, 4)
1078 GEN_VEXT_VX(vwsub_vx_w, 8)
1079 GEN_VEXT_VX(vwaddu_wx_b, 2)
1080 GEN_VEXT_VX(vwaddu_wx_h, 4)
1081 GEN_VEXT_VX(vwaddu_wx_w, 8)
1082 GEN_VEXT_VX(vwsubu_wx_b, 2)
1083 GEN_VEXT_VX(vwsubu_wx_h, 4)
1084 GEN_VEXT_VX(vwsubu_wx_w, 8)
1085 GEN_VEXT_VX(vwadd_wx_b, 2)
1086 GEN_VEXT_VX(vwadd_wx_h, 4)
1087 GEN_VEXT_VX(vwadd_wx_w, 8)
1088 GEN_VEXT_VX(vwsub_wx_b, 2)
1089 GEN_VEXT_VX(vwsub_wx_h, 4)
1090 GEN_VEXT_VX(vwsub_wx_w, 8)
1091 
1092 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1093 #define DO_VADC(N, M, C) (N + M + C)
1094 #define DO_VSBC(N, M, C) (N - M - C)
1095 
1096 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1097 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1098                   CPURISCVState *env, uint32_t desc)          \
1099 {                                                             \
1100     uint32_t vl = env->vl;                                    \
1101     uint32_t esz = sizeof(ETYPE);                             \
1102     uint32_t total_elems =                                    \
1103         vext_get_total_elems(env, desc, esz);                 \
1104     uint32_t vta = vext_vta(desc);                            \
1105     uint32_t i;                                               \
1106                                                               \
1107     for (i = env->vstart; i < vl; i++) {                      \
1108         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1109         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1110         ETYPE carry = vext_elem_mask(v0, i);                  \
1111                                                               \
1112         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1113     }                                                         \
1114     env->vstart = 0;                                          \
1115     /* set tail elements to 1s */                             \
1116     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1117 }
1118 
1119 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1120 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1121 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1122 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1123 
1124 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1125 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1126 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1127 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
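/*
 * These helpers always execute unmasked: v0 supplies the carry/borrow
 * input rather than a mask, e.g. vadc.vvm computes
 * vd[i] = vs2[i] + vs1[i] + v0.mask[i] for every body element.
 */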
1128 
1129 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1130 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1131                   CPURISCVState *env, uint32_t desc)                     \
1132 {                                                                        \
1133     uint32_t vl = env->vl;                                               \
1134     uint32_t esz = sizeof(ETYPE);                                        \
1135     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1136     uint32_t vta = vext_vta(desc);                                       \
1137     uint32_t i;                                                          \
1138                                                                          \
1139     for (i = env->vstart; i < vl; i++) {                                 \
1140         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1141         ETYPE carry = vext_elem_mask(v0, i);                             \
1142                                                                          \
1143         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1144     }                                                                    \
1145     env->vstart = 0;                                          \
1146     /* set tail elements to 1s */                                        \
1147     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1148 }
1149 
1150 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1151 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1152 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1153 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1154 
1155 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1156 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1157 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1158 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1159 
1160 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1161                           (__typeof(N))(N + M) < N)
1162 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
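/*
 * Carry-out example with uint8_t operands: 200 + 100 wraps to 44, so
 * DO_MADC reports a carry because (uint8_t)(200 + 100) < 200; with an
 * incoming carry the "+ 1 ... <=" form also catches the wrap to exactly N.
 */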
1163 
1164 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1165 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1166                   CPURISCVState *env, uint32_t desc)          \
1167 {                                                             \
1168     uint32_t vl = env->vl;                                    \
1169     uint32_t vm = vext_vm(desc);                              \
1170     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1171     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1172     uint32_t i;                                               \
1173                                                               \
1174     for (i = env->vstart; i < vl; i++) {                      \
1175         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1177         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1178         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1179     }                                                         \
1180     env->vstart = 0;                                          \
1181     /* mask destination registers are always tail-agnostic */ \
1182     /* set tail elements to 1s */                             \
1183     if (vta_all_1s) {                                         \
1184         for (; i < total_elems; i++) {                        \
1185             vext_set_elem_mask(vd, i, 1);                     \
1186         }                                                     \
1187     }                                                         \
1188 }
1189 
1190 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1191 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1192 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1193 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1194 
1195 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1196 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1197 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1198 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1199 
1200 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1201 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1202                   void *vs2, CPURISCVState *env, uint32_t desc) \
1203 {                                                               \
1204     uint32_t vl = env->vl;                                      \
1205     uint32_t vm = vext_vm(desc);                                \
1206     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1207     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1208     uint32_t i;                                                 \
1209                                                                 \
1210     for (i = env->vstart; i < vl; i++) {                        \
1211         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1212         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1213         vext_set_elem_mask(vd, i,                               \
1214                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1215     }                                                           \
1216     env->vstart = 0;                                            \
1217     /* mask destination registers are always tail-agnostic */   \
1218     /* set tail elements to 1s */                               \
1219     if (vta_all_1s) {                                           \
1220         for (; i < total_elems; i++) {                          \
1221             vext_set_elem_mask(vd, i, 1);                       \
1222         }                                                       \
1223     }                                                           \
1224 }
1225 
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1227 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1228 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1229 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1230 
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1232 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1233 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1234 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1235 
1236 /* Vector Bitwise Logical Instructions */
1237 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1238 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1239 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1240 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1241 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1242 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1243 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1244 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1245 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1246 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1247 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1248 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1249 GEN_VEXT_VV(vand_vv_b, 1)
1250 GEN_VEXT_VV(vand_vv_h, 2)
1251 GEN_VEXT_VV(vand_vv_w, 4)
1252 GEN_VEXT_VV(vand_vv_d, 8)
1253 GEN_VEXT_VV(vor_vv_b, 1)
1254 GEN_VEXT_VV(vor_vv_h, 2)
1255 GEN_VEXT_VV(vor_vv_w, 4)
1256 GEN_VEXT_VV(vor_vv_d, 8)
1257 GEN_VEXT_VV(vxor_vv_b, 1)
1258 GEN_VEXT_VV(vxor_vv_h, 2)
1259 GEN_VEXT_VV(vxor_vv_w, 4)
1260 GEN_VEXT_VV(vxor_vv_d, 8)
1261 
1262 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1263 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1264 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1265 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1266 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1267 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1268 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1269 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1270 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1271 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1272 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1273 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1274 GEN_VEXT_VX(vand_vx_b, 1)
1275 GEN_VEXT_VX(vand_vx_h, 2)
1276 GEN_VEXT_VX(vand_vx_w, 4)
1277 GEN_VEXT_VX(vand_vx_d, 8)
1278 GEN_VEXT_VX(vor_vx_b, 1)
1279 GEN_VEXT_VX(vor_vx_h, 2)
1280 GEN_VEXT_VX(vor_vx_w, 4)
1281 GEN_VEXT_VX(vor_vx_d, 8)
1282 GEN_VEXT_VX(vxor_vx_b, 1)
1283 GEN_VEXT_VX(vxor_vx_h, 2)
1284 GEN_VEXT_VX(vxor_vx_w, 4)
1285 GEN_VEXT_VX(vxor_vx_d, 8)
1286 
1287 /* Vector Single-Width Bit Shift Instructions */
1288 #define DO_SLL(N, M)  (N << (M))
1289 #define DO_SRL(N, M)  (N >> (M))
1290 
1291 /* generate the helpers for shift instructions with two vector operands */
1292 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1293 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1294                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1295 {                                                                         \
1296     uint32_t vm = vext_vm(desc);                                          \
1297     uint32_t vl = env->vl;                                                \
1298     uint32_t esz = sizeof(TS1);                                           \
1299     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1300     uint32_t vta = vext_vta(desc);                                        \
1301     uint32_t vma = vext_vma(desc);                                        \
1302     uint32_t i;                                                           \
1303                                                                           \
1304     for (i = env->vstart; i < vl; i++) {                                  \
1305         if (!vm && !vext_elem_mask(v0, i)) {                              \
1306             /* set masked-off elements to 1s */                           \
1307             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1308             continue;                                                     \
1309         }                                                                 \
1310         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1311         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1312         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1313     }                                                                     \
1314     env->vstart = 0;                                                      \
1315     /* set tail elements to 1s */                                         \
1316     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1317 }
1318 
1319 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1320 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1321 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1322 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1323 
1324 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1325 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1326 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1327 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1328 
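/*
 * vsra_* reuses DO_SRL but gives vs2 a signed element type, so the C
 * right shift is applied to a signed value; as elsewhere in QEMU this
 * assumes the compiler implements signed '>>' as an arithmetic shift.
 */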
1329 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1330 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1331 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1332 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1333 
1334 /* generate the helpers for shift instructions with one vector and one scalar */
1335 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1336 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1337         void *vs2, CPURISCVState *env, uint32_t desc)       \
1338 {                                                           \
1339     uint32_t vm = vext_vm(desc);                            \
1340     uint32_t vl = env->vl;                                  \
1341     uint32_t esz = sizeof(TD);                              \
1342     uint32_t total_elems =                                  \
1343         vext_get_total_elems(env, desc, esz);               \
1344     uint32_t vta = vext_vta(desc);                          \
1345     uint32_t vma = vext_vma(desc);                          \
1346     uint32_t i;                                             \
1347                                                             \
1348     for (i = env->vstart; i < vl; i++) {                    \
1349         if (!vm && !vext_elem_mask(v0, i)) {                \
1350             /* set masked-off elements to 1s */             \
1351             vext_set_elems_1s(vd, vma, i * esz,             \
1352                               (i + 1) * esz);               \
1353             continue;                                       \
1354         }                                                   \
1355         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1356         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1357     }                                                       \
1358     env->vstart = 0;                                        \
1359     /* set tail elements to 1s */                           \
1360     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1361 }
1362 
1363 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1364 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1367 
1368 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1369 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1370 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1371 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1372 
1373 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1374 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1375 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1376 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1377 
1378 /* Vector Narrowing Integer Right Shift Instructions */
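/*
 * The narrowing shifts reuse the generators above with a source type
 * twice as wide as the destination and a shift mask of 2 * SEW - 1, so
 * up to 2 * SEW - 1 bits of the wide source can be shifted out; the
 * store into the narrow destination type performs the final truncation.
 */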
1379 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1380 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1381 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1382 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1383 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1384 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1385 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1386 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1387 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1388 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1389 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1390 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1391 
1392 /* Vector Integer Comparison Instructions */
1393 #define DO_MSEQ(N, M) (N == M)
1394 #define DO_MSNE(N, M) (N != M)
1395 #define DO_MSLT(N, M) (N < M)
1396 #define DO_MSLE(N, M) (N <= M)
1397 #define DO_MSGT(N, M) (N > M)
1398 
1399 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1400 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1401                   CPURISCVState *env, uint32_t desc)          \
1402 {                                                             \
1403     uint32_t vm = vext_vm(desc);                              \
1404     uint32_t vl = env->vl;                                    \
1405     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1406     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1407     uint32_t i;                                               \
1408                                                               \
1409     for (i = env->vstart; i < vl; i++) {                      \
1410         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1411         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1412         if (!vm && !vext_elem_mask(v0, i)) {                  \
1413             continue;                                         \
1414         }                                                     \
1415         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1416     }                                                         \
1417     env->vstart = 0;                                          \
1418     /* mask destination registers are always tail-agnostic */ \
1419     /* set tail elements to 1s */                             \
1420     if (vta_all_1s) {                                         \
1421         for (; i < total_elems; i++) {                        \
1422             vext_set_elem_mask(vd, i, 1);                     \
1423         }                                                     \
1424     }                                                         \
1425 }
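/*
 * Comparisons produce one mask bit per element.  Masked-off elements
 * are skipped, leaving their destination bit undisturbed, which is a
 * legal implementation of either mask policy; the tail of a
 * mask-producing instruction is always agnostic, so under the all-1s
 * policy the remaining bits up to VLEN are set.
 */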
1426 
1427 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1428 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1429 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1430 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1431 
1432 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1433 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1434 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1435 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1436 
1437 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1438 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1439 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1440 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1441 
1442 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1443 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1444 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1445 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1446 
1447 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1448 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1449 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1450 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1451 
1452 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1453 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1454 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1455 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1456 
1457 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1458 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1459                   CPURISCVState *env, uint32_t desc)                \
1460 {                                                                   \
1461     uint32_t vm = vext_vm(desc);                                    \
1462     uint32_t vl = env->vl;                                          \
1463     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1464     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1465     uint32_t i;                                                     \
1466                                                                     \
1467     for (i = env->vstart; i < vl; i++) {                            \
1468         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1469         if (!vm && !vext_elem_mask(v0, i)) {                        \
1470             continue;                                               \
1471         }                                                           \
1472         vext_set_elem_mask(vd, i,                                   \
1473                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1474     }                                                               \
1475     env->vstart = 0;                                                \
1476     /* mask destination registers are always tail-agnostic */       \
1477     /* set tail elements to 1s */                                   \
1478     if (vta_all_1s) {                                               \
1479         for (; i < total_elems; i++) {                              \
1480             vext_set_elem_mask(vd, i, 1);                           \
1481         }                                                           \
1482     }                                                               \
1483 }
1484 
1485 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1486 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1487 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1488 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1489 
1490 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1491 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1492 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1493 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1494 
1495 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1496 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1497 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1498 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1499 
1500 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1501 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1502 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1503 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1504 
1505 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1506 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1507 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1508 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1509 
1510 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1511 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1512 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1513 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1514 
1515 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1516 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1517 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1518 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1519 
1520 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1521 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1522 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1523 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1524 
1525 /* Vector Integer Min/Max Instructions */
1526 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1527 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1528 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1529 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1530 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1531 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1532 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1533 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1534 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1535 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1536 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1537 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1538 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1539 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1540 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1541 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1542 GEN_VEXT_VV(vminu_vv_b, 1)
1543 GEN_VEXT_VV(vminu_vv_h, 2)
1544 GEN_VEXT_VV(vminu_vv_w, 4)
1545 GEN_VEXT_VV(vminu_vv_d, 8)
1546 GEN_VEXT_VV(vmin_vv_b, 1)
1547 GEN_VEXT_VV(vmin_vv_h, 2)
1548 GEN_VEXT_VV(vmin_vv_w, 4)
1549 GEN_VEXT_VV(vmin_vv_d, 8)
1550 GEN_VEXT_VV(vmaxu_vv_b, 1)
1551 GEN_VEXT_VV(vmaxu_vv_h, 2)
1552 GEN_VEXT_VV(vmaxu_vv_w, 4)
1553 GEN_VEXT_VV(vmaxu_vv_d, 8)
1554 GEN_VEXT_VV(vmax_vv_b, 1)
1555 GEN_VEXT_VV(vmax_vv_h, 2)
1556 GEN_VEXT_VV(vmax_vv_w, 4)
1557 GEN_VEXT_VV(vmax_vv_d, 8)
1558 
1559 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1560 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1561 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1562 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1563 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1564 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1565 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1566 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1567 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1568 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1569 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1570 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1571 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1572 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1573 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1574 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1575 GEN_VEXT_VX(vminu_vx_b, 1)
1576 GEN_VEXT_VX(vminu_vx_h, 2)
1577 GEN_VEXT_VX(vminu_vx_w, 4)
1578 GEN_VEXT_VX(vminu_vx_d, 8)
1579 GEN_VEXT_VX(vmin_vx_b, 1)
1580 GEN_VEXT_VX(vmin_vx_h, 2)
1581 GEN_VEXT_VX(vmin_vx_w, 4)
1582 GEN_VEXT_VX(vmin_vx_d, 8)
1583 GEN_VEXT_VX(vmaxu_vx_b, 1)
1584 GEN_VEXT_VX(vmaxu_vx_h, 2)
1585 GEN_VEXT_VX(vmaxu_vx_w, 4)
1586 GEN_VEXT_VX(vmaxu_vx_d, 8)
1587 GEN_VEXT_VX(vmax_vx_b, 1)
1588 GEN_VEXT_VX(vmax_vx_h, 2)
1589 GEN_VEXT_VX(vmax_vx_w, 4)
1590 GEN_VEXT_VX(vmax_vx_d, 8)
1591 
1592 /* Vector Single-Width Integer Multiply Instructions */
1593 #define DO_MUL(N, M) (N * M)
1594 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1595 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1596 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1597 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1598 GEN_VEXT_VV(vmul_vv_b, 1)
1599 GEN_VEXT_VV(vmul_vv_h, 2)
1600 GEN_VEXT_VV(vmul_vv_w, 4)
1601 GEN_VEXT_VV(vmul_vv_d, 8)
1602 
1603 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1604 {
1605     return (int16_t)s2 * (int16_t)s1 >> 8;
1606 }
1607 
1608 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1609 {
1610     return (int32_t)s2 * (int32_t)s1 >> 16;
1611 }
1612 
1613 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1614 {
1615     return (int64_t)s2 * (int64_t)s1 >> 32;
1616 }
1617 
1618 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1619 {
1620     uint64_t hi_64, lo_64;
1621 
1622     muls64(&lo_64, &hi_64, s1, s2);
1623     return hi_64;
1624 }
1625 
1626 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1627 {
1628     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1629 }
1630 
1631 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1632 {
1633     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1634 }
1635 
1636 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1637 {
1638     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1639 }
1640 
1641 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1642 {
1643     uint64_t hi_64, lo_64;
1644 
1645     mulu64(&lo_64, &hi_64, s2, s1);
1646     return hi_64;
1647 }
1648 
1649 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1650 {
1651     return (int16_t)s2 * (uint16_t)s1 >> 8;
1652 }
1653 
1654 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1655 {
1656     return (int32_t)s2 * (uint32_t)s1 >> 16;
1657 }
1658 
1659 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1660 {
1661     return (int64_t)s2 * (uint64_t)s1 >> 32;
1662 }
1663 
1664 /*
1665  * Let  A = signed operand,
1666  *      B = unsigned operand,
1667  *      P = mulu64(A, B), the product of A's bit pattern and B, and
1668  *      SP = A * B, the desired signed-by-unsigned product.
1669  *
1670  * IF A < 0, mulu64 sees A's bit pattern as A + 2 ** 64, so
1671  *      P  = (A + 2 ** 64) * B
1672  *         = A * B + 2 ** 64 * B
1673  *      SP = A * B
1674  *         = P - 2 ** 64 * B
1675  *      and the high 64 bits of SP are HI(P) - B.
1676  * ELSE
1677  *      SP = P
1678  *
1679  * THEREFORE
1680  *      HI_P -= (A < 0 ? B : 0)
1681  */
1682 
1683 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1684 {
1685     uint64_t hi_64, lo_64;
1686 
1687     mulu64(&lo_64, &hi_64, s2, s1);
1688 
1689     hi_64 -= s2 < 0 ? s1 : 0;
1690     return hi_64;
1691 }
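/*
 * Sanity check of the identity above in 8-bit terms: A = -1 (bits 0xff),
 * B = 2.  The unsigned product is P = 0xff * 2 = 0x1fe, so HI(P) = 1.
 * The true signed product is -2 = 0xfffe, whose high byte is
 * 0xff = HI(P) - B = 1 - 2 (mod 2 ** 8).
 */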
1692 
1693 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1694 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1695 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1696 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1697 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1698 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1699 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1700 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1701 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1702 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1703 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1704 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1705 GEN_VEXT_VV(vmulh_vv_b, 1)
1706 GEN_VEXT_VV(vmulh_vv_h, 2)
1707 GEN_VEXT_VV(vmulh_vv_w, 4)
1708 GEN_VEXT_VV(vmulh_vv_d, 8)
1709 GEN_VEXT_VV(vmulhu_vv_b, 1)
1710 GEN_VEXT_VV(vmulhu_vv_h, 2)
1711 GEN_VEXT_VV(vmulhu_vv_w, 4)
1712 GEN_VEXT_VV(vmulhu_vv_d, 8)
1713 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1714 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1715 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1716 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1717 
1718 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1719 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1720 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1721 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1722 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1723 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1724 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1725 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1726 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1727 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1728 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1729 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1730 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1731 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1732 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1733 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1734 GEN_VEXT_VX(vmul_vx_b, 1)
1735 GEN_VEXT_VX(vmul_vx_h, 2)
1736 GEN_VEXT_VX(vmul_vx_w, 4)
1737 GEN_VEXT_VX(vmul_vx_d, 8)
1738 GEN_VEXT_VX(vmulh_vx_b, 1)
1739 GEN_VEXT_VX(vmulh_vx_h, 2)
1740 GEN_VEXT_VX(vmulh_vx_w, 4)
1741 GEN_VEXT_VX(vmulh_vx_d, 8)
1742 GEN_VEXT_VX(vmulhu_vx_b, 1)
1743 GEN_VEXT_VX(vmulhu_vx_h, 2)
1744 GEN_VEXT_VX(vmulhu_vx_w, 4)
1745 GEN_VEXT_VX(vmulhu_vx_d, 8)
1746 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1747 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1748 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1749 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1750 
1751 /* Vector Integer Divide Instructions */
1752 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1753 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1754 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1755         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1756 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1757         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
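/*
 * These follow the scalar M-extension conventions: division by zero
 * yields an all-ones quotient and returns the dividend as remainder,
 * and the signed overflow case (most negative value divided by -1)
 * yields the dividend as quotient with remainder 0.  The (N == -N)
 * test relies on two's-complement wraparound and is true only for 0
 * and the most negative value of N's type, so together with M == -1 it
 * singles out that overflow case; N == 0 is harmless here since
 * 0 / -1 == 0 == N anyway.
 */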
1758 
1759 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1760 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1761 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1762 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1763 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1764 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1765 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1766 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1767 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1768 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1769 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1770 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1771 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1772 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1773 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1774 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1775 GEN_VEXT_VV(vdivu_vv_b, 1)
1776 GEN_VEXT_VV(vdivu_vv_h, 2)
1777 GEN_VEXT_VV(vdivu_vv_w, 4)
1778 GEN_VEXT_VV(vdivu_vv_d, 8)
1779 GEN_VEXT_VV(vdiv_vv_b, 1)
1780 GEN_VEXT_VV(vdiv_vv_h, 2)
1781 GEN_VEXT_VV(vdiv_vv_w, 4)
1782 GEN_VEXT_VV(vdiv_vv_d, 8)
1783 GEN_VEXT_VV(vremu_vv_b, 1)
1784 GEN_VEXT_VV(vremu_vv_h, 2)
1785 GEN_VEXT_VV(vremu_vv_w, 4)
1786 GEN_VEXT_VV(vremu_vv_d, 8)
1787 GEN_VEXT_VV(vrem_vv_b, 1)
1788 GEN_VEXT_VV(vrem_vv_h, 2)
1789 GEN_VEXT_VV(vrem_vv_w, 4)
1790 GEN_VEXT_VV(vrem_vv_d, 8)
1791 
1792 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1793 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1794 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1795 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1796 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1797 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1798 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1799 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1800 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1801 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1802 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1803 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1804 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1805 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1806 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1807 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1808 GEN_VEXT_VX(vdivu_vx_b, 1)
1809 GEN_VEXT_VX(vdivu_vx_h, 2)
1810 GEN_VEXT_VX(vdivu_vx_w, 4)
1811 GEN_VEXT_VX(vdivu_vx_d, 8)
1812 GEN_VEXT_VX(vdiv_vx_b, 1)
1813 GEN_VEXT_VX(vdiv_vx_h, 2)
1814 GEN_VEXT_VX(vdiv_vx_w, 4)
1815 GEN_VEXT_VX(vdiv_vx_d, 8)
1816 GEN_VEXT_VX(vremu_vx_b, 1)
1817 GEN_VEXT_VX(vremu_vx_h, 2)
1818 GEN_VEXT_VX(vremu_vx_w, 4)
1819 GEN_VEXT_VX(vremu_vx_d, 8)
1820 GEN_VEXT_VX(vrem_vx_b, 1)
1821 GEN_VEXT_VX(vrem_vx_h, 2)
1822 GEN_VEXT_VX(vrem_vx_w, 4)
1823 GEN_VEXT_VX(vrem_vx_d, 8)
1824 
1825 /* Vector Widening Integer Multiply Instructions */
1826 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1827 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1828 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1829 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1830 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1831 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1832 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1833 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1834 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1835 GEN_VEXT_VV(vwmul_vv_b, 2)
1836 GEN_VEXT_VV(vwmul_vv_h, 4)
1837 GEN_VEXT_VV(vwmul_vv_w, 8)
1838 GEN_VEXT_VV(vwmulu_vv_b, 2)
1839 GEN_VEXT_VV(vwmulu_vv_h, 4)
1840 GEN_VEXT_VV(vwmulu_vv_w, 8)
1841 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1842 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1843 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1844 
1845 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1846 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1847 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1848 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1849 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1850 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1851 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1852 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1853 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1854 GEN_VEXT_VX(vwmul_vx_b, 2)
1855 GEN_VEXT_VX(vwmul_vx_h, 4)
1856 GEN_VEXT_VX(vwmul_vx_w, 8)
1857 GEN_VEXT_VX(vwmulu_vx_b, 2)
1858 GEN_VEXT_VX(vwmulu_vx_h, 4)
1859 GEN_VEXT_VX(vwmulu_vx_w, 8)
1860 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1861 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1862 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1863 
1864 /* Vector Single-Width Integer Multiply-Add Instructions */
1865 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1866 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1867 {                                                                  \
1868     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1869     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1870     TD d = *((TD *)vd + HD(i));                                    \
1871     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1872 }
1873 
1874 #define DO_MACC(N, M, D) (M * N + D)
1875 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1876 #define DO_MADD(N, M, D) (M * D + N)
1877 #define DO_NMSUB(N, M, D) (-(M * D) + N)
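/*
 * In the ternary ops below N is vs2, M is vs1 (or rs1 for the _vx
 * forms) and D is the old vd value, so DO_MACC/DO_NMSAC accumulate
 * into vd, while DO_MADD/DO_NMSUB multiply the old vd by vs1 and add
 * vs2 to the (possibly negated) product, matching
 * vmacc/vnmsac/vmadd/vnmsub.
 */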
1878 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1879 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1880 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1881 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1882 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1883 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1884 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1885 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1886 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1887 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1888 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1889 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1890 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1891 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1892 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1893 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1894 GEN_VEXT_VV(vmacc_vv_b, 1)
1895 GEN_VEXT_VV(vmacc_vv_h, 2)
1896 GEN_VEXT_VV(vmacc_vv_w, 4)
1897 GEN_VEXT_VV(vmacc_vv_d, 8)
1898 GEN_VEXT_VV(vnmsac_vv_b, 1)
1899 GEN_VEXT_VV(vnmsac_vv_h, 2)
1900 GEN_VEXT_VV(vnmsac_vv_w, 4)
1901 GEN_VEXT_VV(vnmsac_vv_d, 8)
1902 GEN_VEXT_VV(vmadd_vv_b, 1)
1903 GEN_VEXT_VV(vmadd_vv_h, 2)
1904 GEN_VEXT_VV(vmadd_vv_w, 4)
1905 GEN_VEXT_VV(vmadd_vv_d, 8)
1906 GEN_VEXT_VV(vnmsub_vv_b, 1)
1907 GEN_VEXT_VV(vnmsub_vv_h, 2)
1908 GEN_VEXT_VV(vnmsub_vv_w, 4)
1909 GEN_VEXT_VV(vnmsub_vv_d, 8)
1910 
1911 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1912 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1913 {                                                                   \
1914     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1915     TD d = *((TD *)vd + HD(i));                                     \
1916     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1917 }
1918 
1919 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1920 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1921 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1922 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1923 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1924 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1925 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1926 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1927 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1928 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1929 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1930 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1931 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1932 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1933 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1934 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1935 GEN_VEXT_VX(vmacc_vx_b, 1)
1936 GEN_VEXT_VX(vmacc_vx_h, 2)
1937 GEN_VEXT_VX(vmacc_vx_w, 4)
1938 GEN_VEXT_VX(vmacc_vx_d, 8)
1939 GEN_VEXT_VX(vnmsac_vx_b, 1)
1940 GEN_VEXT_VX(vnmsac_vx_h, 2)
1941 GEN_VEXT_VX(vnmsac_vx_w, 4)
1942 GEN_VEXT_VX(vnmsac_vx_d, 8)
1943 GEN_VEXT_VX(vmadd_vx_b, 1)
1944 GEN_VEXT_VX(vmadd_vx_h, 2)
1945 GEN_VEXT_VX(vmadd_vx_w, 4)
1946 GEN_VEXT_VX(vmadd_vx_d, 8)
1947 GEN_VEXT_VX(vnmsub_vx_b, 1)
1948 GEN_VEXT_VX(vnmsub_vx_h, 2)
1949 GEN_VEXT_VX(vnmsub_vx_w, 4)
1950 GEN_VEXT_VX(vnmsub_vx_d, 8)
1951 
1952 /* Vector Widening Integer Multiply-Add Instructions */
1953 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1954 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1955 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1956 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1957 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1958 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1959 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1960 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1961 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1962 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1963 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1964 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1965 GEN_VEXT_VV(vwmacc_vv_b, 2)
1966 GEN_VEXT_VV(vwmacc_vv_h, 4)
1967 GEN_VEXT_VV(vwmacc_vv_w, 8)
1968 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1969 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1970 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1971 
1972 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1973 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1974 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1975 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1976 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1977 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1978 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1979 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1980 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1981 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1982 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1983 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1984 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1985 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1986 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1987 GEN_VEXT_VX(vwmacc_vx_b, 2)
1988 GEN_VEXT_VX(vwmacc_vx_h, 4)
1989 GEN_VEXT_VX(vwmacc_vx_w, 8)
1990 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1991 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1992 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1993 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1994 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1995 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1996 
1997 /* Vector Integer Merge and Move Instructions */
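/*
 * vmv.v.v and vmv.v.x are encoded as vmerge with vm = 1, so their
 * helpers take no mask operand and copy every body element
 * unconditionally, whereas the vmerge helpers below use v0 per element
 * to select between vs2 (bit clear) and vs1/rs1 (bit set).
 */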
1998 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1999 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2000                   uint32_t desc)                                     \
2001 {                                                                    \
2002     uint32_t vl = env->vl;                                           \
2003     uint32_t esz = sizeof(ETYPE);                                    \
2004     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2005     uint32_t vta = vext_vta(desc);                                   \
2006     uint32_t i;                                                      \
2007                                                                      \
2008     for (i = env->vstart; i < vl; i++) {                             \
2009         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2010         *((ETYPE *)vd + H(i)) = s1;                                  \
2011     }                                                                \
2012     env->vstart = 0;                                                 \
2013     /* set tail elements to 1s */                                    \
2014     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2015 }
2016 
2017 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2018 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2019 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2020 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2021 
2022 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2023 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2024                   uint32_t desc)                                     \
2025 {                                                                    \
2026     uint32_t vl = env->vl;                                           \
2027     uint32_t esz = sizeof(ETYPE);                                    \
2028     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2029     uint32_t vta = vext_vta(desc);                                   \
2030     uint32_t i;                                                      \
2031                                                                      \
2032     for (i = env->vstart; i < vl; i++) {                             \
2033         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2034     }                                                                \
2035     env->vstart = 0;                                                 \
2036     /* set tail elements to 1s */                                    \
2037     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2038 }
2039 
2040 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2041 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2042 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2043 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2044 
2045 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2046 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2047                   CPURISCVState *env, uint32_t desc)                 \
2048 {                                                                    \
2049     uint32_t vl = env->vl;                                           \
2050     uint32_t esz = sizeof(ETYPE);                                    \
2051     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2052     uint32_t vta = vext_vta(desc);                                   \
2053     uint32_t i;                                                      \
2054                                                                      \
2055     for (i = env->vstart; i < vl; i++) {                             \
2056         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2057         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2058     }                                                                \
2059     env->vstart = 0;                                                 \
2060     /* set tail elements to 1s */                                    \
2061     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2062 }
2063 
2064 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2065 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2066 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2067 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2068 
2069 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2070 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2071                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2072 {                                                                    \
2073     uint32_t vl = env->vl;                                           \
2074     uint32_t esz = sizeof(ETYPE);                                    \
2075     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2076     uint32_t vta = vext_vta(desc);                                   \
2077     uint32_t i;                                                      \
2078                                                                      \
2079     for (i = env->vstart; i < vl; i++) {                             \
2080         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2081         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2082                    (ETYPE)(target_long)s1);                          \
2083         *((ETYPE *)vd + H(i)) = d;                                   \
2084     }                                                                \
2085     env->vstart = 0;                                                 \
2086     /* set tail elements to 1s */                                    \
2087     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2088 }
2089 
2090 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2091 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2092 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2093 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2094 
2095 /*
2096  *** Vector Fixed-Point Arithmetic Instructions
2097  */
2098 
2099 /* Vector Single-Width Saturating Add and Subtract */
2100 
2101 /*
2102  * Fixed-point instructions carry a rounding mode and may saturate, so
2103  * define the common macros for fixed-point helpers here.
2104  */
2105 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2106                           CPURISCVState *env, int vxrm);
2107 
2108 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2109 static inline void                                                  \
2110 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2111           CPURISCVState *env, int vxrm)                             \
2112 {                                                                   \
2113     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2114     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2115     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2116 }
2117 
2118 static inline void
2119 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2120              CPURISCVState *env,
2121              uint32_t vl, uint32_t vm, int vxrm,
2122              opivv2_rm_fn *fn)
2123 {
2124     for (uint32_t i = env->vstart; i < vl; i++) {
2125         if (!vm && !vext_elem_mask(v0, i)) {
2126             continue;
2127         }
2128         fn(vd, vs1, vs2, i, env, vxrm);
2129     }
2130     env->vstart = 0;
2131 }
2132 
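/*
 * vext_vv_rm_2 reads the rounding mode from env->vxrm once and then
 * calls vext_vv_rm_1 with it as a literal constant, so the per-element
 * helper always receives an explicit vxrm argument.
 */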
2133 static inline void
2134 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2135              CPURISCVState *env,
2136              uint32_t desc,
2137              opivv2_rm_fn *fn, uint32_t esz)
2138 {
2139     uint32_t vm = vext_vm(desc);
2140     uint32_t vl = env->vl;
2141     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2142     uint32_t vta = vext_vta(desc);
2143 
2144     switch (env->vxrm) {
2145     case 0: /* rnu */
2146         vext_vv_rm_1(vd, v0, vs1, vs2,
2147                      env, vl, vm, 0, fn);
2148         break;
2149     case 1: /* rne */
2150         vext_vv_rm_1(vd, v0, vs1, vs2,
2151                      env, vl, vm, 1, fn);
2152         break;
2153     case 2: /* rdn */
2154         vext_vv_rm_1(vd, v0, vs1, vs2,
2155                      env, vl, vm, 2, fn);
2156         break;
2157     default: /* rod */
2158         vext_vv_rm_1(vd, v0, vs1, vs2,
2159                      env, vl, vm, 3, fn);
2160         break;
2161     }
2162     /* set tail elements to 1s */
2163     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2164 }
2165 
2166 /* generate helpers for fixed point instructions with OPIVV format */
2167 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2168 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2169                   CPURISCVState *env, uint32_t desc)            \
2170 {                                                               \
2171     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2172                  do_##NAME, ESZ);                               \
2173 }
2174 
2175 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2176 {
2177     uint8_t res = a + b;
2178     if (res < a) {
2179         res = UINT8_MAX;
2180         env->vxsat = 0x1;
2181     }
2182     return res;
2183 }
2184 
2185 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2186                                uint16_t b)
2187 {
2188     uint16_t res = a + b;
2189     if (res < a) {
2190         res = UINT16_MAX;
2191         env->vxsat = 0x1;
2192     }
2193     return res;
2194 }
2195 
2196 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2197                                uint32_t b)
2198 {
2199     uint32_t res = a + b;
2200     if (res < a) {
2201         res = UINT32_MAX;
2202         env->vxsat = 0x1;
2203     }
2204     return res;
2205 }
2206 
2207 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2208                                uint64_t b)
2209 {
2210     uint64_t res = a + b;
2211     if (res < a) {
2212         res = UINT64_MAX;
2213         env->vxsat = 0x1;
2214     }
2215     return res;
2216 }
2217 
2218 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2219 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2220 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2221 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2222 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2223 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2224 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2225 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2226 
2227 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2228                           CPURISCVState *env, int vxrm);
2229 
2230 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2231 static inline void                                                  \
2232 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2233           CPURISCVState *env, int vxrm)                             \
2234 {                                                                   \
2235     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2236     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2237 }
2238 
2239 static inline void
2240 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2241              CPURISCVState *env,
2242              uint32_t vl, uint32_t vm, int vxrm,
2243              opivx2_rm_fn *fn)
2244 {
2245     for (uint32_t i = env->vstart; i < vl; i++) {
2246         if (!vm && !vext_elem_mask(v0, i)) {
2247             continue;
2248         }
2249         fn(vd, s1, vs2, i, env, vxrm);
2250     }
2251     env->vstart = 0;
2252 }
2253 
2254 static inline void
2255 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2256              CPURISCVState *env,
2257              uint32_t desc,
2258              opivx2_rm_fn *fn, uint32_t esz)
2259 {
2260     uint32_t vm = vext_vm(desc);
2261     uint32_t vl = env->vl;
2262     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2263     uint32_t vta = vext_vta(desc);
2264 
2265     switch (env->vxrm) {
2266     case 0: /* rnu */
2267         vext_vx_rm_1(vd, v0, s1, vs2,
2268                      env, vl, vm, 0, fn);
2269         break;
2270     case 1: /* rne */
2271         vext_vx_rm_1(vd, v0, s1, vs2,
2272                      env, vl, vm, 1, fn);
2273         break;
2274     case 2: /* rdn */
2275         vext_vx_rm_1(vd, v0, s1, vs2,
2276                      env, vl, vm, 2, fn);
2277         break;
2278     default: /* rod */
2279         vext_vx_rm_1(vd, v0, s1, vs2,
2280                      env, vl, vm, 3, fn);
2281         break;
2282     }
2283     /* set tail elements to 1s */
2284     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2285 }
2286 
2287 /* generate helpers for fixed point instructions with OPIVX format */
2288 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2289 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2290         void *vs2, CPURISCVState *env, uint32_t desc)     \
2291 {                                                         \
2292     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2293                  do_##NAME, ESZ);                         \
2294 }
2295 
2296 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2297 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2298 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2299 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2300 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2301 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2302 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2303 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2304 
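/*
 * Signed overflow on addition can only happen when both operands have
 * the same sign and the result's sign differs, i.e. when the sign bit
 * of (res ^ a) and of (res ^ b) are both set, which is what the
 * (res ^ a) & (res ^ b) & INT*_MIN test below checks.
 */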
2305 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2306 {
2307     int8_t res = a + b;
2308     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2309         res = a > 0 ? INT8_MAX : INT8_MIN;
2310         env->vxsat = 0x1;
2311     }
2312     return res;
2313 }
2314 
2315 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2316 {
2317     int16_t res = a + b;
2318     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2319         res = a > 0 ? INT16_MAX : INT16_MIN;
2320         env->vxsat = 0x1;
2321     }
2322     return res;
2323 }
2324 
2325 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2326 {
2327     int32_t res = a + b;
2328     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2329         res = a > 0 ? INT32_MAX : INT32_MIN;
2330         env->vxsat = 0x1;
2331     }
2332     return res;
2333 }
2334 
2335 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2336 {
2337     int64_t res = a + b;
2338     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2339         res = a > 0 ? INT64_MAX : INT64_MIN;
2340         env->vxsat = 0x1;
2341     }
2342     return res;
2343 }
2344 
2345 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2346 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2347 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2348 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2349 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2350 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2351 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2352 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2353 
2354 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2355 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2356 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2357 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2358 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2359 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2360 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2361 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2362 
2363 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2364 {
2365     uint8_t res = a - b;
2366     if (res > a) {
2367         res = 0;
2368         env->vxsat = 0x1;
2369     }
2370     return res;
2371 }
2372 
2373 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2374                                uint16_t b)
2375 {
2376     uint16_t res = a - b;
2377     if (res > a) {
2378         res = 0;
2379         env->vxsat = 0x1;
2380     }
2381     return res;
2382 }
2383 
2384 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2385                                uint32_t b)
2386 {
2387     uint32_t res = a - b;
2388     if (res > a) {
2389         res = 0;
2390         env->vxsat = 0x1;
2391     }
2392     return res;
2393 }
2394 
2395 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2396                                uint64_t b)
2397 {
2398     uint64_t res = a - b;
2399     if (res > a) {
2400         res = 0;
2401         env->vxsat = 0x1;
2402     }
2403     return res;
2404 }
2405 
2406 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2407 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2408 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2409 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2410 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2411 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2412 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2413 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2414 
2415 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2416 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2417 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2418 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2419 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2420 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2421 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2422 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2423 
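/*
 * Signed overflow on subtraction requires the operands to have
 * different signs and the result's sign to differ from a's, which is
 * what the (res ^ a) & (a ^ b) & INT*_MIN test below checks.
 */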
2424 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2425 {
2426     int8_t res = a - b;
2427     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2428         res = a >= 0 ? INT8_MAX : INT8_MIN;
2429         env->vxsat = 0x1;
2430     }
2431     return res;
2432 }
2433 
2434 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2435 {
2436     int16_t res = a - b;
2437     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2438         res = a >= 0 ? INT16_MAX : INT16_MIN;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2445 {
2446     int32_t res = a - b;
2447     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2448         res = a >= 0 ? INT32_MAX : INT32_MIN;
2449         env->vxsat = 0x1;
2450     }
2451     return res;
2452 }
2453 
2454 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2455 {
2456     int64_t res = a - b;
2457     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2458         res = a >= 0 ? INT64_MAX : INT64_MIN;
2459         env->vxsat = 0x1;
2460     }
2461     return res;
2462 }
2463 
2464 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2465 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2466 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2467 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2468 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2469 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2470 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2471 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2472 
2473 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2474 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2475 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2476 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2477 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2478 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2479 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2480 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2481 
2482 /* Vector Single-Width Averaging Add and Subtract */
2483 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2484 {
2485     uint8_t d = extract64(v, shift, 1);
2486     uint8_t d1;
2487     uint64_t D1, D2;
2488 
2489     if (shift == 0 || shift > 64) {
2490         return 0;
2491     }
2492 
2493     d1 = extract64(v, shift - 1, 1);
2494     D1 = extract64(v, 0, shift);
2495     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2496         return d1;
2497     } else if (vxrm == 1) { /* round-to-nearest-even */
2498         if (shift > 1) {
2499             D2 = extract64(v, 0, shift - 1);
2500             return d1 & ((D2 != 0) | d);
2501         } else {
2502             return d1 & d;
2503         }
2504     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2505         return !d & (D1 != 0);
2506     }
2507     return 0; /* round-down (truncate) */
2508 }
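/*
 * Worked example: rounding v = 0b1011 (11) right by shift = 2 gives a
 * truncated result of 0b10 (2) with discarded bits D1 = 0b11, so
 * d = 0, d1 = 1, D2 = 0b1:
 *   rnu: d1 = 1                   -> 2 + 1 = 3   (11 / 4 = 2.75)
 *   rne: d1 & ((D2 != 0) | d) = 1 -> 2 + 1 = 3
 *   rdn: 0                        -> 2
 *   rod: !d & (D1 != 0) = 1       -> 2 + 1 = 3   (LSB is 0, remainder non-zero)
 */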
2509 
2510 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2511 {
2512     int64_t res = (int64_t)a + b;
2513     uint8_t round = get_round(vxrm, res, 1);
2514 
2515     return (res >> 1) + round;
2516 }
2517 
2518 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2519 {
2520     int64_t res = a + b;
2521     uint8_t round = get_round(vxrm, res, 1);
2522     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2523 
2524     /* With signed overflow, bit 64 is inverse of bit 63. */
2525     return ((res >> 1) ^ over) + round;
2526 }
2527 
2528 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2529 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2530 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2531 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2532 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2533 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2534 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2535 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2536 
2537 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2538 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2539 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2540 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2541 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2542 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2543 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2544 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2545 
2546 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2547                                uint32_t a, uint32_t b)
2548 {
2549     uint64_t res = (uint64_t)a + b;
2550     uint8_t round = get_round(vxrm, res, 1);
2551 
2552     return (res >> 1) + round;
2553 }
2554 
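/*
 * The 64-bit unsigned sum needs 65 bits; the carry-out (res < a) is
 * OR-ed back in as bit 63 of the halved result.
 */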
2555 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2556                                uint64_t a, uint64_t b)
2557 {
2558     uint64_t res = a + b;
2559     uint8_t round = get_round(vxrm, res, 1);
2560     uint64_t over = (uint64_t)(res < a) << 63;
2561 
2562     return ((res >> 1) | over) + round;
2563 }
2564 
2565 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2566 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2567 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2568 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2569 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2570 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2571 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2572 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2573 
2574 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2575 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2576 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2577 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2578 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2579 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2580 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2581 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2582 
2583 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2584 {
2585     int64_t res = (int64_t)a - b;
2586     uint8_t round = get_round(vxrm, res, 1);
2587 
2588     return (res >> 1) + round;
2589 }
2590 
2591 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2592 {
2593     int64_t res = (int64_t)a - b;
2594     uint8_t round = get_round(vxrm, res, 1);
2595     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2596 
2597     /* With signed overflow, bit 64 is inverse of bit 63. */
2598     return ((res >> 1) ^ over) + round;
2599 }
2600 
2601 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2602 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2603 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2604 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2605 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2606 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2607 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2608 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2609 
2610 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2611 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2612 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2613 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2614 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2615 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2616 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2617 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2618 
2619 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2620                                uint32_t a, uint32_t b)
2621 {
2622     int64_t res = (int64_t)a - b;
2623     uint8_t round = get_round(vxrm, res, 1);
2624 
2625     return (res >> 1) + round;
2626 }
2627 
2628 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2629                                uint64_t a, uint64_t b)
2630 {
2631     uint64_t res = (uint64_t)a - b;
2632     uint8_t round = get_round(vxrm, res, 1);
2633     uint64_t over = (uint64_t)(res > a) << 63;
2634 
2635     return ((res >> 1) | over) + round;
2636 }
2637 
2638 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2639 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2640 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2641 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2642 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2643 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2644 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2645 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2646 
2647 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2648 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2649 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2650 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2651 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2652 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2653 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2654 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2655 
2656 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2657 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2658 {
2659     uint8_t round;
2660     int16_t res;
2661 
2662     res = (int16_t)a * (int16_t)b;
2663     round = get_round(vxrm, res, 7);
2664     res   = (res >> 7) + round;
2665 
2666     if (res > INT8_MAX) {
2667         env->vxsat = 0x1;
2668         return INT8_MAX;
2669     } else if (res < INT8_MIN) {
2670         env->vxsat = 0x1;
2671         return INT8_MIN;
2672     } else {
2673         return res;
2674     }
2675 }
2676 
2677 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2678 {
2679     uint8_t round;
2680     int32_t res;
2681 
2682     res = (int32_t)a * (int32_t)b;
2683     round = get_round(vxrm, res, 15);
2684     res   = (res >> 15) + round;
2685 
2686     if (res > INT16_MAX) {
2687         env->vxsat = 0x1;
2688         return INT16_MAX;
2689     } else if (res < INT16_MIN) {
2690         env->vxsat = 0x1;
2691         return INT16_MIN;
2692     } else {
2693         return res;
2694     }
2695 }
2696 
2697 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2698 {
2699     uint8_t round;
2700     int64_t res;
2701 
2702     res = (int64_t)a * (int64_t)b;
2703     round = get_round(vxrm, res, 31);
2704     res   = (res >> 31) + round;
2705 
2706     if (res > INT32_MAX) {
2707         env->vxsat = 0x1;
2708         return INT32_MAX;
2709     } else if (res < INT32_MIN) {
2710         env->vxsat = 0x1;
2711         return INT32_MIN;
2712     } else {
2713         return res;
2714     }
2715 }
2716 
2717 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2718 {
2719     uint8_t round;
2720     uint64_t hi_64, lo_64;
2721     int64_t res;
2722 
2723     if (a == INT64_MIN && b == INT64_MIN) {
2724         env->vxsat = 1;
2725         return INT64_MAX;
2726     }
2727 
2728     muls64(&lo_64, &hi_64, a, b);
2729     round = get_round(vxrm, lo_64, 63);
2730     /*
2731      * Cannot overflow, as there are always
2732      * 2 sign bits after multiply.
2733      */
2734     res = (hi_64 << 1) | (lo_64 >> 63);
2735     if (round) {
2736         if (res == INT64_MAX) {
2737             env->vxsat = 1;
2738         } else {
2739             res += 1;
2740         }
2741     }
2742     return res;
2743 }
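/*
 * Sketch of the fixed-point semantics (illustrative, not from the source):
 * vsmul treats the operands as signed fixed-point values with SEW-1
 * fraction bits, so the 2*SEW product is shifted right by SEW-1 with
 * rounding.  E.g. vsmul8(0x40, 0x40), i.e. "0.5 * 0.5":
 *
 *   res = 0x40 * 0x40 = 0x1000; round = 0; (res >> 7) + 0 = 0x20 ("0.25")
 *
 * The only product that cannot be represented is (-1.0) * (-1.0), which is
 * why vsmul8..vsmul32 saturate after the shift and vsmul64 special-cases
 * INT64_MIN * INT64_MIN before calling muls64().
 */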
2744 
2745 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2746 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2747 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2748 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2749 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2750 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2751 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2752 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2753 
2754 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2755 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2756 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2757 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2758 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2759 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2760 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2761 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2762 
2763 /* Vector Single-Width Scaling Shift Instructions */
2764 static inline uint8_t
2765 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2766 {
2767     uint8_t round, shift = b & 0x7;
2768     uint8_t res;
2769 
2770     round = get_round(vxrm, a, shift);
2771     res   = (a >> shift)  + round;
2772     return res;
2773 }
2774 static inline uint16_t
2775 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2776 {
2777     uint8_t round, shift = b & 0xf;
2778     uint16_t res;
2779 
2780     round = get_round(vxrm, a, shift);
2781     res   = (a >> shift)  + round;
2782     return res;
2783 }
2784 static inline uint32_t
2785 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2786 {
2787     uint8_t round, shift = b & 0x1f;
2788     uint32_t res;
2789 
2790     round = get_round(vxrm, a, shift);
2791     res   = (a >> shift)  + round;
2792     return res;
2793 }
2794 static inline uint64_t
2795 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2796 {
2797     uint8_t round, shift = b & 0x3f;
2798     uint64_t res;
2799 
2800     round = get_round(vxrm, a, shift);
2801     res   = (a >> shift)  + round;
2802     return res;
2803 }
2804 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2805 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2806 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2807 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2808 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2809 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2810 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2811 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2812 
2813 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2814 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2815 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2816 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2817 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2818 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2819 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2820 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2821 
2822 static inline int8_t
2823 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2824 {
2825     uint8_t round, shift = b & 0x7;
2826     int8_t res;
2827 
2828     round = get_round(vxrm, a, shift);
2829     res   = (a >> shift)  + round;
2830     return res;
2831 }
2832 static inline int16_t
2833 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2834 {
2835     uint8_t round, shift = b & 0xf;
2836     int16_t res;
2837 
2838     round = get_round(vxrm, a, shift);
2839     res   = (a >> shift)  + round;
2840     return res;
2841 }
2842 static inline int32_t
2843 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2844 {
2845     uint8_t round, shift = b & 0x1f;
2846     int32_t res;
2847 
2848     round = get_round(vxrm, a, shift);
2849     res   = (a >> shift)  + round;
2850     return res;
2851 }
2852 static inline int64_t
2853 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2854 {
2855     uint8_t round, shift = b & 0x3f;
2856     int64_t res;
2857 
2858     round = get_round(vxrm, a, shift);
2859     res   = (a >> shift)  + round;
2860     return res;
2861 }
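/*
 * Worked example for the scaling shifts (illustrative, not from the source):
 * vssra8(a = -7, b = 2) with vxrm = 0 (round-to-nearest-up):
 *
 *   shift = 2, a >> 2 = -2 (arithmetic shift of 0xF9 keeps the sign),
 *   round = bit 1 of a = 0  ->  result = -2, i.e. -7 / 4 = -1.75 -> -2.
 *
 * vssrl applies the same rounding to a logical shift, so the unsigned
 * pattern 0xF9 >> 2 gives 0x3E instead.
 */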
2862 
2863 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2864 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2865 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2866 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2867 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2868 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2869 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2870 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2871 
2872 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2873 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2874 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2875 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2876 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2877 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2878 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2879 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2880 
2881 /* Vector Narrowing Fixed-Point Clip Instructions */
2882 static inline int8_t
2883 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2884 {
2885     uint8_t round, shift = b & 0xf;
2886     int16_t res;
2887 
2888     round = get_round(vxrm, a, shift);
2889     res   = (a >> shift)  + round;
2890     if (res > INT8_MAX) {
2891         env->vxsat = 0x1;
2892         return INT8_MAX;
2893     } else if (res < INT8_MIN) {
2894         env->vxsat = 0x1;
2895         return INT8_MIN;
2896     } else {
2897         return res;
2898     }
2899 }
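/*
 * Worked example (illustrative, not from the source): vnclip8 narrows a
 * 2*SEW source to SEW with rounding and signed saturation.  With
 * a = 0x1234 (4660), shift = 4 and vxrm = 0 (rnu):
 *
 *   round = bit 3 of a = 0, (a >> 4) + 0 = 0x123 = 291 > INT8_MAX,
 *   so the result saturates to 127 and vxsat is set.
 *
 * A value such as a = 0x07f8 with shift = 4 instead rounds up to
 * 0x80 = 128 under rnu, which also saturates to 127.
 */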
2900 
2901 static inline int16_t
2902 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2903 {
2904     uint8_t round, shift = b & 0x1f;
2905     int32_t res;
2906 
2907     round = get_round(vxrm, a, shift);
2908     res   = (a >> shift)  + round;
2909     if (res > INT16_MAX) {
2910         env->vxsat = 0x1;
2911         return INT16_MAX;
2912     } else if (res < INT16_MIN) {
2913         env->vxsat = 0x1;
2914         return INT16_MIN;
2915     } else {
2916         return res;
2917     }
2918 }
2919 
2920 static inline int32_t
2921 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2922 {
2923     uint8_t round, shift = b & 0x3f;
2924     int64_t res;
2925 
2926     round = get_round(vxrm, a, shift);
2927     res   = (a >> shift)  + round;
2928     if (res > INT32_MAX) {
2929         env->vxsat = 0x1;
2930         return INT32_MAX;
2931     } else if (res < INT32_MIN) {
2932         env->vxsat = 0x1;
2933         return INT32_MIN;
2934     } else {
2935         return res;
2936     }
2937 }
2938 
2939 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2940 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2941 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2942 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2943 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2944 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2945 
2946 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2947 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2948 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2949 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2950 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2951 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2952 
2953 static inline uint8_t
2954 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2955 {
2956     uint8_t round, shift = b & 0xf;
2957     uint16_t res;
2958 
2959     round = get_round(vxrm, a, shift);
2960     res   = (a >> shift)  + round;
2961     if (res > UINT8_MAX) {
2962         env->vxsat = 0x1;
2963         return UINT8_MAX;
2964     } else {
2965         return res;
2966     }
2967 }
2968 
2969 static inline uint16_t
2970 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2971 {
2972     uint8_t round, shift = b & 0x1f;
2973     uint32_t res;
2974 
2975     round = get_round(vxrm, a, shift);
2976     res   = (a >> shift)  + round;
2977     if (res > UINT16_MAX) {
2978         env->vxsat = 0x1;
2979         return UINT16_MAX;
2980     } else {
2981         return res;
2982     }
2983 }
2984 
2985 static inline uint32_t
2986 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2987 {
2988     uint8_t round, shift = b & 0x3f;
2989     uint64_t res;
2990 
2991     round = get_round(vxrm, a, shift);
2992     res   = (a >> shift)  + round;
2993     if (res > UINT32_MAX) {
2994         env->vxsat = 0x1;
2995         return UINT32_MAX;
2996     } else {
2997         return res;
2998     }
2999 }
3000 
3001 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3002 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3003 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3004 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3005 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3006 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3007 
3008 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3009 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3010 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3011 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3012 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3013 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3014 
3015 /*
3016  *** Vector Floating-Point Arithmetic Instructions
3016  *** Vector Floating-Point Arithmetic Instructions
3017  */
3018 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3019 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3020 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3021                       CPURISCVState *env)                      \
3022 {                                                              \
3023     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3024     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3025     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3026 }
3027 
3028 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3029 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3030                   void *vs2, CPURISCVState *env,          \
3031                   uint32_t desc)                          \
3032 {                                                         \
3033     uint32_t vm = vext_vm(desc);                          \
3034     uint32_t vl = env->vl;                                \
3035     uint32_t total_elems =                                \
3036         vext_get_total_elems(env, desc, ESZ);             \
3037     uint32_t vta = vext_vta(desc);                        \
3038     uint32_t i;                                           \
3039                                                           \
3040     for (i = env->vstart; i < vl; i++) {                  \
3041         if (!vm && !vext_elem_mask(v0, i)) {              \
3042             continue;                                     \
3043         }                                                 \
3044         do_##NAME(vd, vs1, vs2, i, env);                  \
3045     }                                                     \
3046     env->vstart = 0;                                      \
3047     /* set tail elements to 1s */                         \
3048     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3049                       total_elems * ESZ);                 \
3050 }
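/*
 * Illustrative sketch (not generated code): OPFVV2(vfadd_vv_h, ...) plus
 * GEN_VEXT_VV_ENV(vfadd_vv_h, 2) expand to roughly the following, assuming
 * OP_UUU_H names uint16_t operand types as elsewhere in this file
 * (declarations of vm, vl, vta and total_elems elided):
 *
 *   void HELPER(vfadd_vv_h)(void *vd, void *v0, void *vs1, void *vs2,
 *                           CPURISCVState *env, uint32_t desc)
 *   {
 *       for (i = env->vstart; i < vl; i++) {
 *           if (vm || vext_elem_mask(v0, i)) {
 *               uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *               uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *               *((uint16_t *)vd + H2(i)) =
 *                   float16_add(s2, s1, &env->fp_status);
 *           }
 *       }
 *       env->vstart = 0;
 *       vext_set_elems_1s(vd, vta, vl * 2, total_elems * 2);
 *   }
 */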
3051 
3052 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3053 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3054 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3055 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3056 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3057 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3058 
3059 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3060 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3061                       CPURISCVState *env)                      \
3062 {                                                              \
3063     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3064     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3065 }
3066 
3067 #define GEN_VEXT_VF(NAME, ESZ)                            \
3068 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3069                   void *vs2, CPURISCVState *env,          \
3070                   uint32_t desc)                          \
3071 {                                                         \
3072     uint32_t vm = vext_vm(desc);                          \
3073     uint32_t vl = env->vl;                                \
3074     uint32_t total_elems =                                \
3075         vext_get_total_elems(env, desc, ESZ);             \
3076     uint32_t vta = vext_vta(desc);                        \
3077     uint32_t i;                                           \
3078                                                           \
3079     for (i = env->vstart; i < vl; i++) {                  \
3080         if (!vm && !vext_elem_mask(v0, i)) {              \
3081             continue;                                     \
3082         }                                                 \
3083         do_##NAME(vd, s1, vs2, i, env);                   \
3084     }                                                     \
3085     env->vstart = 0;                                      \
3086     /* set tail elements to 1s */                         \
3087     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3088                       total_elems * ESZ);                 \
3089 }
3090 
3091 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3092 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3093 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3094 GEN_VEXT_VF(vfadd_vf_h, 2)
3095 GEN_VEXT_VF(vfadd_vf_w, 4)
3096 GEN_VEXT_VF(vfadd_vf_d, 8)
3097 
3098 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3099 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3100 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3101 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3102 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3103 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3104 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3105 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3106 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3107 GEN_VEXT_VF(vfsub_vf_h, 2)
3108 GEN_VEXT_VF(vfsub_vf_w, 4)
3109 GEN_VEXT_VF(vfsub_vf_d, 8)
3110 
3111 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3112 {
3113     return float16_sub(b, a, s);
3114 }
3115 
3116 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3117 {
3118     return float32_sub(b, a, s);
3119 }
3120 
3121 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3122 {
3123     return float64_sub(b, a, s);
3124 }
3125 
3126 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3127 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3128 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3129 GEN_VEXT_VF(vfrsub_vf_h, 2)
3130 GEN_VEXT_VF(vfrsub_vf_w, 4)
3131 GEN_VEXT_VF(vfrsub_vf_d, 8)
3132 
3133 /* Vector Widening Floating-Point Add/Subtract Instructions */
3134 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3135 {
3136     return float32_add(float16_to_float32(a, true, s),
3137             float16_to_float32(b, true, s), s);
3138 }
3139 
3140 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3141 {
3142     return float64_add(float32_to_float64(a, s),
3143             float32_to_float64(b, s), s);
3144 
3145 }
3146 
3147 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3148 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3149 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3150 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3151 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3152 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3153 GEN_VEXT_VF(vfwadd_vf_h, 4)
3154 GEN_VEXT_VF(vfwadd_vf_w, 8)
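/*
 * Note (illustrative): the float16_to_float32()/float32_to_float64()
 * conversions used by the widening helpers are exact (every f16 value is
 * representable as an f32, and every f32 as an f64), so each widening
 * operation in this section rounds only once, at the destination precision.
 */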
3155 
3156 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3157 {
3158     return float32_sub(float16_to_float32(a, true, s),
3159             float16_to_float32(b, true, s), s);
3160 }
3161 
3162 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3163 {
3164     return float64_sub(float32_to_float64(a, s),
3165             float32_to_float64(b, s), s);
3166 
3167 }
3168 
3169 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3170 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3171 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3172 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3173 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3174 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3175 GEN_VEXT_VF(vfwsub_vf_h, 4)
3176 GEN_VEXT_VF(vfwsub_vf_w, 8)
3177 
3178 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3179 {
3180     return float32_add(a, float16_to_float32(b, true, s), s);
3181 }
3182 
3183 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3184 {
3185     return float64_add(a, float32_to_float64(b, s), s);
3186 }
3187 
3188 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3189 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3190 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3191 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3192 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3193 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3194 GEN_VEXT_VF(vfwadd_wf_h, 4)
3195 GEN_VEXT_VF(vfwadd_wf_w, 8)
3196 
3197 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3198 {
3199     return float32_sub(a, float16_to_float32(b, true, s), s);
3200 }
3201 
3202 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3203 {
3204     return float64_sub(a, float32_to_float64(b, s), s);
3205 }
3206 
3207 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3208 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3209 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3210 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3211 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3212 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3213 GEN_VEXT_VF(vfwsub_wf_h, 4)
3214 GEN_VEXT_VF(vfwsub_wf_w, 8)
3215 
3216 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3217 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3218 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3219 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3220 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3221 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3222 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3223 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3224 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3225 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3226 GEN_VEXT_VF(vfmul_vf_h, 2)
3227 GEN_VEXT_VF(vfmul_vf_w, 4)
3228 GEN_VEXT_VF(vfmul_vf_d, 8)
3229 
3230 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3231 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3232 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3233 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3234 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3235 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3236 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3237 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3238 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3239 GEN_VEXT_VF(vfdiv_vf_h, 2)
3240 GEN_VEXT_VF(vfdiv_vf_w, 4)
3241 GEN_VEXT_VF(vfdiv_vf_d, 8)
3242 
3243 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3244 {
3245     return float16_div(b, a, s);
3246 }
3247 
3248 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3249 {
3250     return float32_div(b, a, s);
3251 }
3252 
3253 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3254 {
3255     return float64_div(b, a, s);
3256 }
3257 
3258 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3259 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3260 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3261 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3262 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3263 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3264 
3265 /* Vector Widening Floating-Point Multiply */
3266 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3267 {
3268     return float32_mul(float16_to_float32(a, true, s),
3269             float16_to_float32(b, true, s), s);
3270 }
3271 
3272 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3273 {
3274     return float64_mul(float32_to_float64(a, s),
3275             float32_to_float64(b, s), s);
3276 
3277 }
3278 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3279 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3280 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3281 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3282 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3283 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3284 GEN_VEXT_VF(vfwmul_vf_h, 4)
3285 GEN_VEXT_VF(vfwmul_vf_w, 8)
3286 
3287 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3288 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3289 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3290         CPURISCVState *env)                                        \
3291 {                                                                  \
3292     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3293     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3294     TD d = *((TD *)vd + HD(i));                                    \
3295     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3296 }
3297 
3298 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3299 {
3300     return float16_muladd(a, b, d, 0, s);
3301 }
3302 
3303 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3304 {
3305     return float32_muladd(a, b, d, 0, s);
3306 }
3307 
3308 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3309 {
3310     return float64_muladd(a, b, d, 0, s);
3311 }
3312 
3313 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3314 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3315 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3316 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3317 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3318 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3319 
3320 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3321 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3322         CPURISCVState *env)                                       \
3323 {                                                                 \
3324     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3325     TD d = *((TD *)vd + HD(i));                                   \
3326     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3327 }
3328 
3329 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3330 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3331 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3332 GEN_VEXT_VF(vfmacc_vf_h, 2)
3333 GEN_VEXT_VF(vfmacc_vf_w, 4)
3334 GEN_VEXT_VF(vfmacc_vf_d, 8)
3335 
3336 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3337 {
3338     return float16_muladd(a, b, d,
3339             float_muladd_negate_c | float_muladd_negate_product, s);
3340 }
3341 
3342 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3343 {
3344     return float32_muladd(a, b, d,
3345             float_muladd_negate_c | float_muladd_negate_product, s);
3346 }
3347 
3348 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3349 {
3350     return float64_muladd(a, b, d,
3351             float_muladd_negate_c | float_muladd_negate_product, s);
3352 }
3353 
3354 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3355 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3356 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3357 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3358 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3359 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3360 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3361 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3362 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3363 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3364 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3365 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3366 
3367 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3368 {
3369     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3370 }
3371 
3372 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3373 {
3374     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3375 }
3376 
3377 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3378 {
3379     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3380 }
3381 
3382 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3383 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3384 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3385 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3386 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3387 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3388 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3389 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3390 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3391 GEN_VEXT_VF(vfmsac_vf_h, 2)
3392 GEN_VEXT_VF(vfmsac_vf_w, 4)
3393 GEN_VEXT_VF(vfmsac_vf_d, 8)
3394 
3395 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3396 {
3397     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3398 }
3399 
3400 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3401 {
3402     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3403 }
3404 
3405 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3406 {
3407     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3408 }
3409 
3410 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3411 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3412 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3413 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3414 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3415 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3416 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3417 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3418 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3419 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3420 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3421 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3422 
3423 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3424 {
3425     return float16_muladd(d, b, a, 0, s);
3426 }
3427 
3428 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3429 {
3430     return float32_muladd(d, b, a, 0, s);
3431 }
3432 
3433 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3434 {
3435     return float64_muladd(d, b, a, 0, s);
3436 }
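/*
 * Operand-order note (illustrative): OPFVV3/OPFVF3 call OP(s2, s1, d), so
 * the fmacc*/fnmacc*/fmsac*/fnmsac* helpers compute vs2 * vs1 combined with
 * the old vd (vfmacc: vd = vs1 * vs2 + vd), while fmadd*, fnmadd*, fmsub*
 * and fnmsub* swap the old destination into the multiplicand position via
 * muladd(d, b, a, ...), giving vfmadd: vd = vs1 * vd + vs2 and its negated
 * and subtracted variants.
 */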
3437 
3438 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3439 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3440 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3441 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3442 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3443 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3444 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3445 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3446 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3447 GEN_VEXT_VF(vfmadd_vf_h, 2)
3448 GEN_VEXT_VF(vfmadd_vf_w, 4)
3449 GEN_VEXT_VF(vfmadd_vf_d, 8)
3450 
3451 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3452 {
3453     return float16_muladd(d, b, a,
3454             float_muladd_negate_c | float_muladd_negate_product, s);
3455 }
3456 
3457 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3458 {
3459     return float32_muladd(d, b, a,
3460             float_muladd_negate_c | float_muladd_negate_product, s);
3461 }
3462 
3463 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3464 {
3465     return float64_muladd(d, b, a,
3466             float_muladd_negate_c | float_muladd_negate_product, s);
3467 }
3468 
3469 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3470 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3471 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3472 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3473 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3474 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3475 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3476 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3477 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3478 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3479 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3480 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3481 
3482 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3483 {
3484     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3485 }
3486 
3487 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3488 {
3489     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3490 }
3491 
3492 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3493 {
3494     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3495 }
3496 
3497 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3498 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3499 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3500 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3501 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3502 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3503 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3504 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3505 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3506 GEN_VEXT_VF(vfmsub_vf_h, 2)
3507 GEN_VEXT_VF(vfmsub_vf_w, 4)
3508 GEN_VEXT_VF(vfmsub_vf_d, 8)
3509 
3510 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3511 {
3512     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3513 }
3514 
3515 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3516 {
3517     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3518 }
3519 
3520 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3521 {
3522     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3523 }
3524 
3525 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3526 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3527 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3528 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3529 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3530 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3531 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3532 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3533 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3534 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3535 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3536 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3537 
3538 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3539 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3540 {
3541     return float32_muladd(float16_to_float32(a, true, s),
3542                         float16_to_float32(b, true, s), d, 0, s);
3543 }
3544 
3545 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3546 {
3547     return float64_muladd(float32_to_float64(a, s),
3548                         float32_to_float64(b, s), d, 0, s);
3549 }
3550 
3551 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3552 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3553 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3554 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3555 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3556 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3557 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3558 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3559 
3560 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3561 {
3562     return float32_muladd(float16_to_float32(a, true, s),
3563                         float16_to_float32(b, true, s), d,
3564                         float_muladd_negate_c | float_muladd_negate_product, s);
3565 }
3566 
3567 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3568 {
3569     return float64_muladd(float32_to_float64(a, s),
3570                         float32_to_float64(b, s), d,
3571                         float_muladd_negate_c | float_muladd_negate_product, s);
3572 }
3573 
3574 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3575 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3576 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3577 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3578 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3579 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3580 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3581 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3582 
3583 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3584 {
3585     return float32_muladd(float16_to_float32(a, true, s),
3586                         float16_to_float32(b, true, s), d,
3587                         float_muladd_negate_c, s);
3588 }
3589 
3590 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3591 {
3592     return float64_muladd(float32_to_float64(a, s),
3593                         float32_to_float64(b, s), d,
3594                         float_muladd_negate_c, s);
3595 }
3596 
3597 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3598 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3599 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3600 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3601 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3602 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3603 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3604 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3605 
3606 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3607 {
3608     return float32_muladd(float16_to_float32(a, true, s),
3609                         float16_to_float32(b, true, s), d,
3610                         float_muladd_negate_product, s);
3611 }
3612 
3613 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3614 {
3615     return float64_muladd(float32_to_float64(a, s),
3616                         float32_to_float64(b, s), d,
3617                         float_muladd_negate_product, s);
3618 }
3619 
3620 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3621 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3622 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3623 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3624 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3625 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3626 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3627 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3628 
3629 /* Vector Floating-Point Square-Root Instruction */
3630 /* (TD, T2, TX2) */
3631 #define OP_UU_H uint16_t, uint16_t, uint16_t
3632 #define OP_UU_W uint32_t, uint32_t, uint32_t
3633 #define OP_UU_D uint64_t, uint64_t, uint64_t
3634 
3635 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3636 static void do_##NAME(void *vd, void *vs2, int i,      \
3637         CPURISCVState *env)                            \
3638 {                                                      \
3639     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3640     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3641 }
3642 
3643 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3644 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3645         CPURISCVState *env, uint32_t desc)             \
3646 {                                                      \
3647     uint32_t vm = vext_vm(desc);                       \
3648     uint32_t vl = env->vl;                             \
3649     uint32_t total_elems =                             \
3650         vext_get_total_elems(env, desc, ESZ);          \
3651     uint32_t vta = vext_vta(desc);                     \
3652     uint32_t i;                                        \
3653                                                        \
3654     if (vl == 0) {                                     \
3655         return;                                        \
3656     }                                                  \
3657     for (i = env->vstart; i < vl; i++) {               \
3658         if (!vm && !vext_elem_mask(v0, i)) {           \
3659             continue;                                  \
3660         }                                              \
3661         do_##NAME(vd, vs2, i, env);                    \
3662     }                                                  \
3663     env->vstart = 0;                                   \
3664     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3665                       total_elems * ESZ);              \
3666 }
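/*
 * Note (illustrative): the vl == 0 early return leaves vd completely
 * untouched; per the RVV spec, when vl == 0 no destination elements are
 * updated, not even agnostic tail elements, which is why the tail is not
 * filled with 1s in that case.
 */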
3667 
3668 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3669 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3670 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3671 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3672 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3673 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3674 
3675 /*
3676  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3677  *
3678  * Adapted from riscv-v-spec recip.c:
3679  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3680  */
3681 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3682 {
3683     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3684     uint64_t exp = extract64(f, frac_size, exp_size);
3685     uint64_t frac = extract64(f, 0, frac_size);
3686 
3687     const uint8_t lookup_table[] = {
3688         52, 51, 50, 48, 47, 46, 44, 43,
3689         42, 41, 40, 39, 38, 36, 35, 34,
3690         33, 32, 31, 30, 30, 29, 28, 27,
3691         26, 25, 24, 23, 23, 22, 21, 20,
3692         19, 19, 18, 17, 16, 16, 15, 14,
3693         14, 13, 12, 12, 11, 10, 10, 9,
3694         9, 8, 7, 7, 6, 6, 5, 4,
3695         4, 3, 3, 2, 2, 1, 1, 0,
3696         127, 125, 123, 121, 119, 118, 116, 114,
3697         113, 111, 109, 108, 106, 105, 103, 102,
3698         100, 99, 97, 96, 95, 93, 92, 91,
3699         90, 88, 87, 86, 85, 84, 83, 82,
3700         80, 79, 78, 77, 76, 75, 74, 73,
3701         72, 71, 70, 70, 69, 68, 67, 66,
3702         65, 64, 63, 63, 62, 61, 60, 59,
3703         59, 58, 57, 56, 56, 55, 54, 53
3704     };
3705     const int precision = 7;
3706 
3707     if (exp == 0 && frac != 0) { /* subnormal */
3708         /* Normalize the subnormal. */
3709         while (extract64(frac, frac_size - 1, 1) == 0) {
3710             exp--;
3711             frac <<= 1;
3712         }
3713 
3714         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3715     }
3716 
3717     int idx = ((exp & 1) << (precision - 1)) |
3718                 (frac >> (frac_size - precision + 1));
3719     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3720                             (frac_size - precision);
3721     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3722 
3723     uint64_t val = 0;
3724     val = deposit64(val, 0, frac_size, out_frac);
3725     val = deposit64(val, frac_size, exp_size, out_exp);
3726     val = deposit64(val, frac_size + exp_size, 1, sign);
3727     return val;
3728 }
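/*
 * Worked example (illustrative, not from the source): frsqrt7_s(4.0f).
 * 4.0f has exp = 129 and frac = 0, so idx = ((129 & 1) << 6) | 0 = 64 and
 * lookup_table[64] = 127.  out_frac = 127 << 16, and out_exp, thanks to the
 * wrap-around of ~exp, is effectively (3 * 127 - 1 - 129) / 2 = 125 in the
 * low exponent bits.  The packed result is 2^(125-127) * (1 + 127/128)
 * ~= 0.498, i.e. 1/sqrt(4) = 0.5 to about 7 bits of precision.
 */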
3729 
3730 static float16 frsqrt7_h(float16 f, float_status *s)
3731 {
3732     int exp_size = 5, frac_size = 10;
3733     bool sign = float16_is_neg(f);
3734 
3735     /*
3736      * frsqrt7(sNaN) = canonical NaN
3737      * frsqrt7(-inf) = canonical NaN
3738      * frsqrt7(-normal) = canonical NaN
3739      * frsqrt7(-subnormal) = canonical NaN
3740      */
3741     if (float16_is_signaling_nan(f, s) ||
3742             (float16_is_infinity(f) && sign) ||
3743             (float16_is_normal(f) && sign) ||
3744             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3745         s->float_exception_flags |= float_flag_invalid;
3746         return float16_default_nan(s);
3747     }
3748 
3749     /* frsqrt7(qNaN) = canonical NaN */
3750     if (float16_is_quiet_nan(f, s)) {
3751         return float16_default_nan(s);
3752     }
3753 
3754     /* frsqrt7(+-0) = +-inf */
3755     if (float16_is_zero(f)) {
3756         s->float_exception_flags |= float_flag_divbyzero;
3757         return float16_set_sign(float16_infinity, sign);
3758     }
3759 
3760     /* frsqrt7(+inf) = +0 */
3761     if (float16_is_infinity(f) && !sign) {
3762         return float16_set_sign(float16_zero, sign);
3763     }
3764 
3765     /* +normal, +subnormal */
3766     uint64_t val = frsqrt7(f, exp_size, frac_size);
3767     return make_float16(val);
3768 }
3769 
3770 static float32 frsqrt7_s(float32 f, float_status *s)
3771 {
3772     int exp_size = 8, frac_size = 23;
3773     bool sign = float32_is_neg(f);
3774 
3775     /*
3776      * frsqrt7(sNaN) = canonical NaN
3777      * frsqrt7(-inf) = canonical NaN
3778      * frsqrt7(-normal) = canonical NaN
3779      * frsqrt7(-subnormal) = canonical NaN
3780      */
3781     if (float32_is_signaling_nan(f, s) ||
3782             (float32_is_infinity(f) && sign) ||
3783             (float32_is_normal(f) && sign) ||
3784             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3785         s->float_exception_flags |= float_flag_invalid;
3786         return float32_default_nan(s);
3787     }
3788 
3789     /* frsqrt7(qNaN) = canonical NaN */
3790     if (float32_is_quiet_nan(f, s)) {
3791         return float32_default_nan(s);
3792     }
3793 
3794     /* frsqrt7(+-0) = +-inf */
3795     if (float32_is_zero(f)) {
3796         s->float_exception_flags |= float_flag_divbyzero;
3797         return float32_set_sign(float32_infinity, sign);
3798     }
3799 
3800     /* frsqrt7(+inf) = +0 */
3801     if (float32_is_infinity(f) && !sign) {
3802         return float32_set_sign(float32_zero, sign);
3803     }
3804 
3805     /* +normal, +subnormal */
3806     uint64_t val = frsqrt7(f, exp_size, frac_size);
3807     return make_float32(val);
3808 }
3809 
3810 static float64 frsqrt7_d(float64 f, float_status *s)
3811 {
3812     int exp_size = 11, frac_size = 52;
3813     bool sign = float64_is_neg(f);
3814 
3815     /*
3816      * frsqrt7(sNaN) = canonical NaN
3817      * frsqrt7(-inf) = canonical NaN
3818      * frsqrt7(-normal) = canonical NaN
3819      * frsqrt7(-subnormal) = canonical NaN
3820      */
3821     if (float64_is_signaling_nan(f, s) ||
3822             (float64_is_infinity(f) && sign) ||
3823             (float64_is_normal(f) && sign) ||
3824             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3825         s->float_exception_flags |= float_flag_invalid;
3826         return float64_default_nan(s);
3827     }
3828 
3829     /* frsqrt7(qNaN) = canonical NaN */
3830     if (float64_is_quiet_nan(f, s)) {
3831         return float64_default_nan(s);
3832     }
3833 
3834     /* frsqrt7(+-0) = +-inf */
3835     if (float64_is_zero(f)) {
3836         s->float_exception_flags |= float_flag_divbyzero;
3837         return float64_set_sign(float64_infinity, sign);
3838     }
3839 
3840     /* frsqrt7(+inf) = +0 */
3841     if (float64_is_infinity(f) && !sign) {
3842         return float64_set_sign(float64_zero, sign);
3843     }
3844 
3845     /* +normal, +subnormal */
3846     uint64_t val = frsqrt7(f, exp_size, frac_size);
3847     return make_float64(val);
3848 }
3849 
3850 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3851 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3852 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3853 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3854 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3855 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3856 
3857 /*
3858  * Vector Floating-Point Reciprocal Estimate Instruction
3859  *
3860  * Adapted from riscv-v-spec recip.c:
3861  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3862  */
3863 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3864                       float_status *s)
3865 {
3866     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3867     uint64_t exp = extract64(f, frac_size, exp_size);
3868     uint64_t frac = extract64(f, 0, frac_size);
3869 
3870     const uint8_t lookup_table[] = {
3871         127, 125, 123, 121, 119, 117, 116, 114,
3872         112, 110, 109, 107, 105, 104, 102, 100,
3873         99, 97, 96, 94, 93, 91, 90, 88,
3874         87, 85, 84, 83, 81, 80, 79, 77,
3875         76, 75, 74, 72, 71, 70, 69, 68,
3876         66, 65, 64, 63, 62, 61, 60, 59,
3877         58, 57, 56, 55, 54, 53, 52, 51,
3878         50, 49, 48, 47, 46, 45, 44, 43,
3879         42, 41, 40, 40, 39, 38, 37, 36,
3880         35, 35, 34, 33, 32, 31, 31, 30,
3881         29, 28, 28, 27, 26, 25, 25, 24,
3882         23, 23, 22, 21, 21, 20, 19, 19,
3883         18, 17, 17, 16, 15, 15, 14, 14,
3884         13, 12, 12, 11, 11, 10, 9, 9,
3885         8, 8, 7, 7, 6, 5, 5, 4,
3886         4, 3, 3, 2, 2, 1, 1, 0
3887     };
3888     const int precision = 7;
3889 
3890     if (exp == 0 && frac != 0) { /* subnormal */
3891         /* Normalize the subnormal. */
3892         while (extract64(frac, frac_size - 1, 1) == 0) {
3893             exp--;
3894             frac <<= 1;
3895         }
3896 
3897         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3898 
3899         if (exp != 0 && exp != UINT64_MAX) {
3900             /*
3901              * Overflow to inf or max value of same sign,
3902              * depending on sign and rounding mode.
3903              */
3904             s->float_exception_flags |= (float_flag_inexact |
3905                                          float_flag_overflow);
3906 
3907             if ((s->float_rounding_mode == float_round_to_zero) ||
3908                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3909                 ((s->float_rounding_mode == float_round_up) && sign)) {
3910                 /* Return the largest finite value of the same sign. */
3911                 return (sign << (exp_size + frac_size)) |
3912                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3913             } else {
3914                 /* Return +-inf. */
3915                 return (sign << (exp_size + frac_size)) |
3916                     MAKE_64BIT_MASK(frac_size, exp_size);
3917             }
3918         }
3919     }
3920 
3921     int idx = frac >> (frac_size - precision);
3922     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3923                             (frac_size - precision);
3924     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3925 
3926     if (out_exp == 0 || out_exp == UINT64_MAX) {
3927         /*
3928          * The result is subnormal, but don't raise the underflow exception,
3929          * because there's no additional loss of precision.
3930          */
3931         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3932         if (out_exp == UINT64_MAX) {
3933             out_frac >>= 1;
3934             out_exp = 0;
3935         }
3936     }
3937 
3938     uint64_t val = 0;
3939     val = deposit64(val, 0, frac_size, out_frac);
3940     val = deposit64(val, frac_size, exp_size, out_exp);
3941     val = deposit64(val, frac_size + exp_size, 1, sign);
3942     return val;
3943 }
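/*
 * Worked example (illustrative, not from the source): frec7_s(2.0f).
 * 2.0f has exp = 128 and frac = 0, so idx = 0, lookup_table[0] = 127,
 * out_frac = 127 << 16 and out_exp is effectively
 * 2 * 127 - 1 - 128 = 125, giving 2^(125-127) * (1 + 127/128) ~= 0.498,
 * i.e. 1/2.0 to about 7 bits.  The out_exp == 0 / UINT64_MAX branch in
 * frec7() above handles large inputs whose reciprocal estimate falls into
 * the subnormal range.
 */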
3944 
3945 static float16 frec7_h(float16 f, float_status *s)
3946 {
3947     int exp_size = 5, frac_size = 10;
3948     bool sign = float16_is_neg(f);
3949 
3950     /* frec7(+-inf) = +-0 */
3951     if (float16_is_infinity(f)) {
3952         return float16_set_sign(float16_zero, sign);
3953     }
3954 
3955     /* frec7(+-0) = +-inf */
3956     if (float16_is_zero(f)) {
3957         s->float_exception_flags |= float_flag_divbyzero;
3958         return float16_set_sign(float16_infinity, sign);
3959     }
3960 
3961     /* frec7(sNaN) = canonical NaN */
3962     if (float16_is_signaling_nan(f, s)) {
3963         s->float_exception_flags |= float_flag_invalid;
3964         return float16_default_nan(s);
3965     }
3966 
3967     /* frec7(qNaN) = canonical NaN */
3968     if (float16_is_quiet_nan(f, s)) {
3969         return float16_default_nan(s);
3970     }
3971 
3972     /* +-normal, +-subnormal */
3973     uint64_t val = frec7(f, exp_size, frac_size, s);
3974     return make_float16(val);
3975 }
3976 
3977 static float32 frec7_s(float32 f, float_status *s)
3978 {
3979     int exp_size = 8, frac_size = 23;
3980     bool sign = float32_is_neg(f);
3981 
3982     /* frec7(+-inf) = +-0 */
3983     if (float32_is_infinity(f)) {
3984         return float32_set_sign(float32_zero, sign);
3985     }
3986 
3987     /* frec7(+-0) = +-inf */
3988     if (float32_is_zero(f)) {
3989         s->float_exception_flags |= float_flag_divbyzero;
3990         return float32_set_sign(float32_infinity, sign);
3991     }
3992 
3993     /* frec7(sNaN) = canonical NaN */
3994     if (float32_is_signaling_nan(f, s)) {
3995         s->float_exception_flags |= float_flag_invalid;
3996         return float32_default_nan(s);
3997     }
3998 
3999     /* frec7(qNaN) = canonical NaN */
4000     if (float32_is_quiet_nan(f, s)) {
4001         return float32_default_nan(s);
4002     }
4003 
4004     /* +-normal, +-subnormal */
4005     uint64_t val = frec7(f, exp_size, frac_size, s);
4006     return make_float32(val);
4007 }
4008 
4009 static float64 frec7_d(float64 f, float_status *s)
4010 {
4011     int exp_size = 11, frac_size = 52;
4012     bool sign = float64_is_neg(f);
4013 
4014     /* frec7(+-inf) = +-0 */
4015     if (float64_is_infinity(f)) {
4016         return float64_set_sign(float64_zero, sign);
4017     }
4018 
4019     /* frec7(+-0) = +-inf */
4020     if (float64_is_zero(f)) {
4021         s->float_exception_flags |= float_flag_divbyzero;
4022         return float64_set_sign(float64_infinity, sign);
4023     }
4024 
4025     /* frec7(sNaN) = canonical NaN */
4026     if (float64_is_signaling_nan(f, s)) {
4027         s->float_exception_flags |= float_flag_invalid;
4028         return float64_default_nan(s);
4029     }
4030 
4031     /* frec7(qNaN) = canonical NaN */
4032     if (float64_is_quiet_nan(f, s)) {
4033         return float64_default_nan(s);
4034     }
4035 
4036     /* +-normal, +-subnormal */
4037     uint64_t val = frec7(f, exp_size, frac_size, s);
4038     return make_float64(val);
4039 }
4040 
4041 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4042 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4043 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4044 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4045 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4046 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4047 
4048 /* Vector Floating-Point MIN/MAX Instructions */
4049 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4050 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4051 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4052 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4053 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4054 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4055 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4056 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4057 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4058 GEN_VEXT_VF(vfmin_vf_h, 2)
4059 GEN_VEXT_VF(vfmin_vf_w, 4)
4060 GEN_VEXT_VF(vfmin_vf_d, 8)
4061 
4062 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4063 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4064 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4065 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4066 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4067 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4068 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4069 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4070 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4071 GEN_VEXT_VF(vfmax_vf_h, 2)
4072 GEN_VEXT_VF(vfmax_vf_w, 4)
4073 GEN_VEXT_VF(vfmax_vf_d, 8)
4074 
4075 /* Vector Floating-Point Sign-Injection Instructions */
4076 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4077 {
4078     return deposit64(b, 0, 15, a);
4079 }
4080 
4081 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4082 {
4083     return deposit64(b, 0, 31, a);
4084 }
4085 
4086 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4087 {
4088     return deposit64(b, 0, 63, a);
4089 }
4090 
4091 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4092 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4093 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4094 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4095 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4096 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4097 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4098 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4099 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4100 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4101 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4102 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4103 
4104 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4105 {
4106     return deposit64(~b, 0, 15, a);
4107 }
4108 
4109 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4110 {
4111     return deposit64(~b, 0, 31, a);
4112 }
4113 
4114 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4115 {
4116     return deposit64(~b, 0, 63, a);
4117 }
4118 
4119 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4120 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4121 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4122 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4123 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4124 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4125 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4126 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4127 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4128 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4129 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4130 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4131 
4132 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4133 {
4134     return deposit64(b ^ a, 0, 15, a);
4135 }
4136 
4137 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4138 {
4139     return deposit64(b ^ a, 0, 31, a);
4140 }
4141 
4142 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4143 {
4144     return deposit64(b ^ a, 0, 63, a);
4145 }
4146 
4147 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4148 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4149 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4150 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4151 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4152 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4153 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4154 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4155 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4156 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4157 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4158 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
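
/*
 * Illustrative sketch, not used by the helpers above: for a 32-bit element
 * the three sign-injection flavours are plain bit manipulation on the raw
 * encoding.  vfsgnj takes the sign of b, vfsgnjn the inverted sign of b and
 * vfsgnjx the XOR of both signs, while bits 30:0 always come from a.  The
 * example function below is hypothetical and assumes IEEE-754 single
 * precision encodings, e.g. a = 0x3f800000 (+1.0f) and b = 0xc0000000
 * (-2.0f) give 0x3f800000 (+1.0f).
 */
static inline uint32_t fsgnjn32_example(uint32_t a, uint32_t b)
{
    /* magnitude (bits 30:0) from a, sign bit from ~b */
    return (a & 0x7fffffffu) | (~b & 0x80000000u);
}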
4159 
4160 /* Vector Floating-Point Compare Instructions */
4161 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4162 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4163                   CPURISCVState *env, uint32_t desc)          \
4164 {                                                             \
4165     uint32_t vm = vext_vm(desc);                              \
4166     uint32_t vl = env->vl;                                    \
4167     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
4168     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4169     uint32_t i;                                               \
4170                                                               \
4171     for (i = env->vstart; i < vl; i++) {                      \
4172         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4173         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4174         if (!vm && !vext_elem_mask(v0, i)) {                  \
4175             continue;                                         \
4176         }                                                     \
4177         vext_set_elem_mask(vd, i,                             \
4178                            DO_OP(s2, s1, &env->fp_status));   \
4179     }                                                         \
4180     env->vstart = 0;                                          \
4181     /* mask destination register is always tail-agnostic */   \
4182     /* set tail elements to 1s */                             \
4183     if (vta_all_1s) {                                         \
4184         for (; i < total_elems; i++) {                        \
4185             vext_set_elem_mask(vd, i, 1);                     \
4186         }                                                     \
4187     }                                                         \
4188 }
4189 
4190 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4191 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4192 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4193 
4194 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4195 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4196                   CPURISCVState *env, uint32_t desc)                \
4197 {                                                                   \
4198     uint32_t vm = vext_vm(desc);                                    \
4199     uint32_t vl = env->vl;                                          \
4200     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
4201     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4202     uint32_t i;                                                     \
4203                                                                     \
4204     for (i = env->vstart; i < vl; i++) {                            \
4205         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4206         if (!vm && !vext_elem_mask(v0, i)) {                        \
4207             continue;                                               \
4208         }                                                           \
4209         vext_set_elem_mask(vd, i,                                   \
4210                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4211     }                                                               \
4212     env->vstart = 0;                                                \
4213     /* mask destination register is always tail-agnostic */         \
4214     /* set tail elements to 1s */                                   \
4215     if (vta_all_1s) {                                               \
4216         for (; i < total_elems; i++) {                              \
4217             vext_set_elem_mask(vd, i, 1);                           \
4218         }                                                           \
4219     }                                                               \
4220 }
4221 
4222 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4223 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4224 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4225 
4226 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4227 {
4228     FloatRelation compare = float16_compare_quiet(a, b, s);
4229     return compare != float_relation_equal;
4230 }
4231 
4232 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4233 {
4234     FloatRelation compare = float32_compare_quiet(a, b, s);
4235     return compare != float_relation_equal;
4236 }
4237 
4238 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4239 {
4240     FloatRelation compare = float64_compare_quiet(a, b, s);
4241     return compare != float_relation_equal;
4242 }
4243 
4244 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4245 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4246 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4247 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4248 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4249 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
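
/*
 * Illustrative note: vmfne is built on the quiet compare, so a NaN operand
 * makes the "not equal" result true without raising the invalid flag,
 * whereas the signaling float*_lt/float*_le predicates used by vmflt/vmfle
 * below do raise it for NaN operands.  The hypothetical check below assumes
 * the single-precision qNaN encoding 0x7fc00000.
 */
static inline bool vmfne32_nan_example(float_status *s)
{
    /* a qNaN compares unordered, hence "not equal", with itself */
    return vmfne32(0x7fc00000, 0x7fc00000, s);
}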
4250 
4251 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4252 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4253 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4254 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4255 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4256 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4257 
4258 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4259 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4260 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4261 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4262 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4263 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4264 
4265 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4266 {
4267     FloatRelation compare = float16_compare(a, b, s);
4268     return compare == float_relation_greater;
4269 }
4270 
4271 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4272 {
4273     FloatRelation compare = float32_compare(a, b, s);
4274     return compare == float_relation_greater;
4275 }
4276 
4277 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4278 {
4279     FloatRelation compare = float64_compare(a, b, s);
4280     return compare == float_relation_greater;
4281 }
4282 
4283 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4284 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4285 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4286 
4287 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4288 {
4289     FloatRelation compare = float16_compare(a, b, s);
4290     return compare == float_relation_greater ||
4291            compare == float_relation_equal;
4292 }
4293 
4294 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4295 {
4296     FloatRelation compare = float32_compare(a, b, s);
4297     return compare == float_relation_greater ||
4298            compare == float_relation_equal;
4299 }
4300 
4301 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4302 {
4303     FloatRelation compare = float64_compare(a, b, s);
4304     return compare == float_relation_greater ||
4305            compare == float_relation_equal;
4306 }
4307 
4308 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4309 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4310 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4311 
4312 /* Vector Floating-Point Classify Instruction */
4313 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4314 static void do_##NAME(void *vd, void *vs2, int i)      \
4315 {                                                      \
4316     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4317     *((TD *)vd + HD(i)) = OP(s2);                      \
4318 }
4319 
4320 #define GEN_VEXT_V(NAME, ESZ)                          \
4321 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4322                   CPURISCVState *env, uint32_t desc)   \
4323 {                                                      \
4324     uint32_t vm = vext_vm(desc);                       \
4325     uint32_t vl = env->vl;                             \
4326     uint32_t total_elems =                             \
4327         vext_get_total_elems(env, desc, ESZ);          \
4328     uint32_t vta = vext_vta(desc);                     \
4329     uint32_t i;                                        \
4330                                                        \
4331     for (i = env->vstart; i < vl; i++) {               \
4332         if (!vm && !vext_elem_mask(v0, i)) {           \
4333             continue;                                  \
4334         }                                              \
4335         do_##NAME(vd, vs2, i);                         \
4336     }                                                  \
4337     env->vstart = 0;                                   \
4338     /* set tail elements to 1s */                      \
4339     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4340                       total_elems * ESZ);              \
4341 }
4342 
4343 target_ulong fclass_h(uint64_t frs1)
4344 {
4345     float16 f = frs1;
4346     bool sign = float16_is_neg(f);
4347 
4348     if (float16_is_infinity(f)) {
4349         return sign ? 1 << 0 : 1 << 7;
4350     } else if (float16_is_zero(f)) {
4351         return sign ? 1 << 3 : 1 << 4;
4352     } else if (float16_is_zero_or_denormal(f)) {
4353         return sign ? 1 << 2 : 1 << 5;
4354     } else if (float16_is_any_nan(f)) {
4355         float_status s = { }; /* for snan_bit_is_one */
4356         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4357     } else {
4358         return sign ? 1 << 1 : 1 << 6;
4359     }
4360 }
4361 
4362 target_ulong fclass_s(uint64_t frs1)
4363 {
4364     float32 f = frs1;
4365     bool sign = float32_is_neg(f);
4366 
4367     if (float32_is_infinity(f)) {
4368         return sign ? 1 << 0 : 1 << 7;
4369     } else if (float32_is_zero(f)) {
4370         return sign ? 1 << 3 : 1 << 4;
4371     } else if (float32_is_zero_or_denormal(f)) {
4372         return sign ? 1 << 2 : 1 << 5;
4373     } else if (float32_is_any_nan(f)) {
4374         float_status s = { }; /* for snan_bit_is_one */
4375         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4376     } else {
4377         return sign ? 1 << 1 : 1 << 6;
4378     }
4379 }
4380 
4381 target_ulong fclass_d(uint64_t frs1)
4382 {
4383     float64 f = frs1;
4384     bool sign = float64_is_neg(f);
4385 
4386     if (float64_is_infinity(f)) {
4387         return sign ? 1 << 0 : 1 << 7;
4388     } else if (float64_is_zero(f)) {
4389         return sign ? 1 << 3 : 1 << 4;
4390     } else if (float64_is_zero_or_denormal(f)) {
4391         return sign ? 1 << 2 : 1 << 5;
4392     } else if (float64_is_any_nan(f)) {
4393         float_status s = { }; /* for snan_bit_is_one */
4394         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4395     } else {
4396         return sign ? 1 << 1 : 1 << 6;
4397     }
4398 }
4399 
4400 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4401 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4402 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4403 GEN_VEXT_V(vfclass_v_h, 2)
4404 GEN_VEXT_V(vfclass_v_w, 4)
4405 GEN_VEXT_V(vfclass_v_d, 8)
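
/*
 * Illustrative sketch, not used by the helpers above: the classify helpers
 * return the standard RISC-V 10-bit class mask.  The hypothetical check
 * below assumes IEEE-754 single precision encodings for -0.0 (0x80000000)
 * and +inf (0x7f800000).
 */
static inline bool fclass_s_example(void)
{
    return fclass_s(0x80000000u) == (1 << 3) &&   /* negative zero */
           fclass_s(0x7f800000u) == (1 << 7);     /* positive infinity */
}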
4406 
4407 /* Vector Floating-Point Merge Instruction */
4408 
4409 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4410 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4411                   CPURISCVState *env, uint32_t desc)          \
4412 {                                                             \
4413     uint32_t vm = vext_vm(desc);                              \
4414     uint32_t vl = env->vl;                                    \
4415     uint32_t esz = sizeof(ETYPE);                             \
4416     uint32_t total_elems =                                    \
4417         vext_get_total_elems(env, desc, esz);                 \
4418     uint32_t vta = vext_vta(desc);                            \
4419     uint32_t i;                                               \
4420                                                               \
4421     for (i = env->vstart; i < vl; i++) {                      \
4422         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4423         *((ETYPE *)vd + H(i))                                 \
4424           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4425     }                                                         \
4426     env->vstart = 0;                                          \
4427     /* set tail elements to 1s */                             \
4428     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4429 }
4430 
4431 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4432 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4433 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4434 
4435 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4436 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4437 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4438 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4439 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4440 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4441 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4442 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4443 
4444 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4445 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4446 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4447 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4448 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4449 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4450 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4451 
4452 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4453 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4454 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4455 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4456 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4457 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4458 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4459 
4460 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4461 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4462 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4463 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4464 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4465 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4466 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4467 
4468 /* Widening Floating-Point/Integer Type-Convert Instructions */
4469 /* (TD, T2, TX2) */
4470 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4471 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4472 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4473 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
4474 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4475 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4476 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4477 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4478 
4479 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4480 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4481 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4482 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4483 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4484 
4485 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4486 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4487 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4488 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4489 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4490 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4491 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4492 
4493 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4494 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4495 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4496 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4497 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4498 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4499 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4500 
4501 /*
4502  * vfwcvt.f.f.v vd, vs2, vm
4503  * Convert single-width float to double-width float.  The f16 wrapper
 * below passes ieee = true to float16_to_float32(), i.e. IEEE
 * half-precision rather than the ARM alternative half format.
4504  */
4505 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4506 {
4507     return float16_to_float32(a, true, s);
4508 }
4509 
4510 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4511 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4512 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4513 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4514 
4515 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4516 /* (TD, T2, TX2) */
4517 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4518 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4519 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4520 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4521 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4522 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4523 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4524 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4525 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4526 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4527 
4528 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4529 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4530 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4531 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4532 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4533 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4534 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4535 
4536 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4537 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4538 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4539 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4540 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4541 
4542 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4543 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4544 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4545 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4546 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4547 
4548 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4549 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4550 {
4551     return float32_to_float16(a, true, s);
4552 }
4553 
4554 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4555 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4556 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4557 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4558 
4559 /*
4560  *** Vector Reduction Operations
4561  */
4562 /* Vector Single-Width Integer Reduction Instructions */
4563 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4564 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4565         void *vs2, CPURISCVState *env, uint32_t desc)     \
4566 {                                                         \
4567     uint32_t vm = vext_vm(desc);                          \
4568     uint32_t vl = env->vl;                                \
4569     uint32_t esz = sizeof(TD);                            \
4570     uint32_t vlenb = simd_maxsz(desc);                    \
4571     uint32_t vta = vext_vta(desc);                        \
4572     uint32_t i;                                           \
4573     TD s1 =  *((TD *)vs1 + HD(0));                        \
4574                                                           \
4575     for (i = env->vstart; i < vl; i++) {                  \
4576         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4577         if (!vm && !vext_elem_mask(v0, i)) {              \
4578             continue;                                     \
4579         }                                                 \
4580         s1 = OP(s1, (TD)s2);                              \
4581     }                                                     \
4582     *((TD *)vd + HD(0)) = s1;                             \
4583     env->vstart = 0;                                      \
4584     /* set tail elements to 1s */                         \
4585     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4586 }
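
/*
 * Worked example (illustrative): with vl = 4, vs1[0] = 10 and
 * vs2 = {1, 2, 3, 4}, an unmasked vredsum.vs writes 10 + 1 + 2 + 3 + 4 = 20
 * to vd[0].  Elements of vd past index 0 are tail elements and are either
 * left undisturbed or set to all 1s, depending on vta.
 */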
4587 
4588 /* vd[0] = sum(vs1[0], vs2[*]) */
4589 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4590 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4591 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4592 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4593 
4594 /* vd[0] = maxu(vs1[0], vs2[*]) */
4595 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4596 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4597 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4598 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4599 
4600 /* vd[0] = max(vs1[0], vs2[*]) */
4601 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4602 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4603 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4604 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4605 
4606 /* vd[0] = minu(vs1[0], vs2[*]) */
4607 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4608 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4609 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4610 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4611 
4612 /* vd[0] = min(vs1[0], vs2[*]) */
4613 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4614 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4615 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4616 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4617 
4618 /* vd[0] = and(vs1[0], vs2[*]) */
4619 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4620 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4621 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4622 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4623 
4624 /* vd[0] = or(vs1[0], vs2[*]) */
4625 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4626 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4627 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4628 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4629 
4630 /* vd[0] = xor(vs1[0], vs2[*]) */
4631 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4632 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4633 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4634 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4635 
4636 /* Vector Widening Integer Reduction Instructions */
4637 /* Signed sum reduction into double-width accumulator */
4638 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4639 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4640 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4641 
4642 /* Unsigned sum reduction into double-width accumulator */
4643 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4644 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4645 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4646 
4647 /* Vector Single-Width Floating-Point Reduction Instructions */
4648 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4649 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4650                   void *vs2, CPURISCVState *env,           \
4651                   uint32_t desc)                           \
4652 {                                                          \
4653     uint32_t vm = vext_vm(desc);                           \
4654     uint32_t vl = env->vl;                                 \
4655     uint32_t esz = sizeof(TD);                             \
4656     uint32_t vlenb = simd_maxsz(desc);                     \
4657     uint32_t vta = vext_vta(desc);                         \
4658     uint32_t i;                                            \
4659     TD s1 =  *((TD *)vs1 + HD(0));                         \
4660                                                            \
4661     for (i = env->vstart; i < vl; i++) {                   \
4662         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4663         if (!vm && !vext_elem_mask(v0, i)) {               \
4664             continue;                                      \
4665         }                                                  \
4666         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4667     }                                                      \
4668     *((TD *)vd + HD(0)) = s1;                              \
4669     env->vstart = 0;                                       \
4670     /* set tail elements to 1s */                          \
4671     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4672 }
4673 
4674 /* Unordered sum */
4675 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4676 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4677 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4678 
4679 /* Maximum value */
4680 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4681 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4682 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4683 
4684 /* Minimum value */
4685 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4686 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4687 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4688 
4689 /* Vector Widening Floating-Point Reduction Instructions */
4690 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4691 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4692                             void *vs2, CPURISCVState *env, uint32_t desc)
4693 {
4694     uint32_t vm = vext_vm(desc);
4695     uint32_t vl = env->vl;
4696     uint32_t esz = sizeof(uint32_t);
4697     uint32_t vlenb = simd_maxsz(desc);
4698     uint32_t vta = vext_vta(desc);
4699     uint32_t i;
4700     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4701 
4702     for (i = env->vstart; i < vl; i++) {
4703         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4704         if (!vm && !vext_elem_mask(v0, i)) {
4705             continue;
4706         }
4707         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4708                          &env->fp_status);
4709     }
4710     *((uint32_t *)vd + H4(0)) = s1;
4711     env->vstart = 0;
4712     /* set tail elements to 1s */
4713     vext_set_elems_1s(vd, vta, esz, vlenb);
4714 }
4715 
4716 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4717                             void *vs2, CPURISCVState *env, uint32_t desc)
4718 {
4719     uint32_t vm = vext_vm(desc);
4720     uint32_t vl = env->vl;
4721     uint32_t esz = sizeof(uint64_t);
4722     uint32_t vlenb = simd_maxsz(desc);
4723     uint32_t vta = vext_vta(desc);
4724     uint32_t i;
4725     uint64_t s1 =  *((uint64_t *)vs1);
4726 
4727     for (i = env->vstart; i < vl; i++) {
4728         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4729         if (!vm && !vext_elem_mask(v0, i)) {
4730             continue;
4731         }
4732         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4733                          &env->fp_status);
4734     }
4735     *((uint64_t *)vd) = s1;
4736     env->vstart = 0;
4737     /* set tail elements to 1s */
4738     vext_set_elems_1s(vd, vta, esz, vlenb);
4739 }
4740 
4741 /*
4742  *** Vector Mask Operations
4743  */
4744 /* Vector Mask-Register Logical Instructions */
4745 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4746 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4747                   void *vs2, CPURISCVState *env,          \
4748                   uint32_t desc)                          \
4749 {                                                         \
4750     uint32_t vl = env->vl;                                \
4751     uint32_t total_elems = env_archcpu(env)->cfg.vlen;    \
4752     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4753     uint32_t i;                                           \
4754     int a, b;                                             \
4755                                                           \
4756     for (i = env->vstart; i < vl; i++) {                  \
4757         a = vext_elem_mask(vs1, i);                       \
4758         b = vext_elem_mask(vs2, i);                       \
4759         vext_set_elem_mask(vd, i, OP(b, a));              \
4760     }                                                     \
4761     env->vstart = 0;                                      \
4762     /*                                                    \
4763      * mask destination register is always tail-agnostic  \
4764      */                                                   \
4765     /* set tail elements to 1s */                         \
4766     if (vta_all_1s) {                                     \
4767         for (; i < total_elems; i++) {                    \
4768             vext_set_elem_mask(vd, i, 1);                 \
4769         }                                                 \
4770     }                                                     \
4771 }
4772 
4773 #define DO_NAND(N, M)  (!(N & M))
4774 #define DO_ANDNOT(N, M)  (N & !M)
4775 #define DO_NOR(N, M)  (!(N | M))
4776 #define DO_ORNOT(N, M)  (N | !M)
4777 #define DO_XNOR(N, M)  (!(N ^ M))
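
/*
 * Illustrative note: the operands fed to these macros are single mask bits
 * (0 or 1), which is why logical rather than bitwise negation suffices,
 * e.g. DO_ANDNOT(1, 0) == 1 and DO_NAND(1, 1) == 0.
 */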
4778 
4779 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4780 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4781 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4782 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4783 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4784 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4785 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4786 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4787 
4788 /* Vector count population in mask vcpop */
4789 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4790                              uint32_t desc)
4791 {
4792     target_ulong cnt = 0;
4793     uint32_t vm = vext_vm(desc);
4794     uint32_t vl = env->vl;
4795     int i;
4796 
4797     for (i = env->vstart; i < vl; i++) {
4798         if (vm || vext_elem_mask(v0, i)) {
4799             if (vext_elem_mask(vs2, i)) {
4800                 cnt++;
4801             }
4802         }
4803     }
4804     env->vstart = 0;
4805     return cnt;
4806 }
4807 
4808 /* vfirst find-first-set mask bit */
4809 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4810                               uint32_t desc)
4811 {
4812     uint32_t vm = vext_vm(desc);
4813     uint32_t vl = env->vl;
4814     int i;
4815 
4816     for (i = env->vstart; i < vl; i++) {
4817         if (vm || vext_elem_mask(v0, i)) {
4818             if (vext_elem_mask(vs2, i)) {
4819                 return i;
4820             }
4821         }
4822     }
4823     env->vstart = 0;
4824     return -1LL;
4825 }
4826 
4827 enum set_mask_type {
4828     ONLY_FIRST = 1,
4829     INCLUDE_FIRST,
4830     BEFORE_FIRST,
4831 };
4832 
4833 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4834                    uint32_t desc, enum set_mask_type type)
4835 {
4836     uint32_t vm = vext_vm(desc);
4837     uint32_t vl = env->vl;
4838     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4839     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4840     int i;
4841     bool first_mask_bit = false;
4842 
4843     for (i = env->vstart; i < vl; i++) {
4844         if (!vm && !vext_elem_mask(v0, i)) {
4845             continue;
4846         }
4847         /* write a zero to all following active elements */
4848         if (first_mask_bit) {
4849             vext_set_elem_mask(vd, i, 0);
4850             continue;
4851         }
4852         if (vext_elem_mask(vs2, i)) {
4853             first_mask_bit = true;
4854             if (type == BEFORE_FIRST) {
4855                 vext_set_elem_mask(vd, i, 0);
4856             } else {
4857                 vext_set_elem_mask(vd, i, 1);
4858             }
4859         } else {
4860             if (type == ONLY_FIRST) {
4861                 vext_set_elem_mask(vd, i, 0);
4862             } else {
4863                 vext_set_elem_mask(vd, i, 1);
4864             }
4865         }
4866     }
4867     env->vstart = 0;
4868     /* mask destination register is always tail-agnostic */
4869     /* set tail elements to 1s */
4870     if (vta_all_1s) {
4871         for (; i < total_elems; i++) {
4872             vext_set_elem_mask(vd, i, 1);
4873         }
4874     }
4875 }
4876 
4877 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4878                      uint32_t desc)
4879 {
4880     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4881 }
4882 
4883 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4884                      uint32_t desc)
4885 {
4886     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4887 }
4888 
4889 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4890                      uint32_t desc)
4891 {
4892     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4893 }
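
/*
 * Worked example (illustrative): for an unmasked operation with vl = 8 and
 * vs2 = 0b00010100 (element 0 is bit 0, first set bit at element 2):
 *   vmsbf.m writes 0b00000011   (set before first)
 *   vmsif.m writes 0b00000111   (set including first)
 *   vmsof.m writes 0b00000100   (set only first)
 */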
4894 
4895 /* Vector Iota Instruction */
4896 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4897 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4898                   uint32_t desc)                                          \
4899 {                                                                         \
4900     uint32_t vm = vext_vm(desc);                                          \
4901     uint32_t vl = env->vl;                                                \
4902     uint32_t esz = sizeof(ETYPE);                                         \
4903     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4904     uint32_t vta = vext_vta(desc);                                        \
4905     uint32_t sum = 0;                                                     \
4906     int i;                                                                \
4907                                                                           \
4908     for (i = env->vstart; i < vl; i++) {                                  \
4909         if (!vm && !vext_elem_mask(v0, i)) {                              \
4910             continue;                                                     \
4911         }                                                                 \
4912         *((ETYPE *)vd + H(i)) = sum;                                      \
4913         if (vext_elem_mask(vs2, i)) {                                     \
4914             sum++;                                                        \
4915         }                                                                 \
4916     }                                                                     \
4917     env->vstart = 0;                                                      \
4918     /* set tail elements to 1s */                                         \
4919     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4920 }
4921 
4922 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4923 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4924 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4925 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
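
/*
 * Worked example (illustrative): for an unmasked viota.m with vl = 4 and
 * vs2 = {1, 0, 1, 1}, vd becomes {0, 1, 1, 2}: each destination element
 * receives the number of set mask bits strictly below its index.
 */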
4926 
4927 /* Vector Element Index Instruction */
4928 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4929 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4930 {                                                                         \
4931     uint32_t vm = vext_vm(desc);                                          \
4932     uint32_t vl = env->vl;                                                \
4933     uint32_t esz = sizeof(ETYPE);                                         \
4934     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4935     uint32_t vta = vext_vta(desc);                                        \
4936     int i;                                                                \
4937                                                                           \
4938     for (i = env->vstart; i < vl; i++) {                                  \
4939         if (!vm && !vext_elem_mask(v0, i)) {                              \
4940             continue;                                                     \
4941         }                                                                 \
4942         *((ETYPE *)vd + H(i)) = i;                                        \
4943     }                                                                     \
4944     env->vstart = 0;                                                      \
4945     /* set tail elements to 1s */                                         \
4946     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4947 }
4948 
4949 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4950 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4951 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4952 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4953 
4954 /*
4955  *** Vector Permutation Instructions
4956  */
4957 
4958 /* Vector Slide Instructions */
4959 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4960 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4961                   CPURISCVState *env, uint32_t desc)                      \
4962 {                                                                         \
4963     uint32_t vm = vext_vm(desc);                                          \
4964     uint32_t vl = env->vl;                                                \
4965     uint32_t esz = sizeof(ETYPE);                                         \
4966     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4967     uint32_t vta = vext_vta(desc);                                        \
4968     target_ulong offset = s1, i_min, i;                                   \
4969                                                                           \
4970     i_min = MAX(env->vstart, offset);                                     \
4971     for (i = i_min; i < vl; i++) {                                        \
4972         if (!vm && !vext_elem_mask(v0, i)) {                              \
4973             continue;                                                     \
4974         }                                                                 \
4975         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4976     }                                                                     \
    env->vstart = 0;                                                      \
4977     /* set tail elements to 1s */                                         \
4978     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4979 }
4980 
4981 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4982 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4983 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4984 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4985 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4986 
4987 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4988 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4989                   CPURISCVState *env, uint32_t desc)                      \
4990 {                                                                         \
4991     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4992     uint32_t vm = vext_vm(desc);                                          \
4993     uint32_t vl = env->vl;                                                \
4994     uint32_t esz = sizeof(ETYPE);                                         \
4995     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4996     uint32_t vta = vext_vta(desc);                                        \
4997     target_ulong i_max, i;                                                \
4998                                                                           \
4999     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5000     for (i = env->vstart; i < i_max; ++i) {                               \
5001         if (vm || vext_elem_mask(v0, i)) {                                \
5002             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
5003         }                                                                 \
5004     }                                                                     \
5005                                                                           \
5006     for (i = i_max; i < vl; ++i) {                                        \
5007         if (vm || vext_elem_mask(v0, i)) {                                \
5008             *((ETYPE *)vd + H(i)) = 0;                                    \
5009         }                                                                 \
5010     }                                                                     \
5011                                                                           \
5012     env->vstart = 0;                                                      \
5013     /* set tail elements to 1s */                                         \
5014     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5015 }
5016 
5017 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5018 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5019 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5020 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5021 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
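
/*
 * Worked example (illustrative): for an unmasked vslidedown.vx with vl = 8,
 * VLMAX = 8 and x[rs1] = 6, i_max evaluates to 2, so vd[0..1] = vs2[6..7]
 * and vd[2..7] are written with zeroes.  An offset of VLMAX or more zeroes
 * every body element.
 */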
5022 
5023 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5024 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5025                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5026 {                                                                           \
5027     typedef uint##BITWIDTH##_t ETYPE;                                       \
5028     uint32_t vm = vext_vm(desc);                                            \
5029     uint32_t vl = env->vl;                                                  \
5030     uint32_t esz = sizeof(ETYPE);                                           \
5031     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5032     uint32_t vta = vext_vta(desc);                                          \
5033     uint32_t i;                                                             \
5034                                                                             \
5035     for (i = env->vstart; i < vl; i++) {                                    \
5036         if (!vm && !vext_elem_mask(v0, i)) {                                \
5037             continue;                                                       \
5038         }                                                                   \
5039         if (i == 0) {                                                       \
5040             *((ETYPE *)vd + H(i)) = s1;                                     \
5041         } else {                                                            \
5042             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5043         }                                                                   \
5044     }                                                                       \
5045     env->vstart = 0;                                                        \
5046     /* set tail elements to 1s */                                           \
5047     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5048 }
5049 
5050 GEN_VEXT_VSLIE1UP(8,  H1)
5051 GEN_VEXT_VSLIE1UP(16, H2)
5052 GEN_VEXT_VSLIE1UP(32, H4)
5053 GEN_VEXT_VSLIE1UP(64, H8)
5054 
5055 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5056 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5057                   CPURISCVState *env, uint32_t desc)              \
5058 {                                                                 \
5059     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5060 }
5061 
5062 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5063 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5064 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5065 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5066 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5067 
5068 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5069 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5070                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5071 {                                                                             \
5072     typedef uint##BITWIDTH##_t ETYPE;                                         \
5073     uint32_t vm = vext_vm(desc);                                              \
5074     uint32_t vl = env->vl;                                                    \
5075     uint32_t esz = sizeof(ETYPE);                                             \
5076     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5077     uint32_t vta = vext_vta(desc);                                            \
5078     uint32_t i;                                                               \
5079                                                                               \
5080     for (i = env->vstart; i < vl; i++) {                                      \
5081         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5082             continue;                                                         \
5083         }                                                                     \
5084         if (i == vl - 1) {                                                    \
5085             *((ETYPE *)vd + H(i)) = s1;                                       \
5086         } else {                                                              \
5087             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5088         }                                                                     \
5089     }                                                                         \
5090     env->vstart = 0;                                                          \
5091     /* set tail elements to 1s */                                             \
5092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5093 }
5094 
5095 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5096 GEN_VEXT_VSLIDE1DOWN(16, H2)
5097 GEN_VEXT_VSLIDE1DOWN(32, H4)
5098 GEN_VEXT_VSLIDE1DOWN(64, H8)
5099 
5100 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5101 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5102                   CPURISCVState *env, uint32_t desc)              \
5103 {                                                                 \
5104     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5105 }
5106 
5107 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5108 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5109 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5110 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5111 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5112 
5113 /* Vector Floating-Point Slide Instructions */
5114 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5115 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5116                   CPURISCVState *env, uint32_t desc)          \
5117 {                                                             \
5118     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5119 }
5120 
5121 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5122 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5123 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5124 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5125 
5126 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5127 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5128                   CPURISCVState *env, uint32_t desc)          \
5129 {                                                             \
5130     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5131 }
5132 
5133 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5134 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5135 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5136 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5137 
5138 /* Vector Register Gather Instruction */
5139 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5140 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5141                   CPURISCVState *env, uint32_t desc)                      \
5142 {                                                                         \
5143     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5144     uint32_t vm = vext_vm(desc);                                          \
5145     uint32_t vl = env->vl;                                                \
5146     uint32_t esz = sizeof(TS2);                                           \
5147     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5148     uint32_t vta = vext_vta(desc);                                        \
5149     uint64_t index;                                                       \
5150     uint32_t i;                                                           \
5151                                                                           \
5152     for (i = env->vstart; i < vl; i++) {                                  \
5153         if (!vm && !vext_elem_mask(v0, i)) {                              \
5154             continue;                                                     \
5155         }                                                                 \
5156         index = *((TS1 *)vs1 + HS1(i));                                   \
5157         if (index >= vlmax) {                                             \
5158             *((TS2 *)vd + HS2(i)) = 0;                                    \
5159         } else {                                                          \
5160             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5161         }                                                                 \
5162     }                                                                     \
5163     env->vstart = 0;                                                      \
5164     /* set tail elements to 1s */                                         \
5165     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5166 }
5167 
5168 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5169 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5170 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5171 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5172 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5173 
5174 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5175 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5176 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5177 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
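
/*
 * Worked example (illustrative): with VLMAX = 8, vl = 3 and
 * vs1 = {3, 9, 0}, an unmasked vrgather.vv produces
 * vd = {vs2[3], 0, vs2[0]}; any index at or above VLMAX selects zero.
 * vrgatherei16 differs only in that its index elements are always 16 bits
 * wide, independent of SEW.
 */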
5178 
5179 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5180 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5181                   CPURISCVState *env, uint32_t desc)                      \
5182 {                                                                         \
5183     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5184     uint32_t vm = vext_vm(desc);                                          \
5185     uint32_t vl = env->vl;                                                \
5186     uint32_t esz = sizeof(ETYPE);                                         \
5187     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5188     uint32_t vta = vext_vta(desc);                                        \
5189     uint64_t index = s1;                                                  \
5190     uint32_t i;                                                           \
5191                                                                           \
5192     for (i = env->vstart; i < vl; i++) {                                  \
5193         if (!vm && !vext_elem_mask(v0, i)) {                              \
5194             continue;                                                     \
5195         }                                                                 \
5196         if (index >= vlmax) {                                             \
5197             *((ETYPE *)vd + H(i)) = 0;                                    \
5198         } else {                                                          \
5199             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5200         }                                                                 \
5201     }                                                                     \
5202     env->vstart = 0;                                                      \
5203     /* set tail elements to 1s */                                         \
5204     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5205 }
5206 
5207 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5208 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5209 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5210 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5211 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
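
/*
 * For illustration only: the .vx form gathers a single element, so every
 * active vd[i] receives the same value, vs2[x[rs1]] when the index is in
 * range and zero otherwise.  The names below are hypothetical.
 */
static inline void vrgather_vx_w_sketch(uint32_t *vd, uint64_t index,
                                        const uint32_t *vs2,
                                        uint32_t vl, uint32_t vlmax)
{
    uint32_t val = (index >= vlmax) ? 0 : vs2[index];
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = val;
    }
}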
5212 
5213 /* Vector Compress Instruction */
5214 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5215 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5216                   CPURISCVState *env, uint32_t desc)                      \
5217 {                                                                         \
5218     uint32_t vl = env->vl;                                                \
5219     uint32_t esz = sizeof(ETYPE);                                         \
5220     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5221     uint32_t vta = vext_vta(desc);                                        \
5222     uint32_t num = 0, i;                                                  \
5223                                                                           \
5224     for (i = env->vstart; i < vl; i++) {                                  \
5225         if (!vext_elem_mask(vs1, i)) {                                    \
5226             continue;                                                     \
5227         }                                                                 \
5228         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5229         num++;                                                            \
5230     }                                                                     \
5231     env->vstart = 0;                                                      \
5232     /* set tail elements to 1s */                                         \
5233     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5234 }
5235 
5236 /* Compress into vd the elements of vs2 whose mask bit in vs1 is set */
5237 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5238 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5239 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5240 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
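
/*
 * Sketch only: vcompress packs the elements of vs2 whose mask bit is set
 * into the low-numbered elements of vd, keeping their relative order.
 * The byte-per-element mask and the function name are hypothetical
 * simplifications of the packed mask register used above.
 */
static inline uint32_t vcompress_w_sketch(uint32_t *vd, const uint8_t *mask,
                                          const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (mask[i]) {
            vd[num++] = vs2[i];
        }
    }
    return num;    /* count of elements packed into vd */
}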
5241 
5242 /* Vector Whole Register Move */
5243 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5244 {
5245     /* EEW = SEW */
5246     uint32_t maxsz = simd_maxsz(desc);
5247     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5248     uint32_t startb = env->vstart * sewb;
5249     uint32_t i = startb;
5250 
5251     memcpy((uint8_t *)vd + H1(i),
5252            (uint8_t *)vs2 + H1(i),
5253            maxsz - startb);
5254 
5255     env->vstart = 0;
5256 }
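
/*
 * Sketch only, under the assumption that vstart may fall anywhere in the
 * register group: H1() is an identity on little-endian hosts, so the bulk
 * memcpy above is byte-exact there; on big-endian hosts the per-byte XOR
 * does not describe a contiguous range, so a host-endian-safe variant
 * would copy any leading partial 8-byte chunk byte by byte through H1()
 * and memcpy the aligned remainder without it, e.g.:
 */
#if 0   /* illustrative alternative with the same signature */
void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    uint32_t maxsz = simd_maxsz(desc);
    uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
    uint32_t i = env->vstart * sewb;

    /* Copy the leading bytes of a partially completed 8-byte chunk. */
    for (; i < maxsz && (i % 8) != 0; i++) {
        *((uint8_t *)vd + H1(i)) = *((uint8_t *)vs2 + H1(i));
    }
    /* Aligned chunks occupy the same host bytes in vd and vs2. */
    if (i < maxsz) {
        memcpy((uint8_t *)vd + i, (uint8_t *)vs2 + i, maxsz - i);
    }
    env->vstart = 0;
}
#endif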
5257 
5258 /* Vector Integer Extension */
5259 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5260 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5261                   CPURISCVState *env, uint32_t desc)             \
5262 {                                                                \
5263     uint32_t vl = env->vl;                                       \
5264     uint32_t vm = vext_vm(desc);                                 \
5265     uint32_t esz = sizeof(ETYPE);                                \
5266     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5267     uint32_t vta = vext_vta(desc);                               \
5268     uint32_t i;                                                  \
5269                                                                  \
5270     for (i = env->vstart; i < vl; i++) {                         \
5271         if (!vm && !vext_elem_mask(v0, i)) {                     \
5272             continue;                                            \
5273         }                                                        \
5274         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5275     }                                                            \
5276     env->vstart = 0;                                             \
5277     /* set tail elements to 1s */                                \
5278     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5279 }
5280 
5281 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5282 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5283 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5284 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5285 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5286 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5287 
5288 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5289 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5290 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5291 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5292 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5293 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
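
/*
 * Sketch only: each vf2/vf4/vf8 variant widens a source element to the
 * destination width, with the ordinary C integer conversion supplying the
 * zero or sign extension, just as the ETYPE/DTYPE assignment in
 * GEN_VEXT_INT_EXT does.  The function name is hypothetical.
 */
static inline void vsext_vf2_h_sketch(int16_t *vd, const int8_t *vs2,
                                      uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = vs2[i];    /* int8_t -> int16_t sign-extends */
    }
}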
5294