xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 355d5584)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
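
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): the fractional-LMUL legality check above.
 * For vlmul = 0b101..0b111 (LMUL = 1/8, 1/4, 1/2), ELEN >> (8 - vlmul)
 * is ELEN * LMUL, and the setting is reserved when SEW > ELEN * LMUL
 * or when vlmul = 0b100.
 */
static inline bool example_frac_lmul_ok(uint16_t sew, uint16_t elen,
                                        uint64_t vlmul)
{
    return vlmul != 4 && (elen >> (8 - vlmul)) >= sew;
}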
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that need a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
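
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): how the H*() fixups are used.  Reading
 * element @idx through H1() picks the same logical element on little-
 * and big-endian hosts, e.g. on a big-endian host byte 0 of a vector
 * register lives at offset H1(0) == 7 of the first 64-bit chunk.
 */
static inline uint8_t example_read_e8(const void *vreg, int idx)
{
    return ((const uint8_t *)vreg)[H1(idx)];
}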
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
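
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): vlmul is a 3-bit two's-complement field, so
 * sextract32() maps the fractional encodings to negative values,
 * e.g. 0b101 (LMUL = 1/8) becomes -3 while 0b011 (LMUL = 8) stays 3.
 */
static inline int32_t example_decode_vlmul(uint32_t vlmul)
{
    return sextract32(vlmul, 0, 3);   /* 5 -> -3, 6 -> -2, 7 -> -1 */
}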
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
140 /*
141  * Get the maximum number of elements that can be operated on.
142  *
143  * log2_esz: log2 of element size in bytes.
144  */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147     /*
148      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
149      * so vlen in bytes (vlenb) is encoded as maxsz.
150      */
151     uint32_t vlenb = simd_maxsz(desc);
152 
153     /* Return VLMAX */
154     int scale = vext_lmul(desc) - log2_esz;
155     return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
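
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper and its numbers are hypothetical): VLMAX = vlenb * LMUL / esz,
 * computed with shifts.  With VLEN = 128 (vlenb = 16), SEW = 16
 * (log2_esz = 1) and LMUL = 1/2 (lmul = -1), scale = -1 - 1 = -2 and
 * VLMAX = 16 >> 2 = 4.
 */
static inline uint32_t example_vlmax(uint32_t vlenb, int lmul, int log2_esz)
{
    int scale = lmul - log2_esz;

    return scale < 0 ? vlenb >> -scale : vlenb << scale;
}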
157 
158 /*
159  * Get the total number of elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
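
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper and its numbers are hypothetical): with EEW = 32 (esz = 4),
 * SEW = 8 (sew = 1) and LMUL = 1 (lmul = 0), EMUL = (EEW / SEW) * LMUL
 * = 4, so four registers are covered and the total (prestart + body +
 * tail) element count is (vlenb << 2) / 4 = vlenb.
 */
static inline uint32_t example_total_elems(uint32_t vlenb, uint32_t esz,
                                           uint32_t sew, int lmul)
{
    int emul = ctzl(esz) - ctzl(sew) + lmul;

    emul = emul < 0 ? 0 : emul;   /* fractional EMUL still owns one register */
    return (vlenb << emul) / esz;
}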
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks watchpoints before the real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
182  * In user mode, there is no watchpoint support now.
183  *
184  * It will trigger an exception if there is no mapping in the TLB
185  * and the page table walk can't fill the TLB entry. Then the guest
186  * software can return here after processing the exception, or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
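
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): how the page split above works out.
 * pagelen = -(addr | TARGET_PAGE_MASK) is the number of bytes left on
 * addr's page, so an access of @len bytes that crosses a page boundary
 * is probed in two pieces.
 */
static inline target_ulong example_bytes_on_first_page(target_ulong addr,
                                                       target_ulong len)
{
    target_ulong pagelen = -(addr | TARGET_PAGE_MASK);

    return MIN(pagelen, len);   /* the second probe covers len minus this */
}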
204 
205 /* set agnostic elements to 1s */
206 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
207                               uint32_t tot)
208 {
209     if (is_agnostic == 0) {
210         /* policy undisturbed */
211         return;
212     }
213     if (tot - cnt == 0) {
214         return;
215     }
216     memset(base + cnt, -1, tot - cnt);
217 }
218 
219 static inline void vext_set_elem_mask(void *v0, int index,
220                                       uint8_t value)
221 {
222     int idx = index / 64;
223     int pos = index % 64;
224     uint64_t old = ((uint64_t *)v0)[idx];
225     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227 
228 /*
229  * Earlier designs (pre-0.9) had a varying number of bits
230  * per mask value (MLEN). In the 0.9 design, MLEN=1.
231  * (Section 4.5)
232  */
233 static inline int vext_elem_mask(void *v0, int index)
234 {
235     int idx = index / 64;
236     int pos = index % 64;
237     return (((uint64_t *)v0)[idx] >> pos) & 1;
238 }
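
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): with MLEN = 1 a mask register is a plain bit
 * array packed into host-endian uint64_t words, so element i of v0 is
 * bit (i % 64) of word (i / 64).
 */
static inline int example_mask_bit(const uint64_t *v0, int i)
{
    return (v0[i / 64] >> (i % 64)) & 1;
}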
239 
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242                                uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }                                                          \
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 /*
271  *** stride: access vector elements from strided memory
272  */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275                  target_ulong stride, CPURISCVState *env,
276                  uint32_t desc, uint32_t vm,
277                  vext_ldst_elem_fn *ldst_elem,
278                  uint32_t log2_esz, uintptr_t ra)
279 {
280     uint32_t i, k;
281     uint32_t nf = vext_nf(desc);
282     uint32_t max_elems = vext_max_elems(desc, log2_esz);
283     uint32_t esz = 1 << log2_esz;
284     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285     uint32_t vta = vext_vta(desc);
286 
287     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
288         if (!vm && !vext_elem_mask(v0, i)) {
289             continue;
290         }
291 
292         k = 0;
293         while (k < nf) {
294             target_ulong addr = base + stride * i + (k << log2_esz);
295             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
296             k++;
297         }
298     }
299     env->vstart = 0;
300     /* set tail elements to 1s */
301     for (k = 0; k < nf; ++k) {
302         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
303                           (k * max_elems + max_elems) * esz);
304     }
305     if (nf * max_elems % total_elems != 0) {
306         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
307         uint32_t registers_used =
308             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
309         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
310                           registers_used * vlenb);
311     }
312 }
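
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper and its numbers are hypothetical): the address of field k of
 * segment i for a strided segment access, as computed in the loop
 * above.  With nf = 3, esz = 4 (log2_esz = 2) and stride = 64, segment
 * 2 touches base + 128, base + 132 and base + 136.
 */
static inline target_ulong example_stride_addr(target_ulong base,
                                               target_ulong stride,
                                               uint32_t i, uint32_t k,
                                               uint32_t log2_esz)
{
    return base + stride * i + (k << log2_esz);
}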
313 
314 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
315 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
316                   target_ulong stride, CPURISCVState *env,              \
317                   uint32_t desc)                                        \
318 {                                                                       \
319     uint32_t vm = vext_vm(desc);                                        \
320     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
321                      ctzl(sizeof(ETYPE)), GETPC());                     \
322 }
323 
324 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
325 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
326 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
327 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
328 
329 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
330 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
331                   target_ulong stride, CPURISCVState *env,              \
332                   uint32_t desc)                                        \
333 {                                                                       \
334     uint32_t vm = vext_vm(desc);                                        \
335     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
336                      ctzl(sizeof(ETYPE)), GETPC());                     \
337 }
338 
339 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
340 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
341 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
342 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
343 
344 /*
345  *** unit-stride: access elements stored contiguously in memory
346  */
347 
348 /* unmasked unit-stride load and store operations */
349 static void
350 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
351              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
352              uintptr_t ra)
353 {
354     uint32_t i, k;
355     uint32_t nf = vext_nf(desc);
356     uint32_t max_elems = vext_max_elems(desc, log2_esz);
357     uint32_t esz = 1 << log2_esz;
358     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
359     uint32_t vta = vext_vta(desc);
360 
361     /* load/store bytes from/to guest memory */
362     for (i = env->vstart; i < evl; i++, env->vstart++) {
363         k = 0;
364         while (k < nf) {
365             target_ulong addr = base + ((i * nf + k) << log2_esz);
366             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
367             k++;
368         }
369     }
370     env->vstart = 0;
371     /* set tail elements to 1s */
372     for (k = 0; k < nf; ++k) {
373         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
374                           (k * max_elems + max_elems) * esz);
375     }
376     if (nf * max_elems % total_elems != 0) {
377         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
378         uint32_t registers_used =
379             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
380         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
381                           registers_used * vlenb);
382     }
383 }
384 
385 /*
386  * A masked unit-stride load or store operation is a special case of a strided
387  * operation, with stride = NF * sizeof(ETYPE).
388  */
389 
390 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
391 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
392                          CPURISCVState *env, uint32_t desc)             \
393 {                                                                       \
394     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
395     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
396                      ctzl(sizeof(ETYPE)), GETPC());                     \
397 }                                                                       \
398                                                                         \
399 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
400                   CPURISCVState *env, uint32_t desc)                    \
401 {                                                                       \
402     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
403                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
404 }
405 
406 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
407 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
408 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
409 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
410 
411 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
412 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
413                          CPURISCVState *env, uint32_t desc)              \
414 {                                                                        \
415     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
416     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
417                      ctzl(sizeof(ETYPE)), GETPC());                      \
418 }                                                                        \
419                                                                          \
420 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
421                   CPURISCVState *env, uint32_t desc)                     \
422 {                                                                        \
423     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
424                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
425 }
426 
427 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
428 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
429 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
430 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
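
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): why the masked helpers above can reuse the
 * strided path.  With stride = NF * sizeof(ETYPE), the strided address
 * base + stride * i + k * esz equals the unit-stride address
 * base + (i * nf + k) * esz used by vext_ldst_us().
 */
static inline target_ulong example_us_addr(target_ulong base, uint32_t nf,
                                           uint32_t esz, uint32_t i,
                                           uint32_t k)
{
    return base + (i * nf + k) * esz;   /* == base + (nf * esz) * i + k * esz */
}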
431 
432 /*
433  *** unit-stride mask load and store, EEW = 1
434  */
435 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
436                     CPURISCVState *env, uint32_t desc)
437 {
438     /* evl = ceil(vl/8) */
439     uint8_t evl = (env->vl + 7) >> 3;
440     vext_ldst_us(vd, base, env, desc, lde_b,
441                  0, evl, GETPC());
442 }
443 
444 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
445                     CPURISCVState *env, uint32_t desc)
446 {
447     /* evl = ceil(vl/8) */
448     uint8_t evl = (env->vl + 7) >> 3;
449     vext_ldst_us(vd, base, env, desc, ste_b,
450                  0, evl, GETPC());
451 }
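
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): vlm.v and vsm.v transfer ceil(vl / 8) bytes
 * of mask, which is what the (vl + 7) >> 3 above computes, e.g. vl = 17
 * moves 3 bytes.
 */
static inline uint32_t example_mask_evl(uint32_t vl)
{
    return (vl + 7) >> 3;
}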
452 
453 /*
454  *** index: access vector elements from indexed memory
455  */
456 typedef target_ulong vext_get_index_addr(target_ulong base,
457         uint32_t idx, void *vs2);
458 
459 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
460 static target_ulong NAME(target_ulong base,            \
461                          uint32_t idx, void *vs2)      \
462 {                                                      \
463     return (base + *((ETYPE *)vs2 + H(idx)));          \
464 }
465 
466 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
467 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
468 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
469 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
470 
471 static inline void
472 vext_ldst_index(void *vd, void *v0, target_ulong base,
473                 void *vs2, CPURISCVState *env, uint32_t desc,
474                 vext_get_index_addr get_index_addr,
475                 vext_ldst_elem_fn *ldst_elem,
476                 uint32_t log2_esz, uintptr_t ra)
477 {
478     uint32_t i, k;
479     uint32_t nf = vext_nf(desc);
480     uint32_t vm = vext_vm(desc);
481     uint32_t max_elems = vext_max_elems(desc, log2_esz);
482     uint32_t esz = 1 << log2_esz;
483     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
484     uint32_t vta = vext_vta(desc);
485 
486     /* load/store bytes from/to guest memory */
487     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
488         if (!vm && !vext_elem_mask(v0, i)) {
489             continue;
490         }
491 
492         k = 0;
493         while (k < nf) {
494             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
495             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
496             k++;
497         }
498     }
499     env->vstart = 0;
500     /* set tail elements to 1s */
501     for (k = 0; k < nf; ++k) {
502         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
503                           (k * max_elems + max_elems) * esz);
504     }
505     if (nf * max_elems % total_elems != 0) {
506         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
507         uint32_t registers_used =
508             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
509         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
510                           registers_used * vlenb);
511     }
512 }
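
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): the effective address of field k of element
 * i for an indexed access, matching the loop above.  The byte offset
 * comes from element i of vs2, the field offset from k.
 */
static inline target_ulong example_index_addr(target_ulong base,
                                              target_ulong vs2_offset,
                                              uint32_t k, uint32_t log2_esz)
{
    return base + vs2_offset + (k << log2_esz);
}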
513 
514 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
515 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
516                   void *vs2, CPURISCVState *env, uint32_t desc)            \
517 {                                                                          \
518     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
519                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
520 }
521 
522 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
523 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
524 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
525 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
526 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
527 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
528 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
529 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
530 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
538 
539 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
540 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
541                   void *vs2, CPURISCVState *env, uint32_t desc)  \
542 {                                                                \
543     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
544                     STORE_FN, ctzl(sizeof(ETYPE)),               \
545                     GETPC());                                    \
546 }
547 
548 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
549 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
550 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
551 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
552 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
553 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
554 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
555 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
556 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
564 
565 /*
566  *** unit-stride fault-only-first load instructions
567  */
568 static inline void
569 vext_ldff(void *vd, void *v0, target_ulong base,
570           CPURISCVState *env, uint32_t desc,
571           vext_ldst_elem_fn *ldst_elem,
572           uint32_t log2_esz, uintptr_t ra)
573 {
574     void *host;
575     uint32_t i, k, vl = 0;
576     uint32_t nf = vext_nf(desc);
577     uint32_t vm = vext_vm(desc);
578     uint32_t max_elems = vext_max_elems(desc, log2_esz);
579     uint32_t esz = 1 << log2_esz;
580     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
581     uint32_t vta = vext_vta(desc);
582     target_ulong addr, offset, remain;
583 
584     /* probe every access */
585     for (i = env->vstart; i < env->vl; i++) {
586         if (!vm && !vext_elem_mask(v0, i)) {
587             continue;
588         }
589         addr = adjust_addr(env, base + i * (nf << log2_esz));
590         if (i == 0) {
591             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
592         } else {
593             /* if it triggers an exception, no need to check watchpoint */
594             remain = nf << log2_esz;
595             while (remain > 0) {
596                 offset = -(addr | TARGET_PAGE_MASK);
597                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
598                                          cpu_mmu_index(env, false));
599                 if (host) {
600 #ifdef CONFIG_USER_ONLY
601                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
602                         vl = i;
603                         goto ProbeSuccess;
604                     }
605 #else
606                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
607 #endif
608                 } else {
609                     vl = i;
610                     goto ProbeSuccess;
611                 }
612                 if (remain <= offset) {
613                     break;
614                 }
615                 remain -= offset;
616                 addr = adjust_addr(env, addr + offset);
617             }
618         }
619     }
620 ProbeSuccess:
621     /* load bytes from guest memory */
622     if (vl != 0) {
623         env->vl = vl;
624     }
625     for (i = env->vstart; i < env->vl; i++) {
626         k = 0;
627         if (!vm && !vext_elem_mask(v0, i)) {
628             continue;
629         }
630         while (k < nf) {
631             target_ulong addr = base + ((i * nf + k) << log2_esz);
632             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
633             k++;
634         }
635     }
636     env->vstart = 0;
637     /* set tail elements to 1s */
638     for (k = 0; k < nf; ++k) {
639         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
640                           (k * max_elems + max_elems) * esz);
641     }
642     if (nf * max_elems % total_elems != 0) {
643         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
644         uint32_t registers_used =
645             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
646         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
647                           registers_used * vlenb);
648     }
649 }
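
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): the architecturally visible effect of the
 * probe loop above.  Element 0 must be accessible (or the load traps);
 * if a later element i is the first inaccessible one, vl shrinks to i
 * and no exception is reported.
 */
static inline uint32_t example_ldff_new_vl(uint32_t vl, bool fault_found,
                                           uint32_t first_fault_idx)
{
    return fault_found ? first_fault_idx : vl;
}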
650 
651 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
652 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
653                   CPURISCVState *env, uint32_t desc)      \
654 {                                                         \
655     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
656               ctzl(sizeof(ETYPE)), GETPC());              \
657 }
658 
659 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
660 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
661 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
662 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
663 
664 #define DO_SWAP(N, M) (M)
665 #define DO_AND(N, M)  (N & M)
666 #define DO_XOR(N, M)  (N ^ M)
667 #define DO_OR(N, M)   (N | M)
668 #define DO_ADD(N, M)  (N + M)
669 
670 /* Signed min/max */
671 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
672 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
673 
674 /* Unsigned min/max */
675 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
676 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
677 
678 /*
679  *** load and store whole register instructions
680  */
681 static void
682 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
683                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
684 {
685     uint32_t i, k, off, pos;
686     uint32_t nf = vext_nf(desc);
687     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
688     uint32_t max_elems = vlenb >> log2_esz;
689 
690     k = env->vstart / max_elems;
691     off = env->vstart % max_elems;
692 
693     if (off) {
694         /* load/store the rest of the current segment, pointed to by vstart */
695         for (pos = off; pos < max_elems; pos++, env->vstart++) {
696             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
697             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
698         }
699         k++;
700     }
701 
702     /* load/store elements for rest of segments */
703     for (; k < nf; k++) {
704         for (i = 0; i < max_elems; i++, env->vstart++) {
705             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
706             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
707         }
708     }
709 
710     env->vstart = 0;
711 }
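
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): whole-register accesses ignore vl and vtype;
 * NF registers of vlenb bytes always transfer nf * vlenb bytes, and the
 * k/off split above only serves to resume at env->vstart.
 */
static inline uint32_t example_whole_reg_bytes(uint32_t nf, uint32_t vlenb)
{
    return nf * vlenb;
}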
712 
713 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
714 void HELPER(NAME)(void *vd, target_ulong base,       \
715                   CPURISCVState *env, uint32_t desc) \
716 {                                                    \
717     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
718                     ctzl(sizeof(ETYPE)), GETPC());   \
719 }
720 
721 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
722 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
723 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
724 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
725 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
726 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
727 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
728 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
729 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
730 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
731 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
732 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
733 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
734 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
735 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
736 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
737 
738 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
739 void HELPER(NAME)(void *vd, target_ulong base,       \
740                   CPURISCVState *env, uint32_t desc) \
741 {                                                    \
742     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
743                     ctzl(sizeof(ETYPE)), GETPC());   \
744 }
745 
746 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
747 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
748 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
749 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
750 
751 /*
752  *** Vector Integer Arithmetic Instructions
753  */
754 
755 /* expand macro args before macro */
756 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
757 
758 /* (TD, T1, T2, TX1, TX2) */
759 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
760 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
761 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
762 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
763 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
764 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
765 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
766 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
767 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
768 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
769 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
770 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
771 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
772 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
773 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
774 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
775 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
776 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
777 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
778 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
779 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
780 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
781 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
782 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
783 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
784 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
785 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
786 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
787 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
788 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
789 
790 /* operation of two vector elements */
791 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
792 
793 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
794 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
795 {                                                               \
796     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
797     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
798     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
799 }
800 #define DO_SUB(N, M) (N - M)
801 #define DO_RSUB(N, M) (M - N)
802 
803 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
804 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
805 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
806 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
807 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
808 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
809 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
810 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
811 
812 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
813                        CPURISCVState *env, uint32_t desc,
814                        opivv2_fn *fn, uint32_t esz)
815 {
816     uint32_t vm = vext_vm(desc);
817     uint32_t vl = env->vl;
818     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
819     uint32_t vta = vext_vta(desc);
820     uint32_t vma = vext_vma(desc);
821     uint32_t i;
822 
823     for (i = env->vstart; i < vl; i++) {
824         if (!vm && !vext_elem_mask(v0, i)) {
825             /* set masked-off elements to 1s */
826             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
827             continue;
828         }
829         fn(vd, vs1, vs2, i);
830     }
831     env->vstart = 0;
832     /* set tail elements to 1s */
833     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
834 }
835 
836 /* generate the helpers for OPIVV */
837 #define GEN_VEXT_VV(NAME, ESZ)                            \
838 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
839                   void *vs2, CPURISCVState *env,          \
840                   uint32_t desc)                          \
841 {                                                         \
842     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
843                do_##NAME, ESZ);                           \
844 }
845 
846 GEN_VEXT_VV(vadd_vv_b, 1)
847 GEN_VEXT_VV(vadd_vv_h, 2)
848 GEN_VEXT_VV(vadd_vv_w, 4)
849 GEN_VEXT_VV(vadd_vv_d, 8)
850 GEN_VEXT_VV(vsub_vv_b, 1)
851 GEN_VEXT_VV(vsub_vv_h, 2)
852 GEN_VEXT_VV(vsub_vv_w, 4)
853 GEN_VEXT_VV(vsub_vv_d, 8)
854 
855 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
856 
857 /*
858  * (T1)s1 gives the real operand type.
859  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
860  */
861 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
862 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
863 {                                                                   \
864     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
865     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
866 }
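
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): the double cast in OPIVX2 for a widening op
 * such as vwadd_vx_b, where T1 = int8_t and TX1 = int16_t.  The scalar
 * is first truncated to the source element width and then sign-extended
 * to the operation width.
 */
static inline int16_t example_widen_scalar_b(target_long s1)
{
    return (int16_t)(int8_t)s1;   /* (TX1)(T1)s1 */
}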
867 
868 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
869 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
870 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
871 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
872 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
873 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
874 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
875 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
876 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
877 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
878 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
879 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
880 
881 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
882                        CPURISCVState *env, uint32_t desc,
883                        opivx2_fn fn, uint32_t esz)
884 {
885     uint32_t vm = vext_vm(desc);
886     uint32_t vl = env->vl;
887     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
888     uint32_t vta = vext_vta(desc);
889     uint32_t i;
890 
891     for (i = env->vstart; i < vl; i++) {
892         if (!vm && !vext_elem_mask(v0, i)) {
893             continue;
894         }
895         fn(vd, s1, vs2, i);
896     }
897     env->vstart = 0;
898     /* set tail elements to 1s */
899     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
900 }
901 
902 /* generate the helpers for OPIVX */
903 #define GEN_VEXT_VX(NAME, ESZ)                            \
904 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
905                   void *vs2, CPURISCVState *env,          \
906                   uint32_t desc)                          \
907 {                                                         \
908     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
909                do_##NAME, ESZ);                           \
910 }
911 
912 GEN_VEXT_VX(vadd_vx_b, 1)
913 GEN_VEXT_VX(vadd_vx_h, 2)
914 GEN_VEXT_VX(vadd_vx_w, 4)
915 GEN_VEXT_VX(vadd_vx_d, 8)
916 GEN_VEXT_VX(vsub_vx_b, 1)
917 GEN_VEXT_VX(vsub_vx_h, 2)
918 GEN_VEXT_VX(vsub_vx_w, 4)
919 GEN_VEXT_VX(vsub_vx_d, 8)
920 GEN_VEXT_VX(vrsub_vx_b, 1)
921 GEN_VEXT_VX(vrsub_vx_h, 2)
922 GEN_VEXT_VX(vrsub_vx_w, 4)
923 GEN_VEXT_VX(vrsub_vx_d, 8)
924 
925 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
926 {
927     intptr_t oprsz = simd_oprsz(desc);
928     intptr_t i;
929 
930     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
931         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
932     }
933 }
934 
935 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
936 {
937     intptr_t oprsz = simd_oprsz(desc);
938     intptr_t i;
939 
940     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
941         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
942     }
943 }
944 
945 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
946 {
947     intptr_t oprsz = simd_oprsz(desc);
948     intptr_t i;
949 
950     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
951         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
952     }
953 }
954 
955 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
956 {
957     intptr_t oprsz = simd_oprsz(desc);
958     intptr_t i;
959 
960     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
961         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
962     }
963 }
964 
965 /* Vector Widening Integer Add/Subtract */
966 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
967 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
968 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
969 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
970 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
971 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
972 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
973 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
974 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
975 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
976 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
977 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
978 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
979 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
980 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
981 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
982 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
983 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
984 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
985 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
986 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
987 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
988 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
989 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
990 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
991 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
992 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
993 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
994 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
995 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
996 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
997 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
998 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
999 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1000 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1001 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1002 GEN_VEXT_VV(vwaddu_vv_b, 2)
1003 GEN_VEXT_VV(vwaddu_vv_h, 4)
1004 GEN_VEXT_VV(vwaddu_vv_w, 8)
1005 GEN_VEXT_VV(vwsubu_vv_b, 2)
1006 GEN_VEXT_VV(vwsubu_vv_h, 4)
1007 GEN_VEXT_VV(vwsubu_vv_w, 8)
1008 GEN_VEXT_VV(vwadd_vv_b, 2)
1009 GEN_VEXT_VV(vwadd_vv_h, 4)
1010 GEN_VEXT_VV(vwadd_vv_w, 8)
1011 GEN_VEXT_VV(vwsub_vv_b, 2)
1012 GEN_VEXT_VV(vwsub_vv_h, 4)
1013 GEN_VEXT_VV(vwsub_vv_w, 8)
1014 GEN_VEXT_VV(vwaddu_wv_b, 2)
1015 GEN_VEXT_VV(vwaddu_wv_h, 4)
1016 GEN_VEXT_VV(vwaddu_wv_w, 8)
1017 GEN_VEXT_VV(vwsubu_wv_b, 2)
1018 GEN_VEXT_VV(vwsubu_wv_h, 4)
1019 GEN_VEXT_VV(vwsubu_wv_w, 8)
1020 GEN_VEXT_VV(vwadd_wv_b, 2)
1021 GEN_VEXT_VV(vwadd_wv_h, 4)
1022 GEN_VEXT_VV(vwadd_wv_w, 8)
1023 GEN_VEXT_VV(vwsub_wv_b, 2)
1024 GEN_VEXT_VV(vwsub_wv_h, 4)
1025 GEN_VEXT_VV(vwsub_wv_w, 8)
1026 
1027 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1028 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1029 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1030 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1031 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1032 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1033 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1034 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1035 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1036 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1037 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1038 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1039 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1040 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1041 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1042 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1043 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1044 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1045 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1046 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1047 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1048 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1049 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1050 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1051 GEN_VEXT_VX(vwaddu_vx_b, 2)
1052 GEN_VEXT_VX(vwaddu_vx_h, 4)
1053 GEN_VEXT_VX(vwaddu_vx_w, 8)
1054 GEN_VEXT_VX(vwsubu_vx_b, 2)
1055 GEN_VEXT_VX(vwsubu_vx_h, 4)
1056 GEN_VEXT_VX(vwsubu_vx_w, 8)
1057 GEN_VEXT_VX(vwadd_vx_b, 2)
1058 GEN_VEXT_VX(vwadd_vx_h, 4)
1059 GEN_VEXT_VX(vwadd_vx_w, 8)
1060 GEN_VEXT_VX(vwsub_vx_b, 2)
1061 GEN_VEXT_VX(vwsub_vx_h, 4)
1062 GEN_VEXT_VX(vwsub_vx_w, 8)
1063 GEN_VEXT_VX(vwaddu_wx_b, 2)
1064 GEN_VEXT_VX(vwaddu_wx_h, 4)
1065 GEN_VEXT_VX(vwaddu_wx_w, 8)
1066 GEN_VEXT_VX(vwsubu_wx_b, 2)
1067 GEN_VEXT_VX(vwsubu_wx_h, 4)
1068 GEN_VEXT_VX(vwsubu_wx_w, 8)
1069 GEN_VEXT_VX(vwadd_wx_b, 2)
1070 GEN_VEXT_VX(vwadd_wx_h, 4)
1071 GEN_VEXT_VX(vwadd_wx_w, 8)
1072 GEN_VEXT_VX(vwsub_wx_b, 2)
1073 GEN_VEXT_VX(vwsub_wx_h, 4)
1074 GEN_VEXT_VX(vwsub_wx_w, 8)
1075 
1076 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1077 #define DO_VADC(N, M, C) (N + M + C)
1078 #define DO_VSBC(N, M, C) (N - M - C)
1079 
1080 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1081 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1082                   CPURISCVState *env, uint32_t desc)          \
1083 {                                                             \
1084     uint32_t vl = env->vl;                                    \
1085     uint32_t esz = sizeof(ETYPE);                             \
1086     uint32_t total_elems =                                    \
1087         vext_get_total_elems(env, desc, esz);                 \
1088     uint32_t vta = vext_vta(desc);                            \
1089     uint32_t i;                                               \
1090                                                               \
1091     for (i = env->vstart; i < vl; i++) {                      \
1092         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1093         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1094         ETYPE carry = vext_elem_mask(v0, i);                  \
1095                                                               \
1096         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1097     }                                                         \
1098     env->vstart = 0;                                          \
1099     /* set tail elements to 1s */                             \
1100     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1101 }
1102 
1103 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1104 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1105 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1106 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1107 
1108 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1109 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1110 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1111 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1112 
1113 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1114 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1115                   CPURISCVState *env, uint32_t desc)                     \
1116 {                                                                        \
1117     uint32_t vl = env->vl;                                               \
1118     uint32_t esz = sizeof(ETYPE);                                        \
1119     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1120     uint32_t vta = vext_vta(desc);                                       \
1121     uint32_t i;                                                          \
1122                                                                          \
1123     for (i = env->vstart; i < vl; i++) {                                 \
1124         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1125         ETYPE carry = vext_elem_mask(v0, i);                             \
1126                                                                          \
1127         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1128     }                                                                    \
1129     env->vstart = 0;                                                     \
1130     /* set tail elements to 1s */                                        \
1131     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1132 }
1133 
1134 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1135 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1136 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1137 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1138 
1139 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1140 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1141 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1142 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1143 
1144 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1145                           (__typeof(N))(N + M) < N)
1146 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
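
/*
 * Illustrative sketch, not part of the upstream file (the example_*
 * helper is hypothetical): DO_MADC detects unsigned carry-out without a
 * wider type.  Without carry-in, N + M carries out exactly when the
 * truncated sum is smaller than N; with carry-in, N + M + 1 carries out
 * when the truncated sum is less than or equal to N.
 */
static inline bool example_carry_out_u8(uint8_t n, uint8_t m, bool cin)
{
    return cin ? (uint8_t)(n + m + 1) <= n : (uint8_t)(n + m) < n;
}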
1147 
1148 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1149 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1150                   CPURISCVState *env, uint32_t desc)          \
1151 {                                                             \
1152     uint32_t vl = env->vl;                                    \
1153     uint32_t vm = vext_vm(desc);                              \
1154     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1155     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1156     uint32_t i;                                               \
1157                                                               \
1158     for (i = env->vstart; i < vl; i++) {                      \
1159         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1160         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1161         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1162         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1163     }                                                         \
1164     env->vstart = 0;                                          \
1165     /* mask destination register is always tail-agnostic */   \
1166     /* set tail elements to 1s */                             \
1167     if (vta_all_1s) {                                         \
1168         for (; i < total_elems; i++) {                        \
1169             vext_set_elem_mask(vd, i, 1);                     \
1170         }                                                     \
1171     }                                                         \
1172 }
1173 
1174 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1175 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1176 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1177 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1178 
1179 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1180 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1181 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1182 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1183 
1184 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1185 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1186                   void *vs2, CPURISCVState *env, uint32_t desc) \
1187 {                                                               \
1188     uint32_t vl = env->vl;                                      \
1189     uint32_t vm = vext_vm(desc);                                \
1190     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1191     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1192     uint32_t i;                                                 \
1193                                                                 \
1194     for (i = env->vstart; i < vl; i++) {                        \
1195         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1196         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1197         vext_set_elem_mask(vd, i,                               \
1198                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1199     }                                                           \
1200     env->vstart = 0;                                            \
1201     /* mask destination register is always tail-agnostic */     \
1202     /* set tail elements to 1s */                               \
1203     if (vta_all_1s) {                                           \
1204         for (; i < total_elems; i++) {                          \
1205             vext_set_elem_mask(vd, i, 1);                       \
1206         }                                                       \
1207     }                                                           \
1208 }
1209 
1210 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1211 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1212 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1213 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1214 
1215 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1216 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1217 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1218 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1219 
1220 /* Vector Bitwise Logical Instructions */
1221 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1222 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1223 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1224 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1225 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1226 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1227 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1228 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1229 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1230 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1231 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1232 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1233 GEN_VEXT_VV(vand_vv_b, 1)
1234 GEN_VEXT_VV(vand_vv_h, 2)
1235 GEN_VEXT_VV(vand_vv_w, 4)
1236 GEN_VEXT_VV(vand_vv_d, 8)
1237 GEN_VEXT_VV(vor_vv_b, 1)
1238 GEN_VEXT_VV(vor_vv_h, 2)
1239 GEN_VEXT_VV(vor_vv_w, 4)
1240 GEN_VEXT_VV(vor_vv_d, 8)
1241 GEN_VEXT_VV(vxor_vv_b, 1)
1242 GEN_VEXT_VV(vxor_vv_h, 2)
1243 GEN_VEXT_VV(vxor_vv_w, 4)
1244 GEN_VEXT_VV(vxor_vv_d, 8)
1245 
1246 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1247 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1248 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1249 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1250 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1251 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1252 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1253 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1254 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1255 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1256 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1257 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1258 GEN_VEXT_VX(vand_vx_b, 1)
1259 GEN_VEXT_VX(vand_vx_h, 2)
1260 GEN_VEXT_VX(vand_vx_w, 4)
1261 GEN_VEXT_VX(vand_vx_d, 8)
1262 GEN_VEXT_VX(vor_vx_b, 1)
1263 GEN_VEXT_VX(vor_vx_h, 2)
1264 GEN_VEXT_VX(vor_vx_w, 4)
1265 GEN_VEXT_VX(vor_vx_d, 8)
1266 GEN_VEXT_VX(vxor_vx_b, 1)
1267 GEN_VEXT_VX(vxor_vx_h, 2)
1268 GEN_VEXT_VX(vxor_vx_w, 4)
1269 GEN_VEXT_VX(vxor_vx_d, 8)
1270 
1271 /* Vector Single-Width Bit Shift Instructions */
1272 #define DO_SLL(N, M)  (N << (M))
1273 #define DO_SRL(N, M)  (N >> (M))
1274 
1275 /* generate the helpers for shift instructions with two vector operators */
1276 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1277 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1278                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1279 {                                                                         \
1280     uint32_t vm = vext_vm(desc);                                          \
1281     uint32_t vl = env->vl;                                                \
1282     uint32_t esz = sizeof(TS1);                                           \
1283     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1284     uint32_t vta = vext_vta(desc);                                        \
1285     uint32_t i;                                                           \
1286                                                                           \
1287     for (i = env->vstart; i < vl; i++) {                                  \
1288         if (!vm && !vext_elem_mask(v0, i)) {                              \
1289             continue;                                                     \
1290         }                                                                 \
1291         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1292         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1293         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1294     }                                                                     \
1295     env->vstart = 0;                                                      \
1296     /* set tail elements to 1s */                                         \
1297     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1298 }
1299 
1300 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1301 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1302 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1303 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1304 
1305 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1306 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1307 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1308 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1309 
1310 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1311 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1312 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1313 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
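/*
 * Illustrative sketch (not part of the build; the example_* name is
 * hypothetical): vsra reuses DO_SRL because the macro expands to a plain
 * C ">>", so arithmetic vs. logical behaviour is selected purely by the
 * signedness of TS2.  On the compilers QEMU supports, ">>" on a negative
 * signed value is an arithmetic (sign-propagating) shift, and MASK clamps
 * the shift amount to SEW-1.
 */
#if 0 /* illustrative only */
static int8_t example_vsra_element(int8_t s2, uint8_t s1)
{
    /* e.g. s2 = -128 (0x80), s1 = 1  ->  -64 (0xc0) */
    return s2 >> (s1 & 0x7);
}
#endif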
1314 
1315 /* generate the helpers for shift instructions with one vector and one scalar */
1316 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1317 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1318         void *vs2, CPURISCVState *env, uint32_t desc)       \
1319 {                                                           \
1320     uint32_t vm = vext_vm(desc);                            \
1321     uint32_t vl = env->vl;                                  \
1322     uint32_t esz = sizeof(TD);                              \
1323     uint32_t total_elems =                                  \
1324         vext_get_total_elems(env, desc, esz);               \
1325     uint32_t vta = vext_vta(desc);                          \
1326     uint32_t i;                                             \
1327                                                             \
1328     for (i = env->vstart; i < vl; i++) {                    \
1329         if (!vm && !vext_elem_mask(v0, i)) {                \
1330             continue;                                       \
1331         }                                                   \
1332         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1333         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1334     }                                                       \
1335     env->vstart = 0;                                        \
1336     /* set tail elements to 1s */                           \
1337     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1338 }
1339 
1340 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1341 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1342 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1343 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1344 
1345 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1346 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1347 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1348 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1349 
1350 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1351 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1352 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1353 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1354 
1355 /* Vector Narrowing Integer Right Shift Instructions */
1356 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1357 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1358 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1359 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1360 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1361 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1362 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1363 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1364 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1365 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1366 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1367 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
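/*
 * Illustrative sketch (not part of the build; the example_* name is
 * hypothetical): the narrowing forms reuse the same generators with a
 * 2*SEW source type (TS2), an SEW destination type (TS1/TD) and
 * MASK = 2*SEW - 1, so the shift amount spans the wider source width and
 * the result is truncated when it is stored.
 */
#if 0 /* illustrative only */
static uint8_t example_vnsrl_element(uint16_t s2, uint8_t s1)
{
    /* e.g. s2 = 0xabcd, s1 = 8  ->  (0xabcd >> 8) truncated = 0xab */
    return (uint8_t)(s2 >> (s1 & 0xf));
}
#endif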
1368 
1369 /* Vector Integer Comparison Instructions */
1370 #define DO_MSEQ(N, M) (N == M)
1371 #define DO_MSNE(N, M) (N != M)
1372 #define DO_MSLT(N, M) (N < M)
1373 #define DO_MSLE(N, M) (N <= M)
1374 #define DO_MSGT(N, M) (N > M)
1375 
1376 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1377 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1378                   CPURISCVState *env, uint32_t desc)          \
1379 {                                                             \
1380     uint32_t vm = vext_vm(desc);                              \
1381     uint32_t vl = env->vl;                                    \
1382     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1383     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1384     uint32_t i;                                               \
1385                                                               \
1386     for (i = env->vstart; i < vl; i++) {                      \
1387         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1388         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1389         if (!vm && !vext_elem_mask(v0, i)) {                  \
1390             continue;                                         \
1391         }                                                     \
1392         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1393     }                                                         \
1394     env->vstart = 0;                                          \
1395     /* the mask destination register is always tail-agnostic */  \
1396     /* set tail elements to 1s */                             \
1397     if (vta_all_1s) {                                         \
1398         for (; i < total_elems; i++) {                        \
1399             vext_set_elem_mask(vd, i, 1);                     \
1400         }                                                     \
1401     }                                                         \
1402 }
1403 
1404 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1405 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1406 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1407 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1408 
1409 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1410 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1411 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1412 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1413 
1414 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1415 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1416 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1417 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1418 
1419 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1420 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1421 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1422 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1423 
1424 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1425 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1426 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1427 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1428 
1429 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1430 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1431 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1432 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1433 
1434 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1435 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1436                   CPURISCVState *env, uint32_t desc)                \
1437 {                                                                   \
1438     uint32_t vm = vext_vm(desc);                                    \
1439     uint32_t vl = env->vl;                                          \
1440     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1441     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1442     uint32_t i;                                                     \
1443                                                                     \
1444     for (i = env->vstart; i < vl; i++) {                            \
1445         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1446         if (!vm && !vext_elem_mask(v0, i)) {                        \
1447             continue;                                               \
1448         }                                                           \
1449         vext_set_elem_mask(vd, i,                                   \
1450                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1451     }                                                               \
1452     env->vstart = 0;                                                \
1453     /* the mask destination register is always tail-agnostic */        \
1454     /* set tail elements to 1s */                                   \
1455     if (vta_all_1s) {                                               \
1456         for (; i < total_elems; i++) {                              \
1457             vext_set_elem_mask(vd, i, 1);                           \
1458         }                                                           \
1459     }                                                               \
1460 }
1461 
1462 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1463 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1464 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1465 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1466 
1467 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1468 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1469 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1470 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1471 
1472 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1473 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1474 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1475 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1476 
1477 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1478 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1479 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1480 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1481 
1482 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1483 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1484 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1485 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1486 
1487 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1488 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1489 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1490 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1491 
1492 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1493 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1494 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1495 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1496 
1497 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1498 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1499 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1500 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1501 
1502 /* Vector Integer Min/Max Instructions */
1503 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1504 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1505 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1506 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1507 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1508 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1509 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1510 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1511 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1512 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1513 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1514 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1515 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1516 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1517 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1518 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1519 GEN_VEXT_VV(vminu_vv_b, 1)
1520 GEN_VEXT_VV(vminu_vv_h, 2)
1521 GEN_VEXT_VV(vminu_vv_w, 4)
1522 GEN_VEXT_VV(vminu_vv_d, 8)
1523 GEN_VEXT_VV(vmin_vv_b, 1)
1524 GEN_VEXT_VV(vmin_vv_h, 2)
1525 GEN_VEXT_VV(vmin_vv_w, 4)
1526 GEN_VEXT_VV(vmin_vv_d, 8)
1527 GEN_VEXT_VV(vmaxu_vv_b, 1)
1528 GEN_VEXT_VV(vmaxu_vv_h, 2)
1529 GEN_VEXT_VV(vmaxu_vv_w, 4)
1530 GEN_VEXT_VV(vmaxu_vv_d, 8)
1531 GEN_VEXT_VV(vmax_vv_b, 1)
1532 GEN_VEXT_VV(vmax_vv_h, 2)
1533 GEN_VEXT_VV(vmax_vv_w, 4)
1534 GEN_VEXT_VV(vmax_vv_d, 8)
1535 
1536 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1537 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1538 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1539 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1540 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1541 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1542 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1543 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1544 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1545 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1546 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1547 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1548 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1549 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1550 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1551 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1552 GEN_VEXT_VX(vminu_vx_b, 1)
1553 GEN_VEXT_VX(vminu_vx_h, 2)
1554 GEN_VEXT_VX(vminu_vx_w, 4)
1555 GEN_VEXT_VX(vminu_vx_d, 8)
1556 GEN_VEXT_VX(vmin_vx_b, 1)
1557 GEN_VEXT_VX(vmin_vx_h, 2)
1558 GEN_VEXT_VX(vmin_vx_w, 4)
1559 GEN_VEXT_VX(vmin_vx_d, 8)
1560 GEN_VEXT_VX(vmaxu_vx_b, 1)
1561 GEN_VEXT_VX(vmaxu_vx_h, 2)
1562 GEN_VEXT_VX(vmaxu_vx_w, 4)
1563 GEN_VEXT_VX(vmaxu_vx_d, 8)
1564 GEN_VEXT_VX(vmax_vx_b, 1)
1565 GEN_VEXT_VX(vmax_vx_h, 2)
1566 GEN_VEXT_VX(vmax_vx_w, 4)
1567 GEN_VEXT_VX(vmax_vx_d, 8)
1568 
1569 /* Vector Single-Width Integer Multiply Instructions */
1570 #define DO_MUL(N, M) (N * M)
1571 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1572 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1573 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1574 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1575 GEN_VEXT_VV(vmul_vv_b, 1)
1576 GEN_VEXT_VV(vmul_vv_h, 2)
1577 GEN_VEXT_VV(vmul_vv_w, 4)
1578 GEN_VEXT_VV(vmul_vv_d, 8)
1579 
1580 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1581 {
1582     return (int16_t)s2 * (int16_t)s1 >> 8;
1583 }
1584 
1585 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1586 {
1587     return (int32_t)s2 * (int32_t)s1 >> 16;
1588 }
1589 
1590 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1591 {
1592     return (int64_t)s2 * (int64_t)s1 >> 32;
1593 }
1594 
1595 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1596 {
1597     uint64_t hi_64, lo_64;
1598 
1599     muls64(&lo_64, &hi_64, s1, s2);
1600     return hi_64;
1601 }
1602 
1603 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1604 {
1605     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1606 }
1607 
1608 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1609 {
1610     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1611 }
1612 
1613 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1614 {
1615     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1616 }
1617 
1618 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1619 {
1620     uint64_t hi_64, lo_64;
1621 
1622     mulu64(&lo_64, &hi_64, s2, s1);
1623     return hi_64;
1624 }
1625 
1626 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1627 {
1628     return (int16_t)s2 * (uint16_t)s1 >> 8;
1629 }
1630 
1631 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1632 {
1633     return (int32_t)s2 * (uint32_t)s1 >> 16;
1634 }
1635 
1636 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1637 {
1638     return (int64_t)s2 * (uint64_t)s1 >> 32;
1639 }
1640 
1641 /*
1642  * Let  A = signed operand (s2),
1643  *      B = unsigned operand (s1),
1644  *      P = mulu64(A, B), the product of the raw 64-bit patterns,
1645  *      SP = A * B, the desired signed-by-unsigned product
1646  * THEN
1647  *      IF A >= 0
1648  *          the bit pattern of A is its value, so SP = P
1649  *      ELSE (A < 0)
1650  *          mulu64 sees the bit pattern of A as A + 2 ** 64, so
1651  *          P = (A + 2 ** 64) * B
1652  *            = A * B + 2 ** 64 * B
1653  *            = SP + 2 ** 64 * B
1654  *          hence SP = P - 2 ** 64 * B, i.e. the high 64 bits of P
1655  *          are too large by exactly B
1656  * THEN
1657  *      HI_P -= (A < 0 ? B : 0)
1658  */
1659 
1660 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1661 {
1662     uint64_t hi_64, lo_64;
1663 
1664     mulu64(&lo_64, &hi_64, s2, s1);
1665 
1666     hi_64 -= s2 < 0 ? s1 : 0;
1667     return hi_64;
1668 }
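
/*
 * Worked example for the correction above (illustrative): s2 = -1,
 * s1 = 3.  mulu64 sees s2 as 2**64 - 1, so P = 3 * 2**64 - 3, i.e.
 * hi_64 = 2 and lo_64 = 2**64 - 3.  Subtracting s1 (= 3) from hi_64
 * gives -1, the correct upper half of the signed product -3.
 */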
1669 
1670 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1671 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1672 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1673 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1674 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1675 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1676 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1677 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1678 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1679 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1680 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1681 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1682 GEN_VEXT_VV(vmulh_vv_b, 1)
1683 GEN_VEXT_VV(vmulh_vv_h, 2)
1684 GEN_VEXT_VV(vmulh_vv_w, 4)
1685 GEN_VEXT_VV(vmulh_vv_d, 8)
1686 GEN_VEXT_VV(vmulhu_vv_b, 1)
1687 GEN_VEXT_VV(vmulhu_vv_h, 2)
1688 GEN_VEXT_VV(vmulhu_vv_w, 4)
1689 GEN_VEXT_VV(vmulhu_vv_d, 8)
1690 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1691 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1692 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1693 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1694 
1695 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1696 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1697 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1698 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1699 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1700 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1701 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1702 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1703 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1704 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1705 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1706 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1707 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1708 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1709 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1710 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1711 GEN_VEXT_VX(vmul_vx_b, 1)
1712 GEN_VEXT_VX(vmul_vx_h, 2)
1713 GEN_VEXT_VX(vmul_vx_w, 4)
1714 GEN_VEXT_VX(vmul_vx_d, 8)
1715 GEN_VEXT_VX(vmulh_vx_b, 1)
1716 GEN_VEXT_VX(vmulh_vx_h, 2)
1717 GEN_VEXT_VX(vmulh_vx_w, 4)
1718 GEN_VEXT_VX(vmulh_vx_d, 8)
1719 GEN_VEXT_VX(vmulhu_vx_b, 1)
1720 GEN_VEXT_VX(vmulhu_vx_h, 2)
1721 GEN_VEXT_VX(vmulhu_vx_w, 4)
1722 GEN_VEXT_VX(vmulhu_vx_d, 8)
1723 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1724 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1725 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1726 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1727 
1728 /* Vector Integer Divide Instructions */
1729 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1730 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1731 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1732         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1733 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1734         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
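/*
 * These expansions follow the RISC-V rules: division by zero returns all
 * ones (-1) for vdiv[u] and the dividend for vrem[u]; the signed overflow
 * case (most-negative value divided by -1) returns the dividend for vdiv
 * and 0 for vrem.  "N == -N" is true only for 0 and for the most negative
 * value of the type, which suffices here because 0 / -1 and 0 % -1 give
 * the same results either way, and it also avoids evaluating
 * INT_MIN / -1, which is undefined behaviour in C.
 *
 * Illustrative sketch (not part of the build; the example_* name is
 * hypothetical):
 */
#if 0
static int8_t example_vdiv_element(int8_t n, int8_t m)
{
    if (m == 0) {
        return -1;              /* divide by zero: all ones */
    }
    if (n == INT8_MIN && m == -1) {
        return n;               /* overflow: -128 / -1 stays -128 */
    }
    return n / m;
}
#endif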
1735 
1736 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1737 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1738 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1739 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1740 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1741 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1742 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1743 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1744 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1745 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1746 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1747 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1748 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1749 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1750 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1751 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1752 GEN_VEXT_VV(vdivu_vv_b, 1)
1753 GEN_VEXT_VV(vdivu_vv_h, 2)
1754 GEN_VEXT_VV(vdivu_vv_w, 4)
1755 GEN_VEXT_VV(vdivu_vv_d, 8)
1756 GEN_VEXT_VV(vdiv_vv_b, 1)
1757 GEN_VEXT_VV(vdiv_vv_h, 2)
1758 GEN_VEXT_VV(vdiv_vv_w, 4)
1759 GEN_VEXT_VV(vdiv_vv_d, 8)
1760 GEN_VEXT_VV(vremu_vv_b, 1)
1761 GEN_VEXT_VV(vremu_vv_h, 2)
1762 GEN_VEXT_VV(vremu_vv_w, 4)
1763 GEN_VEXT_VV(vremu_vv_d, 8)
1764 GEN_VEXT_VV(vrem_vv_b, 1)
1765 GEN_VEXT_VV(vrem_vv_h, 2)
1766 GEN_VEXT_VV(vrem_vv_w, 4)
1767 GEN_VEXT_VV(vrem_vv_d, 8)
1768 
1769 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1770 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1771 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1772 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1773 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1774 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1775 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1776 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1777 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1778 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1779 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1780 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1781 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1782 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1783 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1784 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1785 GEN_VEXT_VX(vdivu_vx_b, 1)
1786 GEN_VEXT_VX(vdivu_vx_h, 2)
1787 GEN_VEXT_VX(vdivu_vx_w, 4)
1788 GEN_VEXT_VX(vdivu_vx_d, 8)
1789 GEN_VEXT_VX(vdiv_vx_b, 1)
1790 GEN_VEXT_VX(vdiv_vx_h, 2)
1791 GEN_VEXT_VX(vdiv_vx_w, 4)
1792 GEN_VEXT_VX(vdiv_vx_d, 8)
1793 GEN_VEXT_VX(vremu_vx_b, 1)
1794 GEN_VEXT_VX(vremu_vx_h, 2)
1795 GEN_VEXT_VX(vremu_vx_w, 4)
1796 GEN_VEXT_VX(vremu_vx_d, 8)
1797 GEN_VEXT_VX(vrem_vx_b, 1)
1798 GEN_VEXT_VX(vrem_vx_h, 2)
1799 GEN_VEXT_VX(vrem_vx_w, 4)
1800 GEN_VEXT_VX(vrem_vx_d, 8)
1801 
1802 /* Vector Widening Integer Multiply Instructions */
1803 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1804 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1805 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1806 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1807 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1808 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1809 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1810 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1811 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1812 GEN_VEXT_VV(vwmul_vv_b, 2)
1813 GEN_VEXT_VV(vwmul_vv_h, 4)
1814 GEN_VEXT_VV(vwmul_vv_w, 8)
1815 GEN_VEXT_VV(vwmulu_vv_b, 2)
1816 GEN_VEXT_VV(vwmulu_vv_h, 4)
1817 GEN_VEXT_VV(vwmulu_vv_w, 8)
1818 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1819 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1820 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1821 
1822 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1823 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1824 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1825 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1826 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1827 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1828 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1829 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1830 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1831 GEN_VEXT_VX(vwmul_vx_b, 2)
1832 GEN_VEXT_VX(vwmul_vx_h, 4)
1833 GEN_VEXT_VX(vwmul_vx_w, 8)
1834 GEN_VEXT_VX(vwmulu_vx_b, 2)
1835 GEN_VEXT_VX(vwmulu_vx_h, 4)
1836 GEN_VEXT_VX(vwmulu_vx_w, 8)
1837 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1838 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1839 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1840 
1841 /* Vector Single-Width Integer Multiply-Add Instructions */
1842 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1843 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1844 {                                                                  \
1845     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1846     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1847     TD d = *((TD *)vd + HD(i));                                    \
1848     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1849 }
1850 
1851 #define DO_MACC(N, M, D) (M * N + D)
1852 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1853 #define DO_MADD(N, M, D) (M * D + N)
1854 #define DO_NMSUB(N, M, D) (-(M * D) + N)
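
/*
 * Operand ordering (illustrative note): OP is invoked as OP(s2, s1, d),
 * so DO_MACC/DO_NMSAC treat vd (D) as the accumulator,
 *     vmacc:  vd = +(vs1 * vs2) + vd
 *     vnmsac: vd = -(vs1 * vs2) + vd
 * while DO_MADD/DO_NMSUB treat vd as the multiplicand,
 *     vmadd:  vd = +(vs1 * vd) + vs2
 *     vnmsub: vd = -(vs1 * vd) + vs2
 */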
1855 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1856 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1857 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1858 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1859 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1860 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1861 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1862 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1863 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1864 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1865 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1866 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1867 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1868 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1869 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1870 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1871 GEN_VEXT_VV(vmacc_vv_b, 1)
1872 GEN_VEXT_VV(vmacc_vv_h, 2)
1873 GEN_VEXT_VV(vmacc_vv_w, 4)
1874 GEN_VEXT_VV(vmacc_vv_d, 8)
1875 GEN_VEXT_VV(vnmsac_vv_b, 1)
1876 GEN_VEXT_VV(vnmsac_vv_h, 2)
1877 GEN_VEXT_VV(vnmsac_vv_w, 4)
1878 GEN_VEXT_VV(vnmsac_vv_d, 8)
1879 GEN_VEXT_VV(vmadd_vv_b, 1)
1880 GEN_VEXT_VV(vmadd_vv_h, 2)
1881 GEN_VEXT_VV(vmadd_vv_w, 4)
1882 GEN_VEXT_VV(vmadd_vv_d, 8)
1883 GEN_VEXT_VV(vnmsub_vv_b, 1)
1884 GEN_VEXT_VV(vnmsub_vv_h, 2)
1885 GEN_VEXT_VV(vnmsub_vv_w, 4)
1886 GEN_VEXT_VV(vnmsub_vv_d, 8)
1887 
1888 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1889 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1890 {                                                                   \
1891     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1892     TD d = *((TD *)vd + HD(i));                                     \
1893     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1894 }
1895 
1896 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1897 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1898 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1899 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1900 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1901 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1902 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1903 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1904 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1905 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1906 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1907 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1908 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1909 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1910 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1911 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1912 GEN_VEXT_VX(vmacc_vx_b, 1)
1913 GEN_VEXT_VX(vmacc_vx_h, 2)
1914 GEN_VEXT_VX(vmacc_vx_w, 4)
1915 GEN_VEXT_VX(vmacc_vx_d, 8)
1916 GEN_VEXT_VX(vnmsac_vx_b, 1)
1917 GEN_VEXT_VX(vnmsac_vx_h, 2)
1918 GEN_VEXT_VX(vnmsac_vx_w, 4)
1919 GEN_VEXT_VX(vnmsac_vx_d, 8)
1920 GEN_VEXT_VX(vmadd_vx_b, 1)
1921 GEN_VEXT_VX(vmadd_vx_h, 2)
1922 GEN_VEXT_VX(vmadd_vx_w, 4)
1923 GEN_VEXT_VX(vmadd_vx_d, 8)
1924 GEN_VEXT_VX(vnmsub_vx_b, 1)
1925 GEN_VEXT_VX(vnmsub_vx_h, 2)
1926 GEN_VEXT_VX(vnmsub_vx_w, 4)
1927 GEN_VEXT_VX(vnmsub_vx_d, 8)
1928 
1929 /* Vector Widening Integer Multiply-Add Instructions */
1930 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1931 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1932 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1933 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1934 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1935 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1936 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1937 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1938 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1939 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1940 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1941 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1942 GEN_VEXT_VV(vwmacc_vv_b, 2)
1943 GEN_VEXT_VV(vwmacc_vv_h, 4)
1944 GEN_VEXT_VV(vwmacc_vv_w, 8)
1945 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1946 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1947 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1948 
1949 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1950 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1951 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1952 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1953 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1954 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1955 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1956 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1957 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1958 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1959 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1960 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1961 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1962 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1963 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1964 GEN_VEXT_VX(vwmacc_vx_b, 2)
1965 GEN_VEXT_VX(vwmacc_vx_h, 4)
1966 GEN_VEXT_VX(vwmacc_vx_w, 8)
1967 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1968 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1969 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1970 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1971 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1972 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1973 
1974 /* Vector Integer Merge and Move Instructions */
1975 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1976 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1977                   uint32_t desc)                                     \
1978 {                                                                    \
1979     uint32_t vl = env->vl;                                           \
1980     uint32_t esz = sizeof(ETYPE);                                    \
1981     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1982     uint32_t vta = vext_vta(desc);                                   \
1983     uint32_t i;                                                      \
1984                                                                      \
1985     for (i = env->vstart; i < vl; i++) {                             \
1986         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1987         *((ETYPE *)vd + H(i)) = s1;                                  \
1988     }                                                                \
1989     env->vstart = 0;                                                 \
1990     /* set tail elements to 1s */                                    \
1991     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1992 }
1993 
1994 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1995 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1996 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1997 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1998 
1999 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2000 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2001                   uint32_t desc)                                     \
2002 {                                                                    \
2003     uint32_t vl = env->vl;                                           \
2004     uint32_t esz = sizeof(ETYPE);                                    \
2005     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2006     uint32_t vta = vext_vta(desc);                                   \
2007     uint32_t i;                                                      \
2008                                                                      \
2009     for (i = env->vstart; i < vl; i++) {                             \
2010         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2011     }                                                                \
2012     env->vstart = 0;                                                 \
2013     /* set tail elements to 1s */                                    \
2014     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2015 }
2016 
2017 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2018 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2019 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2020 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2021 
2022 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2023 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2024                   CPURISCVState *env, uint32_t desc)                 \
2025 {                                                                    \
2026     uint32_t vl = env->vl;                                           \
2027     uint32_t esz = sizeof(ETYPE);                                    \
2028     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2029     uint32_t vta = vext_vta(desc);                                   \
2030     uint32_t i;                                                      \
2031                                                                      \
2032     for (i = env->vstart; i < vl; i++) {                             \
2033         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2034         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2035     }                                                                \
2036     env->vstart = 0;                                                 \
2037     /* set tail elements to 1s */                                    \
2038     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2039 }
2040 
2041 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2042 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2043 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2044 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2045 
2046 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2047 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2048                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2049 {                                                                    \
2050     uint32_t vl = env->vl;                                           \
2051     uint32_t esz = sizeof(ETYPE);                                    \
2052     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2053     uint32_t vta = vext_vta(desc);                                   \
2054     uint32_t i;                                                      \
2055                                                                      \
2056     for (i = env->vstart; i < vl; i++) {                             \
2057         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2058         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2059                    (ETYPE)(target_long)s1);                          \
2060         *((ETYPE *)vd + H(i)) = d;                                   \
2061     }                                                                \
2062     env->vstart = 0;                                                 \
2063     /* set tail elements to 1s */                                    \
2064     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2065 }
2066 
2067 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2068 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2069 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2070 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2071 
2072 /*
2073  *** Vector Fixed-Point Arithmetic Instructions
2074  */
2075 
2076 /* Vector Single-Width Saturating Add and Subtract */
2077 
2078 /*
2079  * Fixed-point instructions generally need a rounding mode and saturation,
2080  * so define the common fixed-point macros here.
2081  */
2082 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2083                           CPURISCVState *env, int vxrm);
2084 
2085 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2086 static inline void                                                  \
2087 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2088           CPURISCVState *env, int vxrm)                             \
2089 {                                                                   \
2090     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2091     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2092     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2093 }
2094 
2095 static inline void
2096 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2097              CPURISCVState *env,
2098              uint32_t vl, uint32_t vm, int vxrm,
2099              opivv2_rm_fn *fn)
2100 {
2101     for (uint32_t i = env->vstart; i < vl; i++) {
2102         if (!vm && !vext_elem_mask(v0, i)) {
2103             continue;
2104         }
2105         fn(vd, vs1, vs2, i, env, vxrm);
2106     }
2107     env->vstart = 0;
2108 }
2109 
2110 static inline void
2111 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2112              CPURISCVState *env,
2113              uint32_t desc,
2114              opivv2_rm_fn *fn, uint32_t esz)
2115 {
2116     uint32_t vm = vext_vm(desc);
2117     uint32_t vl = env->vl;
2118     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2119     uint32_t vta = vext_vta(desc);
2120 
2121     switch (env->vxrm) {
2122     case 0: /* rnu */
2123         vext_vv_rm_1(vd, v0, vs1, vs2,
2124                      env, vl, vm, 0, fn);
2125         break;
2126     case 1: /* rne */
2127         vext_vv_rm_1(vd, v0, vs1, vs2,
2128                      env, vl, vm, 1, fn);
2129         break;
2130     case 2: /* rdn */
2131         vext_vv_rm_1(vd, v0, vs1, vs2,
2132                      env, vl, vm, 2, fn);
2133         break;
2134     default: /* rod */
2135         vext_vv_rm_1(vd, v0, vs1, vs2,
2136                      env, vl, vm, 3, fn);
2137         break;
2138     }
2139     /* set tail elements to 1s */
2140     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2141 }
2142 
2143 /* generate helpers for fixed-point instructions with OPIVV format */
2144 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2145 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2146                   CPURISCVState *env, uint32_t desc)            \
2147 {                                                               \
2148     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2149                  do_##NAME, ESZ);                               \
2150 }
2151 
2152 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2153 {
2154     uint8_t res = a + b;
2155     if (res < a) {
2156         res = UINT8_MAX;
2157         env->vxsat = 0x1;
2158     }
2159     return res;
2160 }
2161 
2162 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2163                                uint16_t b)
2164 {
2165     uint16_t res = a + b;
2166     if (res < a) {
2167         res = UINT16_MAX;
2168         env->vxsat = 0x1;
2169     }
2170     return res;
2171 }
2172 
2173 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2174                                uint32_t b)
2175 {
2176     uint32_t res = a + b;
2177     if (res < a) {
2178         res = UINT32_MAX;
2179         env->vxsat = 0x1;
2180     }
2181     return res;
2182 }
2183 
2184 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2185                                uint64_t b)
2186 {
2187     uint64_t res = a + b;
2188     if (res < a) {
2189         res = UINT64_MAX;
2190         env->vxsat = 0x1;
2191     }
2192     return res;
2193 }
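
/*
 * Unsigned saturation is detected purely from wraparound: the sum of two
 * unsigned values can only be smaller than an operand if it wrapped.
 * Worked example (illustrative, uint8_t): a = 250, b = 10 -> res = 4,
 * which is < 250, so the result saturates to UINT8_MAX and vxsat is set.
 */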
2194 
2195 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2196 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2197 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2198 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2199 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2200 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2201 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2202 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2203 
2204 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2205                           CPURISCVState *env, int vxrm);
2206 
2207 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2208 static inline void                                                  \
2209 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2210           CPURISCVState *env, int vxrm)                             \
2211 {                                                                   \
2212     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2213     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2214 }
2215 
2216 static inline void
2217 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2218              CPURISCVState *env,
2219              uint32_t vl, uint32_t vm, int vxrm,
2220              opivx2_rm_fn *fn)
2221 {
2222     for (uint32_t i = env->vstart; i < vl; i++) {
2223         if (!vm && !vext_elem_mask(v0, i)) {
2224             continue;
2225         }
2226         fn(vd, s1, vs2, i, env, vxrm);
2227     }
2228     env->vstart = 0;
2229 }
2230 
2231 static inline void
2232 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2233              CPURISCVState *env,
2234              uint32_t desc,
2235              opivx2_rm_fn *fn, uint32_t esz)
2236 {
2237     uint32_t vm = vext_vm(desc);
2238     uint32_t vl = env->vl;
2239     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2240     uint32_t vta = vext_vta(desc);
2241 
2242     switch (env->vxrm) {
2243     case 0: /* rnu */
2244         vext_vx_rm_1(vd, v0, s1, vs2,
2245                      env, vl, vm, 0, fn);
2246         break;
2247     case 1: /* rne */
2248         vext_vx_rm_1(vd, v0, s1, vs2,
2249                      env, vl, vm, 1, fn);
2250         break;
2251     case 2: /* rdn */
2252         vext_vx_rm_1(vd, v0, s1, vs2,
2253                      env, vl, vm, 2, fn);
2254         break;
2255     default: /* rod */
2256         vext_vx_rm_1(vd, v0, s1, vs2,
2257                      env, vl, vm, 3, fn);
2258         break;
2259     }
2260     /* set tail elements to 1s */
2261     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2262 }
2263 
2264 /* generate helpers for fixed-point instructions with OPIVX format */
2265 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2266 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2267         void *vs2, CPURISCVState *env, uint32_t desc)     \
2268 {                                                         \
2269     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2270                  do_##NAME, ESZ);                         \
2271 }
2272 
2273 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2274 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2275 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2276 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2277 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2278 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2279 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2280 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2281 
2282 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2283 {
2284     int8_t res = a + b;
2285     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2286         res = a > 0 ? INT8_MAX : INT8_MIN;
2287         env->vxsat = 0x1;
2288     }
2289     return res;
2290 }
2291 
2292 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2293 {
2294     int16_t res = a + b;
2295     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2296         res = a > 0 ? INT16_MAX : INT16_MIN;
2297         env->vxsat = 0x1;
2298     }
2299     return res;
2300 }
2301 
2302 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2303 {
2304     int32_t res = a + b;
2305     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2306         res = a > 0 ? INT32_MAX : INT32_MIN;
2307         env->vxsat = 0x1;
2308     }
2309     return res;
2310 }
2311 
2312 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2313 {
2314     int64_t res = a + b;
2315     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2316         res = a > 0 ? INT64_MAX : INT64_MIN;
2317         env->vxsat = 0x1;
2318     }
2319     return res;
2320 }
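
/*
 * The test (res ^ a) & (res ^ b) & INT*_MIN is the classic sign-bit
 * check: signed addition can only overflow when a and b have the same
 * sign and res has the opposite sign.  Worked example (illustrative,
 * int8_t): a = 100, b = 50 -> res wraps to -106, the test fires, and the
 * result saturates to INT8_MAX.  A minimal sketch of the same check (the
 * example_* name is hypothetical):
 */
#if 0
static bool example_sadd8_overflows(int8_t a, int8_t b)
{
    int8_t res = a + b;                 /* wraps on overflow */
    return ((res ^ a) & (res ^ b)) < 0; /* sign bit set iff overflow */
}
#endif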
2321 
2322 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2323 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2324 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2325 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2326 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2327 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2328 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2329 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2330 
2331 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2332 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2333 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2334 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2335 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2336 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2337 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2338 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2339 
2340 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2341 {
2342     uint8_t res = a - b;
2343     if (res > a) {
2344         res = 0;
2345         env->vxsat = 0x1;
2346     }
2347     return res;
2348 }
2349 
2350 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2351                                uint16_t b)
2352 {
2353     uint16_t res = a - b;
2354     if (res > a) {
2355         res = 0;
2356         env->vxsat = 0x1;
2357     }
2358     return res;
2359 }
2360 
2361 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2362                                uint32_t b)
2363 {
2364     uint32_t res = a - b;
2365     if (res > a) {
2366         res = 0;
2367         env->vxsat = 0x1;
2368     }
2369     return res;
2370 }
2371 
2372 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2373                                uint64_t b)
2374 {
2375     uint64_t res = a - b;
2376     if (res > a) {
2377         res = 0;
2378         env->vxsat = 0x1;
2379     }
2380     return res;
2381 }
2382 
2383 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2384 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2385 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2386 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2387 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2388 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2389 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2390 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2391 
2392 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2393 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2394 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2395 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2396 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2397 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2398 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2399 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2400 
2401 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2402 {
2403     int8_t res = a - b;
2404     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2405         res = a >= 0 ? INT8_MAX : INT8_MIN;
2406         env->vxsat = 0x1;
2407     }
2408     return res;
2409 }
2410 
2411 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2412 {
2413     int16_t res = a - b;
2414     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2415         res = a >= 0 ? INT16_MAX : INT16_MIN;
2416         env->vxsat = 0x1;
2417     }
2418     return res;
2419 }
2420 
2421 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2422 {
2423     int32_t res = a - b;
2424     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2425         res = a >= 0 ? INT32_MAX : INT32_MIN;
2426         env->vxsat = 0x1;
2427     }
2428     return res;
2429 }
2430 
2431 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2432 {
2433     int64_t res = a - b;
2434     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2435         res = a >= 0 ? INT64_MAX : INT64_MIN;
2436         env->vxsat = 0x1;
2437     }
2438     return res;
2439 }
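
/*
 * For subtraction the test uses (a ^ b) instead of (res ^ b): overflow
 * is only possible when a and b have opposite signs and the result's
 * sign differs from a's.  Worked example (illustrative, int8_t):
 * a = -100, b = 100 -> res wraps to 56; both (res ^ a) and (a ^ b) have
 * the sign bit set, so the result saturates to INT8_MIN.
 */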
2440 
2441 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2442 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2443 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2444 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2445 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2446 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2447 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2448 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2449 
2450 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2451 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2452 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2453 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2454 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2455 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2456 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2457 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2458 
2459 /* Vector Single-Width Averaging Add and Subtract */
2460 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2461 {
2462     uint8_t d = extract64(v, shift, 1);
2463     uint8_t d1;
2464     uint64_t D1, D2;
2465 
2466     if (shift == 0 || shift > 64) {
2467         return 0;
2468     }
2469 
2470     d1 = extract64(v, shift - 1, 1);
2471     D1 = extract64(v, 0, shift);
2472     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2473         return d1;
2474     } else if (vxrm == 1) { /* round-to-nearest-even */
2475         if (shift > 1) {
2476             D2 = extract64(v, 0, shift - 1);
2477             return d1 & ((D2 != 0) | d);
2478         } else {
2479             return d1 & d;
2480         }
2481     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2482         return !d & (D1 != 0);
2483     }
2484     return 0; /* round-down (truncate) */
2485 }
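
/*
 * Worked example (illustrative): v = 0b1011, shift = 2.  The kept value
 * is v >> 2 = 2 and the discarded bits are 0b11, i.e. the exact value is
 * 2.75.  get_round() returns the increment to add to the shifted value:
 *   rnu (0): d1 = 1                 -> 2 + 1 = 3
 *   rne (1): d1 & ((D2 != 0) | d)   -> 2 + 1 = 3
 *   rdn (2): 0                      -> 2      (truncate)
 *   rod (3): !d & (D1 != 0) = 1     -> 2 + 1 = 3  (force the LSB odd)
 */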
2486 
2487 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2488 {
2489     int64_t res = (int64_t)a + b;
2490     uint8_t round = get_round(vxrm, res, 1);
2491 
2492     return (res >> 1) + round;
2493 }
2494 
2495 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2496 {
2497     int64_t res = a + b;
2498     uint8_t round = get_round(vxrm, res, 1);
2499     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2500 
2501     /* With signed overflow, bit 64 is inverse of bit 63. */
2502     return ((res >> 1) ^ over) + round;
2503 }
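
/*
 * Worked example (illustrative): a = b = INT64_MAX.  res wraps to -2 and
 * over = INT64_MIN, so (res >> 1) ^ over turns -1 back into INT64_MAX,
 * the exact average; the rounding increment is 0 in every mode because
 * bit 0 of res (the only discarded bit) is clear.
 */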
2504 
2505 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2506 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2507 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2508 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2509 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2510 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2511 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2512 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2513 
2514 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2515 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2516 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2517 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2518 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2519 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2520 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2521 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2522 
2523 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2524                                uint32_t a, uint32_t b)
2525 {
2526     uint64_t res = (uint64_t)a + b;
2527     uint8_t round = get_round(vxrm, res, 1);
2528 
2529     return (res >> 1) + round;
2530 }
2531 
2532 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2533                                uint64_t a, uint64_t b)
2534 {
2535     uint64_t res = a + b;
2536     uint8_t round = get_round(vxrm, res, 1);
2537     uint64_t over = (uint64_t)(res < a) << 63;
2538 
2539     return ((res >> 1) | over) + round;
2540 }
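
/*
 * Worked example (illustrative): a = b = UINT64_MAX.  res wraps to
 * UINT64_MAX - 1 and the carry-out is recorded as over = 1 << 63, so
 * (res >> 1) | over recreates the exact average UINT64_MAX (again with
 * no rounding increment, since the discarded bit is 0).
 */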
2541 
2542 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2543 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2544 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2545 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2546 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2547 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2548 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2549 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2550 
2551 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2552 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2553 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2554 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2555 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2556 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2557 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2558 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2559 
2560 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2561 {
2562     int64_t res = (int64_t)a - b;
2563     uint8_t round = get_round(vxrm, res, 1);
2564 
2565     return (res >> 1) + round;
2566 }
2567 
2568 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2569 {
2570     int64_t res = (int64_t)a - b;
2571     uint8_t round = get_round(vxrm, res, 1);
2572     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2573 
2574     /* With signed overflow, bit 64 is inverse of bit 63. */
2575     return ((res >> 1) ^ over) + round;
2576 }
2577 
2578 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2579 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2580 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2581 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2582 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2583 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2584 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2585 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2586 
2587 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2588 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2589 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2590 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2591 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2592 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2593 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2594 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2595 
2596 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2597                                uint32_t a, uint32_t b)
2598 {
2599     int64_t res = (int64_t)a - b;
2600     uint8_t round = get_round(vxrm, res, 1);
2601 
2602     return (res >> 1) + round;
2603 }
2604 
2605 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2606                                uint64_t a, uint64_t b)
2607 {
2608     uint64_t res = (uint64_t)a - b;
2609     uint8_t round = get_round(vxrm, res, 1);
2610     uint64_t over = (uint64_t)(res > a) << 63;
2611 
2612     return ((res >> 1) | over) + round;
2613 }
2614 
2615 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2616 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2617 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2618 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2619 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2620 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2621 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2622 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2623 
2624 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2625 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2626 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2627 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2628 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2629 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2630 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2631 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2632 
2633 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2634 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2635 {
2636     uint8_t round;
2637     int16_t res;
2638 
2639     res = (int16_t)a * (int16_t)b;
2640     round = get_round(vxrm, res, 7);
2641     res   = (res >> 7) + round;
2642 
2643     if (res > INT8_MAX) {
2644         env->vxsat = 0x1;
2645         return INT8_MAX;
2646     } else if (res < INT8_MIN) {
2647         env->vxsat = 0x1;
2648         return INT8_MIN;
2649     } else {
2650         return res;
2651     }
2652 }
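     /*
      * Worked example, reading the operands as Q7 fixed point (which is
      * how the spec defines vsmul): 0x40 * 0x40 is 0.5 * 0.5; the raw
      * product is 4096 and (4096 >> 7) + round = 0x20, i.e. 0.25.
      * 0x80 * 0x80 (-1.0 * -1.0) would need +1.0, which is not
      * representable in Q7, so the helper saturates to INT8_MAX and sets
      * vxsat.
      */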
2653 
2654 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2655 {
2656     uint8_t round;
2657     int32_t res;
2658 
2659     res = (int32_t)a * (int32_t)b;
2660     round = get_round(vxrm, res, 15);
2661     res   = (res >> 15) + round;
2662 
2663     if (res > INT16_MAX) {
2664         env->vxsat = 0x1;
2665         return INT16_MAX;
2666     } else if (res < INT16_MIN) {
2667         env->vxsat = 0x1;
2668         return INT16_MIN;
2669     } else {
2670         return res;
2671     }
2672 }
2673 
2674 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2675 {
2676     uint8_t round;
2677     int64_t res;
2678 
2679     res = (int64_t)a * (int64_t)b;
2680     round = get_round(vxrm, res, 31);
2681     res   = (res >> 31) + round;
2682 
2683     if (res > INT32_MAX) {
2684         env->vxsat = 0x1;
2685         return INT32_MAX;
2686     } else if (res < INT32_MIN) {
2687         env->vxsat = 0x1;
2688         return INT32_MIN;
2689     } else {
2690         return res;
2691     }
2692 }
2693 
2694 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2695 {
2696     uint8_t round;
2697     uint64_t hi_64, lo_64;
2698     int64_t res;
2699 
2700     if (a == INT64_MIN && b == INT64_MIN) {
2701         env->vxsat = 1;
2702         return INT64_MAX;
2703     }
2704 
2705     muls64(&lo_64, &hi_64, a, b);
2706     round = get_round(vxrm, lo_64, 63);
2707     /*
2708      * Cannot overflow, as there are always
2709      * 2 sign bits after multiply.
2710      */
2711     res = (hi_64 << 1) | (lo_64 >> 63);
2712     if (round) {
2713         if (res == INT64_MAX) {
2714             env->vxsat = 1;
2715         } else {
2716             res += 1;
2717         }
2718     }
2719     return res;
2720 }
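     /*
      * For SEW = 64 the product does not fit in a scalar, so muls64()
      * forms the 128-bit result and (hi_64 << 1) | (lo_64 >> 63) extracts
      * bits 126..63, i.e. the product shifted right by 63; the two sign
      * bits of a signed 64x64 product make this safe except for
      * INT64_MIN * INT64_MIN, which is handled separately above.
      */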
2721 
2722 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2723 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2724 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2725 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2726 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2727 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2728 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2729 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2730 
2731 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2732 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2733 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2734 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2735 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2736 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2737 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2738 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2739 
2740 /* Vector Single-Width Scaling Shift Instructions */
2741 static inline uint8_t
2742 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2743 {
2744     uint8_t round, shift = b & 0x7;
2745     uint8_t res;
2746 
2747     round = get_round(vxrm, a, shift);
2748     res   = (a >> shift)  + round;
2749     return res;
2750 }
2751 static inline uint16_t
2752 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2753 {
2754     uint8_t round, shift = b & 0xf;
2755     uint16_t res;
2756 
2757     round = get_round(vxrm, a, shift);
2758     res   = (a >> shift)  + round;
2759     return res;
2760 }
2761 static inline uint32_t
2762 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2763 {
2764     uint8_t round, shift = b & 0x1f;
2765     uint32_t res;
2766 
2767     round = get_round(vxrm, a, shift);
2768     res   = (a >> shift)  + round;
2769     return res;
2770 }
2771 static inline uint64_t
2772 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2773 {
2774     uint8_t round, shift = b & 0x3f;
2775     uint64_t res;
2776 
2777     round = get_round(vxrm, a, shift);
2778     res   = (a >> shift)  + round;
2779     return res;
2780 }
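     /*
      * Example for the scaling shifts: vssrl8 with a = 31, b = 3 shifts by
      * three and rounds 31 / 8 = 3.875 according to vxrm, giving 4 under
      * round-to-nearest-up (the round bit is bit 2 of a) and 3 under
      * round-down.  Only the low log2(SEW) bits of b are used as the
      * shift amount.
      */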
2781 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2782 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2783 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2784 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2785 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2786 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2787 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2788 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2789 
2790 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2791 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2792 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2793 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2794 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2795 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2796 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2797 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2798 
2799 static inline int8_t
2800 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2801 {
2802     uint8_t round, shift = b & 0x7;
2803     int8_t res;
2804 
2805     round = get_round(vxrm, a, shift);
2806     res   = (a >> shift)  + round;
2807     return res;
2808 }
2809 static inline int16_t
2810 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2811 {
2812     uint8_t round, shift = b & 0xf;
2813     int16_t res;
2814 
2815     round = get_round(vxrm, a, shift);
2816     res   = (a >> shift)  + round;
2817     return res;
2818 }
2819 static inline int32_t
2820 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2821 {
2822     uint8_t round, shift = b & 0x1f;
2823     int32_t res;
2824 
2825     round = get_round(vxrm, a, shift);
2826     res   = (a >> shift)  + round;
2827     return res;
2828 }
2829 static inline int64_t
2830 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2831 {
2832     uint8_t round, shift = b & 0x3f;
2833     int64_t res;
2834 
2835     round = get_round(vxrm, a, shift);
2836     res   = (a >> shift)  + round;
2837     return res;
2838 }
2839 
2840 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2841 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2842 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2843 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2844 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2845 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2846 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2847 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2848 
2849 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2850 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2851 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2852 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2853 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2854 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2855 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2856 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2857 
2858 /* Vector Narrowing Fixed-Point Clip Instructions */
2859 static inline int8_t
2860 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2861 {
2862     uint8_t round, shift = b & 0xf;
2863     int16_t res;
2864 
2865     round = get_round(vxrm, a, shift);
2866     res   = (a >> shift)  + round;
2867     if (res > INT8_MAX) {
2868         env->vxsat = 0x1;
2869         return INT8_MAX;
2870     } else if (res < INT8_MIN) {
2871         env->vxsat = 0x1;
2872         return INT8_MIN;
2873     } else {
2874         return res;
2875     }
2876 }
2877 
2878 static inline int16_t
2879 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2880 {
2881     uint8_t round, shift = b & 0x1f;
2882     int32_t res;
2883 
2884     round = get_round(vxrm, a, shift);
2885     res   = (a >> shift)  + round;
2886     if (res > INT16_MAX) {
2887         env->vxsat = 0x1;
2888         return INT16_MAX;
2889     } else if (res < INT16_MIN) {
2890         env->vxsat = 0x1;
2891         return INT16_MIN;
2892     } else {
2893         return res;
2894     }
2895 }
2896 
2897 static inline int32_t
2898 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2899 {
2900     uint8_t round, shift = b & 0x3f;
2901     int64_t res;
2902 
2903     round = get_round(vxrm, a, shift);
2904     res   = (a >> shift)  + round;
2905     if (res > INT32_MAX) {
2906         env->vxsat = 0x1;
2907         return INT32_MAX;
2908     } else if (res < INT32_MIN) {
2909         env->vxsat = 0x1;
2910         return INT32_MIN;
2911     } else {
2912         return res;
2913     }
2914 }
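     /*
      * Example for the signed narrowing clips: vnclip8 with a 2*SEW source
      * a = 256 and shift b = 1 gives 128 after the rounding shift, which
      * exceeds INT8_MAX, so the result saturates to 127 and vxsat is set;
      * a = 200 with b = 1 narrows cleanly to 100.
      */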
2915 
2916 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2917 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2918 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2919 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2920 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2921 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2922 
2923 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2924 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2925 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2926 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2927 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2928 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2929 
2930 static inline uint8_t
2931 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2932 {
2933     uint8_t round, shift = b & 0xf;
2934     uint16_t res;
2935 
2936     round = get_round(vxrm, a, shift);
2937     res   = (a >> shift)  + round;
2938     if (res > UINT8_MAX) {
2939         env->vxsat = 0x1;
2940         return UINT8_MAX;
2941     } else {
2942         return res;
2943     }
2944 }
2945 
2946 static inline uint16_t
2947 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2948 {
2949     uint8_t round, shift = b & 0x1f;
2950     uint32_t res;
2951 
2952     round = get_round(vxrm, a, shift);
2953     res   = (a >> shift)  + round;
2954     if (res > UINT16_MAX) {
2955         env->vxsat = 0x1;
2956         return UINT16_MAX;
2957     } else {
2958         return res;
2959     }
2960 }
2961 
2962 static inline uint32_t
2963 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2964 {
2965     uint8_t round, shift = b & 0x3f;
2966     uint64_t res;
2967 
2968     round = get_round(vxrm, a, shift);
2969     res   = (a >> shift)  + round;
2970     if (res > UINT32_MAX) {
2971         env->vxsat = 0x1;
2972         return UINT32_MAX;
2973     } else {
2974         return res;
2975     }
2976 }
2977 
2978 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2979 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2980 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2981 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2982 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2983 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2984 
2985 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2986 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2987 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2988 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2989 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2990 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2991 
2992 /*
2993  *** Vector Floating-Point Arithmetic Instructions
2994  */
2995 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2996 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2997 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2998                       CPURISCVState *env)                      \
2999 {                                                              \
3000     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3001     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3002     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3003 }
3004 
3005 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3006 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3007                   void *vs2, CPURISCVState *env,          \
3008                   uint32_t desc)                          \
3009 {                                                         \
3010     uint32_t vm = vext_vm(desc);                          \
3011     uint32_t vl = env->vl;                                \
3012     uint32_t total_elems =                                \
3013         vext_get_total_elems(env, desc, ESZ);             \
3014     uint32_t vta = vext_vta(desc);                        \
3015     uint32_t i;                                           \
3016                                                           \
3017     for (i = env->vstart; i < vl; i++) {                  \
3018         if (!vm && !vext_elem_mask(v0, i)) {              \
3019             continue;                                     \
3020         }                                                 \
3021         do_##NAME(vd, vs1, vs2, i, env);                  \
3022     }                                                     \
3023     env->vstart = 0;                                      \
3024     /* set tail elements to 1s */                         \
3025     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3026                       total_elems * ESZ);                 \
3027 }
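     /*
      * For example, GEN_VEXT_VV_ENV(vfadd_vv_h, 2) expands to
      * helper_vfadd_vv_h(): it walks elements env->vstart .. env->vl - 1,
      * skips elements whose bit in the v0 mask is clear when vm == 0,
      * applies do_vfadd_vv_h() to the active ones, clears vstart, and
      * finally fills the tail elements with all-ones when the
      * tail-agnostic setting (vta) asks for it.
      */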
3028 
3029 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3030 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3031 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3032 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3033 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3034 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3035 
3036 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3037 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3038                       CPURISCVState *env)                      \
3039 {                                                              \
3040     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3041     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3042 }
3043 
3044 #define GEN_VEXT_VF(NAME, ESZ)                            \
3045 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3046                   void *vs2, CPURISCVState *env,          \
3047                   uint32_t desc)                          \
3048 {                                                         \
3049     uint32_t vm = vext_vm(desc);                          \
3050     uint32_t vl = env->vl;                                \
3051     uint32_t total_elems =                                \
3052         vext_get_total_elems(env, desc, ESZ);              \
3053     uint32_t vta = vext_vta(desc);                        \
3054     uint32_t i;                                           \
3055                                                           \
3056     for (i = env->vstart; i < vl; i++) {                  \
3057         if (!vm && !vext_elem_mask(v0, i)) {              \
3058             continue;                                     \
3059         }                                                 \
3060         do_##NAME(vd, s1, vs2, i, env);                   \
3061     }                                                     \
3062     env->vstart = 0;                                      \
3063     /* set tail elements to 1s */                         \
3064     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3065                       total_elems * ESZ);                 \
3066 }
3067 
3068 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3069 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3070 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3071 GEN_VEXT_VF(vfadd_vf_h, 2)
3072 GEN_VEXT_VF(vfadd_vf_w, 4)
3073 GEN_VEXT_VF(vfadd_vf_d, 8)
3074 
3075 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3076 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3077 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3078 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3079 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3080 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3081 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3082 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3083 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3084 GEN_VEXT_VF(vfsub_vf_h, 2)
3085 GEN_VEXT_VF(vfsub_vf_w, 4)
3086 GEN_VEXT_VF(vfsub_vf_d, 8)
3087 
3088 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3089 {
3090     return float16_sub(b, a, s);
3091 }
3092 
3093 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3094 {
3095     return float32_sub(b, a, s);
3096 }
3097 
3098 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3099 {
3100     return float64_sub(b, a, s);
3101 }
3102 
3103 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3104 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3105 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3106 GEN_VEXT_VF(vfrsub_vf_h, 2)
3107 GEN_VEXT_VF(vfrsub_vf_w, 4)
3108 GEN_VEXT_VF(vfrsub_vf_d, 8)
3109 
3110 /* Vector Widening Floating-Point Add/Subtract Instructions */
3111 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3112 {
3113     return float32_add(float16_to_float32(a, true, s),
3114             float16_to_float32(b, true, s), s);
3115 }
3116 
3117 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3118 {
3119     return float64_add(float32_to_float64(a, s),
3120             float32_to_float64(b, s), s);
3121 
3122 }
3123 
3124 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3125 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3126 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3127 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3128 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3129 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3130 GEN_VEXT_VF(vfwadd_vf_h, 4)
3131 GEN_VEXT_VF(vfwadd_vf_w, 8)
3132 
3133 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3134 {
3135     return float32_sub(float16_to_float32(a, true, s),
3136             float16_to_float32(b, true, s), s);
3137 }
3138 
3139 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3140 {
3141     return float64_sub(float32_to_float64(a, s),
3142             float32_to_float64(b, s), s);
3143 
3144 }
3145 
3146 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3147 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3148 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3149 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3150 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3151 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3152 GEN_VEXT_VF(vfwsub_vf_h, 4)
3153 GEN_VEXT_VF(vfwsub_vf_w, 8)
3154 
3155 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3156 {
3157     return float32_add(a, float16_to_float32(b, true, s), s);
3158 }
3159 
3160 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3161 {
3162     return float64_add(a, float32_to_float64(b, s), s);
3163 }
3164 
3165 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3166 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3167 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3168 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3169 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3170 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3171 GEN_VEXT_VF(vfwadd_wf_h, 4)
3172 GEN_VEXT_VF(vfwadd_wf_w, 8)
3173 
3174 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3175 {
3176     return float32_sub(a, float16_to_float32(b, true, s), s);
3177 }
3178 
3179 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3180 {
3181     return float64_sub(a, float32_to_float64(b, s), s);
3182 }
3183 
3184 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3185 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3186 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3187 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3188 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3189 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3190 GEN_VEXT_VF(vfwsub_wf_h, 4)
3191 GEN_VEXT_VF(vfwsub_wf_w, 8)
3192 
3193 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3194 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3195 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3196 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3197 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3198 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3199 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3200 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3201 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3202 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3203 GEN_VEXT_VF(vfmul_vf_h, 2)
3204 GEN_VEXT_VF(vfmul_vf_w, 4)
3205 GEN_VEXT_VF(vfmul_vf_d, 8)
3206 
3207 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3208 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3209 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3210 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3211 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3212 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3213 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3214 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3215 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3216 GEN_VEXT_VF(vfdiv_vf_h, 2)
3217 GEN_VEXT_VF(vfdiv_vf_w, 4)
3218 GEN_VEXT_VF(vfdiv_vf_d, 8)
3219 
3220 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3221 {
3222     return float16_div(b, a, s);
3223 }
3224 
3225 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3226 {
3227     return float32_div(b, a, s);
3228 }
3229 
3230 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3231 {
3232     return float64_div(b, a, s);
3233 }
3234 
3235 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3236 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3237 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3238 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3239 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3240 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3241 
3242 /* Vector Widening Floating-Point Multiply */
3243 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3244 {
3245     return float32_mul(float16_to_float32(a, true, s),
3246             float16_to_float32(b, true, s), s);
3247 }
3248 
3249 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3250 {
3251     return float64_mul(float32_to_float64(a, s),
3252             float32_to_float64(b, s), s);
3253 
3254 }
3255 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3256 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3257 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3258 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3259 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3260 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3261 GEN_VEXT_VF(vfwmul_vf_h, 4)
3262 GEN_VEXT_VF(vfwmul_vf_w, 8)
3263 
3264 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3265 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3266 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3267         CPURISCVState *env)                                        \
3268 {                                                                  \
3269     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3270     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3271     TD d = *((TD *)vd + HD(i));                                    \
3272     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3273 }
3274 
3275 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3276 {
3277     return float16_muladd(a, b, d, 0, s);
3278 }
3279 
3280 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3281 {
3282     return float32_muladd(a, b, d, 0, s);
3283 }
3284 
3285 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3286 {
3287     return float64_muladd(a, b, d, 0, s);
3288 }
3289 
3290 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3291 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3292 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3293 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3294 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3295 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3296 
3297 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3298 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3299         CPURISCVState *env)                                       \
3300 {                                                                 \
3301     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3302     TD d = *((TD *)vd + HD(i));                                   \
3303     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3304 }
3305 
3306 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3307 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3308 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3309 GEN_VEXT_VF(vfmacc_vf_h, 2)
3310 GEN_VEXT_VF(vfmacc_vf_w, 4)
3311 GEN_VEXT_VF(vfmacc_vf_d, 8)
3312 
3313 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3314 {
3315     return float16_muladd(a, b, d,
3316             float_muladd_negate_c | float_muladd_negate_product, s);
3317 }
3318 
3319 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3320 {
3321     return float32_muladd(a, b, d,
3322             float_muladd_negate_c | float_muladd_negate_product, s);
3323 }
3324 
3325 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3326 {
3327     return float64_muladd(a, b, d,
3328             float_muladd_negate_c | float_muladd_negate_product, s);
3329 }
3330 
3331 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3332 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3333 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3334 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3335 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3336 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3337 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3338 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3339 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3340 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3341 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3342 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3343 
3344 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3345 {
3346     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3347 }
3348 
3349 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3350 {
3351     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3352 }
3353 
3354 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3355 {
3356     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3357 }
3358 
3359 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3360 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3361 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3362 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3363 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3364 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3365 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3366 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3367 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3368 GEN_VEXT_VF(vfmsac_vf_h, 2)
3369 GEN_VEXT_VF(vfmsac_vf_w, 4)
3370 GEN_VEXT_VF(vfmsac_vf_d, 8)
3371 
3372 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3373 {
3374     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3375 }
3376 
3377 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3378 {
3379     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3380 }
3381 
3382 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3383 {
3384     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3385 }
3386 
3387 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3388 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3389 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3390 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3391 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3392 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3393 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3394 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3395 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3396 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3397 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3398 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3399 
3400 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3401 {
3402     return float16_muladd(d, b, a, 0, s);
3403 }
3404 
3405 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3406 {
3407     return float32_muladd(d, b, a, 0, s);
3408 }
3409 
3410 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3411 {
3412     return float64_muladd(d, b, a, 0, s);
3413 }
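     /*
      * Note the operand order: the *macc/*msac helpers multiply the two
      * sources (a = vs2, b = vs1) and accumulate the destination d, while
      * the *madd/*msub helpers multiply the destination d by vs1 and
      * accumulate vs2 (passed in as a).  This is the vfmacc vs. vfmadd
      * distinction in the spec.
      */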
3414 
3415 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3416 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3417 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3418 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3419 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3420 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3421 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3422 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3423 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3424 GEN_VEXT_VF(vfmadd_vf_h, 2)
3425 GEN_VEXT_VF(vfmadd_vf_w, 4)
3426 GEN_VEXT_VF(vfmadd_vf_d, 8)
3427 
3428 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3429 {
3430     return float16_muladd(d, b, a,
3431             float_muladd_negate_c | float_muladd_negate_product, s);
3432 }
3433 
3434 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3435 {
3436     return float32_muladd(d, b, a,
3437             float_muladd_negate_c | float_muladd_negate_product, s);
3438 }
3439 
3440 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3441 {
3442     return float64_muladd(d, b, a,
3443             float_muladd_negate_c | float_muladd_negate_product, s);
3444 }
3445 
3446 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3447 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3448 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3449 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3450 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3451 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3452 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3453 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3454 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3455 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3456 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3457 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3458 
3459 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3460 {
3461     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3462 }
3463 
3464 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3465 {
3466     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3467 }
3468 
3469 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3470 {
3471     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3472 }
3473 
3474 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3475 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3476 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3477 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3478 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3479 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3480 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3481 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3482 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3483 GEN_VEXT_VF(vfmsub_vf_h, 2)
3484 GEN_VEXT_VF(vfmsub_vf_w, 4)
3485 GEN_VEXT_VF(vfmsub_vf_d, 8)
3486 
3487 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3488 {
3489     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3490 }
3491 
3492 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3493 {
3494     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3495 }
3496 
3497 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3498 {
3499     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3500 }
3501 
3502 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3503 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3504 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3505 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3506 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3507 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3508 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3509 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3510 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3511 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3512 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3513 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3514 
3515 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3516 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3517 {
3518     return float32_muladd(float16_to_float32(a, true, s),
3519                         float16_to_float32(b, true, s), d, 0, s);
3520 }
3521 
3522 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3523 {
3524     return float64_muladd(float32_to_float64(a, s),
3525                         float32_to_float64(b, s), d, 0, s);
3526 }
3527 
3528 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3529 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3530 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3531 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3532 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3533 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3534 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3535 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3536 
3537 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3538 {
3539     return float32_muladd(float16_to_float32(a, true, s),
3540                         float16_to_float32(b, true, s), d,
3541                         float_muladd_negate_c | float_muladd_negate_product, s);
3542 }
3543 
3544 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3545 {
3546     return float64_muladd(float32_to_float64(a, s),
3547                         float32_to_float64(b, s), d,
3548                         float_muladd_negate_c | float_muladd_negate_product, s);
3549 }
3550 
3551 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3552 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3553 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3554 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3555 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3556 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3557 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3558 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3559 
3560 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3561 {
3562     return float32_muladd(float16_to_float32(a, true, s),
3563                         float16_to_float32(b, true, s), d,
3564                         float_muladd_negate_c, s);
3565 }
3566 
3567 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3568 {
3569     return float64_muladd(float32_to_float64(a, s),
3570                         float32_to_float64(b, s), d,
3571                         float_muladd_negate_c, s);
3572 }
3573 
3574 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3575 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3576 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3577 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3578 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3579 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3580 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3581 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3582 
3583 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3584 {
3585     return float32_muladd(float16_to_float32(a, true, s),
3586                         float16_to_float32(b, true, s), d,
3587                         float_muladd_negate_product, s);
3588 }
3589 
3590 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3591 {
3592     return float64_muladd(float32_to_float64(a, s),
3593                         float32_to_float64(b, s), d,
3594                         float_muladd_negate_product, s);
3595 }
3596 
3597 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3598 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3599 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3600 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3601 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3602 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3603 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3604 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3605 
3606 /* Vector Floating-Point Square-Root Instruction */
3607 /* (TD, T2, TX2) */
3608 #define OP_UU_H uint16_t, uint16_t, uint16_t
3609 #define OP_UU_W uint32_t, uint32_t, uint32_t
3610 #define OP_UU_D uint64_t, uint64_t, uint64_t
3611 
3612 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3613 static void do_##NAME(void *vd, void *vs2, int i,      \
3614         CPURISCVState *env)                            \
3615 {                                                      \
3616     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3617     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3618 }
3619 
3620 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3621 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3622         CPURISCVState *env, uint32_t desc)             \
3623 {                                                      \
3624     uint32_t vm = vext_vm(desc);                       \
3625     uint32_t vl = env->vl;                             \
3626     uint32_t total_elems =                             \
3627         vext_get_total_elems(env, desc, ESZ);          \
3628     uint32_t vta = vext_vta(desc);                     \
3629     uint32_t i;                                        \
3630                                                        \
3631     if (vl == 0) {                                     \
3632         return;                                        \
3633     }                                                  \
3634     for (i = env->vstart; i < vl; i++) {               \
3635         if (!vm && !vext_elem_mask(v0, i)) {           \
3636             continue;                                  \
3637         }                                              \
3638         do_##NAME(vd, vs2, i, env);                    \
3639     }                                                  \
3640     env->vstart = 0;                                   \
3641     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3642                       total_elems * ESZ);              \
3643 }
3644 
3645 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3646 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3647 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3648 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3649 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3650 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3651 
3652 /*
3653  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3654  *
3655  * Adapted from riscv-v-spec recip.c:
3656  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3657  */
3658 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3659 {
3660     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3661     uint64_t exp = extract64(f, frac_size, exp_size);
3662     uint64_t frac = extract64(f, 0, frac_size);
3663 
3664     const uint8_t lookup_table[] = {
3665         52, 51, 50, 48, 47, 46, 44, 43,
3666         42, 41, 40, 39, 38, 36, 35, 34,
3667         33, 32, 31, 30, 30, 29, 28, 27,
3668         26, 25, 24, 23, 23, 22, 21, 20,
3669         19, 19, 18, 17, 16, 16, 15, 14,
3670         14, 13, 12, 12, 11, 10, 10, 9,
3671         9, 8, 7, 7, 6, 6, 5, 4,
3672         4, 3, 3, 2, 2, 1, 1, 0,
3673         127, 125, 123, 121, 119, 118, 116, 114,
3674         113, 111, 109, 108, 106, 105, 103, 102,
3675         100, 99, 97, 96, 95, 93, 92, 91,
3676         90, 88, 87, 86, 85, 84, 83, 82,
3677         80, 79, 78, 77, 76, 75, 74, 73,
3678         72, 71, 70, 70, 69, 68, 67, 66,
3679         65, 64, 63, 63, 62, 61, 60, 59,
3680         59, 58, 57, 56, 56, 55, 54, 53
3681     };
3682     const int precision = 7;
3683 
3684     if (exp == 0 && frac != 0) { /* subnormal */
3685         /* Normalize the subnormal. */
3686         while (extract64(frac, frac_size - 1, 1) == 0) {
3687             exp--;
3688             frac <<= 1;
3689         }
3690 
3691         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3692     }
3693 
3694     int idx = ((exp & 1) << (precision - 1)) |
3695                 (frac >> (frac_size - precision + 1));
3696     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3697                             (frac_size - precision);
3698     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3699 
3700     uint64_t val = 0;
3701     val = deposit64(val, 0, frac_size, out_frac);
3702     val = deposit64(val, frac_size, exp_size, out_exp);
3703     val = deposit64(val, frac_size + exp_size, 1, sign);
3704     return val;
3705 }
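     /*
      * Worked example (single precision): for f = 0.25 the biased
      * exponent is 125 (odd) and the fraction is 0, so idx = 64, the
      * table yields 127 and out_exp becomes 127; the returned estimate is
      * 1 + 127/128 = 1.9921875, i.e. 1/sqrt(0.25) = 2 to about 7 bits.
      */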
3706 
3707 static float16 frsqrt7_h(float16 f, float_status *s)
3708 {
3709     int exp_size = 5, frac_size = 10;
3710     bool sign = float16_is_neg(f);
3711 
3712     /*
3713      * frsqrt7(sNaN) = canonical NaN
3714      * frsqrt7(-inf) = canonical NaN
3715      * frsqrt7(-normal) = canonical NaN
3716      * frsqrt7(-subnormal) = canonical NaN
3717      */
3718     if (float16_is_signaling_nan(f, s) ||
3719             (float16_is_infinity(f) && sign) ||
3720             (float16_is_normal(f) && sign) ||
3721             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3722         s->float_exception_flags |= float_flag_invalid;
3723         return float16_default_nan(s);
3724     }
3725 
3726     /* frsqrt7(qNaN) = canonical NaN */
3727     if (float16_is_quiet_nan(f, s)) {
3728         return float16_default_nan(s);
3729     }
3730 
3731     /* frsqrt7(+-0) = +-inf */
3732     if (float16_is_zero(f)) {
3733         s->float_exception_flags |= float_flag_divbyzero;
3734         return float16_set_sign(float16_infinity, sign);
3735     }
3736 
3737     /* frsqrt7(+inf) = +0 */
3738     if (float16_is_infinity(f) && !sign) {
3739         return float16_set_sign(float16_zero, sign);
3740     }
3741 
3742     /* +normal, +subnormal */
3743     uint64_t val = frsqrt7(f, exp_size, frac_size);
3744     return make_float16(val);
3745 }
3746 
3747 static float32 frsqrt7_s(float32 f, float_status *s)
3748 {
3749     int exp_size = 8, frac_size = 23;
3750     bool sign = float32_is_neg(f);
3751 
3752     /*
3753      * frsqrt7(sNaN) = canonical NaN
3754      * frsqrt7(-inf) = canonical NaN
3755      * frsqrt7(-normal) = canonical NaN
3756      * frsqrt7(-subnormal) = canonical NaN
3757      */
3758     if (float32_is_signaling_nan(f, s) ||
3759             (float32_is_infinity(f) && sign) ||
3760             (float32_is_normal(f) && sign) ||
3761             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3762         s->float_exception_flags |= float_flag_invalid;
3763         return float32_default_nan(s);
3764     }
3765 
3766     /* frsqrt7(qNaN) = canonical NaN */
3767     if (float32_is_quiet_nan(f, s)) {
3768         return float32_default_nan(s);
3769     }
3770 
3771     /* frsqrt7(+-0) = +-inf */
3772     if (float32_is_zero(f)) {
3773         s->float_exception_flags |= float_flag_divbyzero;
3774         return float32_set_sign(float32_infinity, sign);
3775     }
3776 
3777     /* frsqrt7(+inf) = +0 */
3778     if (float32_is_infinity(f) && !sign) {
3779         return float32_set_sign(float32_zero, sign);
3780     }
3781 
3782     /* +normal, +subnormal */
3783     uint64_t val = frsqrt7(f, exp_size, frac_size);
3784     return make_float32(val);
3785 }
3786 
3787 static float64 frsqrt7_d(float64 f, float_status *s)
3788 {
3789     int exp_size = 11, frac_size = 52;
3790     bool sign = float64_is_neg(f);
3791 
3792     /*
3793      * frsqrt7(sNaN) = canonical NaN
3794      * frsqrt7(-inf) = canonical NaN
3795      * frsqrt7(-normal) = canonical NaN
3796      * frsqrt7(-subnormal) = canonical NaN
3797      */
3798     if (float64_is_signaling_nan(f, s) ||
3799             (float64_is_infinity(f) && sign) ||
3800             (float64_is_normal(f) && sign) ||
3801             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3802         s->float_exception_flags |= float_flag_invalid;
3803         return float64_default_nan(s);
3804     }
3805 
3806     /* frsqrt7(qNaN) = canonical NaN */
3807     if (float64_is_quiet_nan(f, s)) {
3808         return float64_default_nan(s);
3809     }
3810 
3811     /* frsqrt7(+-0) = +-inf */
3812     if (float64_is_zero(f)) {
3813         s->float_exception_flags |= float_flag_divbyzero;
3814         return float64_set_sign(float64_infinity, sign);
3815     }
3816 
3817     /* frsqrt7(+inf) = +0 */
3818     if (float64_is_infinity(f) && !sign) {
3819         return float64_set_sign(float64_zero, sign);
3820     }
3821 
3822     /* +normal, +subnormal */
3823     uint64_t val = frsqrt7(f, exp_size, frac_size);
3824     return make_float64(val);
3825 }
3826 
3827 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3828 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3829 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3830 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3831 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3832 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3833 
3834 /*
3835  * Vector Floating-Point Reciprocal Estimate Instruction
3836  *
3837  * Adapted from riscv-v-spec recip.c:
3838  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3839  */
3840 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3841                       float_status *s)
3842 {
3843     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3844     uint64_t exp = extract64(f, frac_size, exp_size);
3845     uint64_t frac = extract64(f, 0, frac_size);
3846 
3847     const uint8_t lookup_table[] = {
3848         127, 125, 123, 121, 119, 117, 116, 114,
3849         112, 110, 109, 107, 105, 104, 102, 100,
3850         99, 97, 96, 94, 93, 91, 90, 88,
3851         87, 85, 84, 83, 81, 80, 79, 77,
3852         76, 75, 74, 72, 71, 70, 69, 68,
3853         66, 65, 64, 63, 62, 61, 60, 59,
3854         58, 57, 56, 55, 54, 53, 52, 51,
3855         50, 49, 48, 47, 46, 45, 44, 43,
3856         42, 41, 40, 40, 39, 38, 37, 36,
3857         35, 35, 34, 33, 32, 31, 31, 30,
3858         29, 28, 28, 27, 26, 25, 25, 24,
3859         23, 23, 22, 21, 21, 20, 19, 19,
3860         18, 17, 17, 16, 15, 15, 14, 14,
3861         13, 12, 12, 11, 11, 10, 9, 9,
3862         8, 8, 7, 7, 6, 5, 5, 4,
3863         4, 3, 3, 2, 2, 1, 1, 0
3864     };
3865     const int precision = 7;
3866 
3867     if (exp == 0 && frac != 0) { /* subnormal */
3868         /* Normalize the subnormal. */
3869         while (extract64(frac, frac_size - 1, 1) == 0) {
3870             exp--;
3871             frac <<= 1;
3872         }
3873 
3874         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3875 
3876         if (exp != 0 && exp != UINT64_MAX) {
3877             /*
3878              * Overflow to inf or max value of same sign,
3879              * depending on sign and rounding mode.
3880              */
3881             s->float_exception_flags |= (float_flag_inexact |
3882                                          float_flag_overflow);
3883 
3884             if ((s->float_rounding_mode == float_round_to_zero) ||
3885                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3886                 ((s->float_rounding_mode == float_round_up) && sign)) {
3887                 /* Return the max-magnitude finite value of the same sign. */
3888                 return (sign << (exp_size + frac_size)) |
3889                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3890             } else {
3891                 /* Return +-inf. */
3892                 return (sign << (exp_size + frac_size)) |
3893                     MAKE_64BIT_MASK(frac_size, exp_size);
3894             }
3895         }
3896     }
3897 
3898     int idx = frac >> (frac_size - precision);
3899     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3900                             (frac_size - precision);
3901     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3902 
3903     if (out_exp == 0 || out_exp == UINT64_MAX) {
3904         /*
3905          * The result is subnormal, but don't raise the underflow exception,
3906          * because there's no additional loss of precision.
3907          */
3908         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3909         if (out_exp == UINT64_MAX) {
3910             out_frac >>= 1;
3911             out_exp = 0;
3912         }
3913     }
3914 
3915     uint64_t val = 0;
3916     val = deposit64(val, 0, frac_size, out_frac);
3917     val = deposit64(val, frac_size, exp_size, out_exp);
3918     val = deposit64(val, frac_size + exp_size, 1, sign);
3919     return val;
3920 }
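     /*
      * Worked example (single precision): for f = 2.0 the biased exponent
      * is 128 and the fraction is 0, so idx = 0, the table yields 127 and
      * out_exp becomes 2 * 127 - 129 = 125; the estimate is
      * 2^-2 * (1 + 127/128) ~= 0.498, i.e. 1/2.0 to about 7 bits.
      */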
3921 
3922 static float16 frec7_h(float16 f, float_status *s)
3923 {
3924     int exp_size = 5, frac_size = 10;
3925     bool sign = float16_is_neg(f);
3926 
3927     /* frec7(+-inf) = +-0 */
3928     if (float16_is_infinity(f)) {
3929         return float16_set_sign(float16_zero, sign);
3930     }
3931 
3932     /* frec7(+-0) = +-inf */
3933     if (float16_is_zero(f)) {
3934         s->float_exception_flags |= float_flag_divbyzero;
3935         return float16_set_sign(float16_infinity, sign);
3936     }
3937 
3938     /* frec7(sNaN) = canonical NaN */
3939     if (float16_is_signaling_nan(f, s)) {
3940         s->float_exception_flags |= float_flag_invalid;
3941         return float16_default_nan(s);
3942     }
3943 
3944     /* frec7(qNaN) = canonical NaN */
3945     if (float16_is_quiet_nan(f, s)) {
3946         return float16_default_nan(s);
3947     }
3948 
3949     /* +-normal, +-subnormal */
3950     uint64_t val = frec7(f, exp_size, frac_size, s);
3951     return make_float16(val);
3952 }
3953 
3954 static float32 frec7_s(float32 f, float_status *s)
3955 {
3956     int exp_size = 8, frac_size = 23;
3957     bool sign = float32_is_neg(f);
3958 
3959     /* frec7(+-inf) = +-0 */
3960     if (float32_is_infinity(f)) {
3961         return float32_set_sign(float32_zero, sign);
3962     }
3963 
3964     /* frec7(+-0) = +-inf */
3965     if (float32_is_zero(f)) {
3966         s->float_exception_flags |= float_flag_divbyzero;
3967         return float32_set_sign(float32_infinity, sign);
3968     }
3969 
3970     /* frec7(sNaN) = canonical NaN */
3971     if (float32_is_signaling_nan(f, s)) {
3972         s->float_exception_flags |= float_flag_invalid;
3973         return float32_default_nan(s);
3974     }
3975 
3976     /* frec7(qNaN) = canonical NaN */
3977     if (float32_is_quiet_nan(f, s)) {
3978         return float32_default_nan(s);
3979     }
3980 
3981     /* +-normal, +-subnormal */
3982     uint64_t val = frec7(f, exp_size, frac_size, s);
3983     return make_float32(val);
3984 }
3985 
3986 static float64 frec7_d(float64 f, float_status *s)
3987 {
3988     int exp_size = 11, frac_size = 52;
3989     bool sign = float64_is_neg(f);
3990 
3991     /* frec7(+-inf) = +-0 */
3992     if (float64_is_infinity(f)) {
3993         return float64_set_sign(float64_zero, sign);
3994     }
3995 
3996     /* frec7(+-0) = +-inf */
3997     if (float64_is_zero(f)) {
3998         s->float_exception_flags |= float_flag_divbyzero;
3999         return float64_set_sign(float64_infinity, sign);
4000     }
4001 
4002     /* frec7(sNaN) = canonical NaN */
4003     if (float64_is_signaling_nan(f, s)) {
4004         s->float_exception_flags |= float_flag_invalid;
4005         return float64_default_nan(s);
4006     }
4007 
4008     /* frec7(qNaN) = canonical NaN */
4009     if (float64_is_quiet_nan(f, s)) {
4010         return float64_default_nan(s);
4011     }
4012 
4013     /* +-normal, +-subnormal */
4014     uint64_t val = frec7(f, exp_size, frac_size, s);
4015     return make_float64(val);
4016 }
4017 
4018 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4019 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4020 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4021 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4022 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4023 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4024 
4025 /* Vector Floating-Point MIN/MAX Instructions */
4026 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4027 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4028 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4029 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4030 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4031 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4032 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4033 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4034 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4035 GEN_VEXT_VF(vfmin_vf_h, 2)
4036 GEN_VEXT_VF(vfmin_vf_w, 4)
4037 GEN_VEXT_VF(vfmin_vf_d, 8)
4038 
4039 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4040 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4041 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4042 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4043 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4044 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4045 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4046 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4047 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4048 GEN_VEXT_VF(vfmax_vf_h, 2)
4049 GEN_VEXT_VF(vfmax_vf_w, 4)
4050 GEN_VEXT_VF(vfmax_vf_d, 8)
4051 
4052 /* Vector Floating-Point Sign-Injection Instructions */
4053 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4054 {
4055     return deposit64(b, 0, 15, a);
4056 }
4057 
4058 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4059 {
4060     return deposit64(b, 0, 31, a);
4061 }
4062 
4063 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4064 {
4065     return deposit64(b, 0, 63, a);
4066 }
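     /*
      * deposit64(b, 0, 15, a) keeps only the sign bit of b and takes the
      * exponent and mantissa from a; with the OPFVV2 operand order
      * (a = vs2, b = vs1) this gives the vfsgnj semantics.  For example
      * fsgnj16(0x3c00, 0x8000), i.e. the magnitude of +1.0 with the sign
      * of -0.0, returns 0xbc00 (-1.0).
      */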
4067 
4068 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4069 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4070 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4071 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4072 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4073 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4074 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4075 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4076 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4077 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4078 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4079 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4080 
4081 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4082 {
4083     return deposit64(~b, 0, 15, a);
4084 }
4085 
4086 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4087 {
4088     return deposit64(~b, 0, 31, a);
4089 }
4090 
4091 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4092 {
4093     return deposit64(~b, 0, 63, a);
4094 }
4095 
4096 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4097 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4098 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4099 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4100 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4101 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4102 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4103 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4104 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4105 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4106 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4107 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4108 
4109 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4110 {
4111     return deposit64(b ^ a, 0, 15, a);
4112 }
4113 
4114 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4115 {
4116     return deposit64(b ^ a, 0, 31, a);
4117 }
4118 
4119 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4120 {
4121     return deposit64(b ^ a, 0, 63, a);
4122 }
4123 
4124 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4125 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4126 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4127 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4128 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4129 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4130 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4131 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4132 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4133 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4134 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4135 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4136 
4137 /* Vector Floating-Point Compare Instructions */
4138 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4139 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4140                   CPURISCVState *env, uint32_t desc)          \
4141 {                                                             \
4142     uint32_t vm = vext_vm(desc);                              \
4143     uint32_t vl = env->vl;                                    \
4144     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
4145     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4146     uint32_t i;                                               \
4147                                                               \
4148     for (i = env->vstart; i < vl; i++) {                      \
4149         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4150         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4151         if (!vm && !vext_elem_mask(v0, i)) {                  \
4152             continue;                                         \
4153         }                                                     \
4154         vext_set_elem_mask(vd, i,                             \
4155                            DO_OP(s2, s1, &env->fp_status));   \
4156     }                                                         \
4157     env->vstart = 0;                                          \
4158     /* mask destination registers are always tail-agnostic */ \
4159     /* set tail elements to 1s */                             \
4160     if (vta_all_1s) {                                         \
4161         for (; i < total_elems; i++) {                        \
4162             vext_set_elem_mask(vd, i, 1);                     \
4163         }                                                     \
4164     }                                                         \
4165 }
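/*
 * The compare helpers produce a mask register: one result bit per source
 * element, written with vext_set_elem_mask().  Inactive elements are
 * skipped, leaving their previous mask bit intact, and when the
 * tail-agnostic all-ones policy is selected the bits past vl are filled
 * with 1s, since mask-producing instructions always treat the tail as
 * agnostic.
 */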
4166 
4167 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4168 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4169 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4170 
4171 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4172 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4173                   CPURISCVState *env, uint32_t desc)                \
4174 {                                                                   \
4175     uint32_t vm = vext_vm(desc);                                    \
4176     uint32_t vl = env->vl;                                          \
4177     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
4178     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4179     uint32_t i;                                                     \
4180                                                                     \
4181     for (i = env->vstart; i < vl; i++) {                            \
4182         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4183         if (!vm && !vext_elem_mask(v0, i)) {                        \
4184             continue;                                               \
4185         }                                                           \
4186         vext_set_elem_mask(vd, i,                                   \
4187                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4188     }                                                               \
4189     env->vstart = 0;                                                \
4190     /* mask destination registers are always tail-agnostic */       \
4191     /* set tail elements to 1s */                                   \
4192     if (vta_all_1s) {                                               \
4193         for (; i < total_elems; i++) {                              \
4194             vext_set_elem_mask(vd, i, 1);                           \
4195         }                                                           \
4196     }                                                               \
4197 }
4198 
4199 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4200 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4201 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4202 
4203 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4204 {
4205     FloatRelation compare = float16_compare_quiet(a, b, s);
4206     return compare != float_relation_equal;
4207 }
4208 
4209 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4210 {
4211     FloatRelation compare = float32_compare_quiet(a, b, s);
4212     return compare != float_relation_equal;
4213 }
4214 
4215 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4216 {
4217     FloatRelation compare = float64_compare_quiet(a, b, s);
4218     return compare != float_relation_equal;
4219 }
4220 
4221 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4222 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4223 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4224 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4225 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4226 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4227 
4228 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4229 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4230 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4231 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4232 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4233 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4234 
4235 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4236 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4237 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4238 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4239 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4240 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
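/*
 * Per the RVV/IEEE semantics, vmfeq/vmfne use quiet comparisons (invalid
 * is raised only for signaling NaNs), while vmflt/vmfle and the
 * vmfgt/vmfge helpers below use signaling comparisons (float16/32/64_lt,
 * _le and _compare), which raise invalid for any NaN operand and yield a
 * result bit of 0.
 */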
4241 
4242 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4243 {
4244     FloatRelation compare = float16_compare(a, b, s);
4245     return compare == float_relation_greater;
4246 }
4247 
4248 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4249 {
4250     FloatRelation compare = float32_compare(a, b, s);
4251     return compare == float_relation_greater;
4252 }
4253 
4254 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4255 {
4256     FloatRelation compare = float64_compare(a, b, s);
4257     return compare == float_relation_greater;
4258 }
4259 
4260 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4261 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4262 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4263 
4264 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4265 {
4266     FloatRelation compare = float16_compare(a, b, s);
4267     return compare == float_relation_greater ||
4268            compare == float_relation_equal;
4269 }
4270 
4271 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4272 {
4273     FloatRelation compare = float32_compare(a, b, s);
4274     return compare == float_relation_greater ||
4275            compare == float_relation_equal;
4276 }
4277 
4278 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4279 {
4280     FloatRelation compare = float64_compare(a, b, s);
4281     return compare == float_relation_greater ||
4282            compare == float_relation_equal;
4283 }
4284 
4285 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4286 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4287 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4288 
4289 /* Vector Floating-Point Classify Instruction */
4290 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4291 static void do_##NAME(void *vd, void *vs2, int i)      \
4292 {                                                      \
4293     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4294     *((TD *)vd + HD(i)) = OP(s2);                      \
4295 }
4296 
4297 #define GEN_VEXT_V(NAME, ESZ)                          \
4298 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4299                   CPURISCVState *env, uint32_t desc)   \
4300 {                                                      \
4301     uint32_t vm = vext_vm(desc);                       \
4302     uint32_t vl = env->vl;                             \
4303     uint32_t total_elems =                             \
4304         vext_get_total_elems(env, desc, ESZ);          \
4305     uint32_t vta = vext_vta(desc);                     \
4306     uint32_t i;                                        \
4307                                                        \
4308     for (i = env->vstart; i < vl; i++) {               \
4309         if (!vm && !vext_elem_mask(v0, i)) {           \
4310             continue;                                  \
4311         }                                              \
4312         do_##NAME(vd, vs2, i);                         \
4313     }                                                  \
4314     env->vstart = 0;                                   \
4315     /* set tail elements to 1s */                      \
4316     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4317                       total_elems * ESZ);              \
4318 }
4319 
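/*
 * fclass_h/s/d return the standard RISC-V fclass bit encoding:
 *   bit 0: -infinity        bit 5: +subnormal
 *   bit 1: -normal          bit 6: +normal
 *   bit 2: -subnormal       bit 7: +infinity
 *   bit 3: -0               bit 8: signaling NaN
 *   bit 4: +0               bit 9: quiet NaN
 * Exactly one bit is set for any input.
 */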
4320 target_ulong fclass_h(uint64_t frs1)
4321 {
4322     float16 f = frs1;
4323     bool sign = float16_is_neg(f);
4324 
4325     if (float16_is_infinity(f)) {
4326         return sign ? 1 << 0 : 1 << 7;
4327     } else if (float16_is_zero(f)) {
4328         return sign ? 1 << 3 : 1 << 4;
4329     } else if (float16_is_zero_or_denormal(f)) {
4330         return sign ? 1 << 2 : 1 << 5;
4331     } else if (float16_is_any_nan(f)) {
4332         float_status s = { }; /* for snan_bit_is_one */
4333         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4334     } else {
4335         return sign ? 1 << 1 : 1 << 6;
4336     }
4337 }
4338 
4339 target_ulong fclass_s(uint64_t frs1)
4340 {
4341     float32 f = frs1;
4342     bool sign = float32_is_neg(f);
4343 
4344     if (float32_is_infinity(f)) {
4345         return sign ? 1 << 0 : 1 << 7;
4346     } else if (float32_is_zero(f)) {
4347         return sign ? 1 << 3 : 1 << 4;
4348     } else if (float32_is_zero_or_denormal(f)) {
4349         return sign ? 1 << 2 : 1 << 5;
4350     } else if (float32_is_any_nan(f)) {
4351         float_status s = { }; /* for snan_bit_is_one */
4352         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4353     } else {
4354         return sign ? 1 << 1 : 1 << 6;
4355     }
4356 }
4357 
4358 target_ulong fclass_d(uint64_t frs1)
4359 {
4360     float64 f = frs1;
4361     bool sign = float64_is_neg(f);
4362 
4363     if (float64_is_infinity(f)) {
4364         return sign ? 1 << 0 : 1 << 7;
4365     } else if (float64_is_zero(f)) {
4366         return sign ? 1 << 3 : 1 << 4;
4367     } else if (float64_is_zero_or_denormal(f)) {
4368         return sign ? 1 << 2 : 1 << 5;
4369     } else if (float64_is_any_nan(f)) {
4370         float_status s = { }; /* for snan_bit_is_one */
4371         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4372     } else {
4373         return sign ? 1 << 1 : 1 << 6;
4374     }
4375 }
4376 
4377 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4378 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4379 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4380 GEN_VEXT_V(vfclass_v_h, 2)
4381 GEN_VEXT_V(vfclass_v_w, 4)
4382 GEN_VEXT_V(vfclass_v_d, 8)
4383 
4384 /* Vector Floating-Point Merge Instruction */
4385 
4386 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4387 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4388                   CPURISCVState *env, uint32_t desc)          \
4389 {                                                             \
4390     uint32_t vm = vext_vm(desc);                              \
4391     uint32_t vl = env->vl;                                    \
4392     uint32_t esz = sizeof(ETYPE);                             \
4393     uint32_t total_elems =                                    \
4394         vext_get_total_elems(env, desc, esz);                 \
4395     uint32_t vta = vext_vta(desc);                            \
4396     uint32_t i;                                               \
4397                                                               \
4398     for (i = env->vstart; i < vl; i++) {                      \
4399         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4400         *((ETYPE *)vd + H(i))                                 \
4401           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4402     }                                                         \
4403     env->vstart = 0;                                          \
4404     /* set tail elements to 1s */                             \
4405     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4406 }
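/*
 * vfmerge.vfm: active (or unmasked) elements receive the scalar s1, while
 * masked-off elements receive the corresponding element of vs2.  With
 * vm = 1 every element gets s1, i.e. the helper degenerates to a
 * vector-scalar splat.
 */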
4407 
4408 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4409 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4410 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4411 
4412 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4413 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4414 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4415 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4416 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4417 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4418 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4419 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4420 
4421 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4422 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4423 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4424 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4425 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4426 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4427 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4428 
4429 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4430 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4431 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4432 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4433 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4434 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4435 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4436 
4437 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4438 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4439 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4440 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4441 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4442 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4443 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4444 
4445 /* Widening Floating-Point/Integer Type-Convert Instructions */
4446 /* (TD, T2, TX2) */
4447 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4448 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4449 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4450 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
4451 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4452 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4453 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4454 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4455 
4456 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4457 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4458 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4459 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4460 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4461 
4462 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4463 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4464 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4465 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4466 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4467 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4468 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4469 
4470 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4471 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4472 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4473 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4474 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4475 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4476 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4477 
4478 /*
4479  * vfwcvt.f.f.v vd, vs2, vm
4480  * Convert single-width float to double-width float.
4481  */
4482 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4483 {
4484     return float16_to_float32(a, true, s);
4485 }
4486 
4487 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4488 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4489 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4490 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
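/*
 * The 'true' argument selects IEEE half-precision (as opposed to the ARM
 * alternative-half format) in softfloat's float16_to_float32(); the small
 * wrapper exists only to drop that extra parameter so the conversion fits
 * the common OPFVV1 call signature.
 */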
4491 
4492 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4493 /* (TD, T2, TX2) */
4494 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4495 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4496 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4497 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4498 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4499 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4500 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4501 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4502 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4503 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4504 
4505 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4506 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4507 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4508 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4509 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4510 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4511 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4512 
4513 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4514 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4515 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4516 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4517 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4518 
4519 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4520 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4521 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4522 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4523 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4524 
4525 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4526 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4527 {
4528     return float32_to_float16(a, true, s);
4529 }
4530 
4531 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4532 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4533 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4534 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4535 
4536 /*
4537  *** Vector Reduction Operations
4538  */
4539 /* Vector Single-Width Integer Reduction Instructions */
4540 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4541 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4542         void *vs2, CPURISCVState *env, uint32_t desc)     \
4543 {                                                         \
4544     uint32_t vm = vext_vm(desc);                          \
4545     uint32_t vl = env->vl;                                \
4546     uint32_t esz = sizeof(TD);                            \
4547     uint32_t vlenb = simd_maxsz(desc);                    \
4548     uint32_t vta = vext_vta(desc);                        \
4549     uint32_t i;                                           \
4550     TD s1 =  *((TD *)vs1 + HD(0));                        \
4551                                                           \
4552     for (i = env->vstart; i < vl; i++) {                  \
4553         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4554         if (!vm && !vext_elem_mask(v0, i)) {              \
4555             continue;                                     \
4556         }                                                 \
4557         s1 = OP(s1, (TD)s2);                              \
4558     }                                                     \
4559     *((TD *)vd + HD(0)) = s1;                             \
4560     env->vstart = 0;                                      \
4561     /* set tail elements to 1s */                         \
4562     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4563 }
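/*
 * Reductions consume vs2[*] and element 0 of vs1 and write only element 0
 * of vd; everything past element 0 of vd is tail, so the tail fill uses
 * vlenb (simd_maxsz(desc)) rather than vl.  A small worked example,
 * assuming vredsum.vs with vl = 4, vs1[0] = 10 and vs2 = {1, 2, 3, 4}:
 * the accumulator starts at 10 and ends at 20, which is stored to vd[0].
 */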
4564 
4565 /* vd[0] = sum(vs1[0], vs2[*]) */
4566 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4567 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4568 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4569 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4570 
4571 /* vd[0] = maxu(vs1[0], vs2[*]) */
4572 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4573 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4574 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4575 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4576 
4577 /* vd[0] = max(vs1[0], vs2[*]) */
4578 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4579 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4580 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4581 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4582 
4583 /* vd[0] = minu(vs1[0], vs2[*]) */
4584 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4585 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4586 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4587 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4588 
4589 /* vd[0] = min(vs1[0], vs2[*]) */
4590 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4591 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4592 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4593 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4594 
4595 /* vd[0] = and(vs1[0], vs2[*]) */
4596 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4597 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4598 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4599 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4600 
4601 /* vd[0] = or(vs1[0], vs2[*]) */
4602 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4603 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4604 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4605 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4606 
4607 /* vd[0] = xor(vs1[0], vs2[*]) */
4608 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4609 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4610 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4611 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4612 
4613 /* Vector Widening Integer Reduction Instructions */
4614 /* signed sum reduction into double-width accumulator */
4615 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4616 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4617 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4618 
4619 /* Unsigned sum reduction into double-width accumulator */
4620 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4621 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4622 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4623 
4624 /* Vector Single-Width Floating-Point Reduction Instructions */
4625 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4626 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4627                   void *vs2, CPURISCVState *env,           \
4628                   uint32_t desc)                           \
4629 {                                                          \
4630     uint32_t vm = vext_vm(desc);                           \
4631     uint32_t vl = env->vl;                                 \
4632     uint32_t esz = sizeof(TD);                             \
4633     uint32_t vlenb = simd_maxsz(desc);                     \
4634     uint32_t vta = vext_vta(desc);                         \
4635     uint32_t i;                                            \
4636     TD s1 =  *((TD *)vs1 + HD(0));                         \
4637                                                            \
4638     for (i = env->vstart; i < vl; i++) {                   \
4639         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4640         if (!vm && !vext_elem_mask(v0, i)) {               \
4641             continue;                                      \
4642         }                                                  \
4643         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4644     }                                                      \
4645     *((TD *)vd + HD(0)) = s1;                              \
4646     env->vstart = 0;                                       \
4647     /* set tail elements to 1s */                          \
4648     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4649 }
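/*
 * For the unordered sum below, the spec allows the additions to be
 * associated in any order; the strictly sequential accumulation performed
 * by this macro is one legal ordering.  For max/min the order does not
 * matter.
 */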
4650 
4651 /* Unordered sum */
4652 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4653 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4654 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4655 
4656 /* Maximum value */
4657 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4658 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4659 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4660 
4661 /* Minimum value */
4662 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4663 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4664 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4665 
4666 /* Vector Widening Floating-Point Reduction Instructions */
4667 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4668 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4669                             void *vs2, CPURISCVState *env, uint32_t desc)
4670 {
4671     uint32_t vm = vext_vm(desc);
4672     uint32_t vl = env->vl;
4673     uint32_t esz = sizeof(uint32_t);
4674     uint32_t vlenb = simd_maxsz(desc);
4675     uint32_t vta = vext_vta(desc);
4676     uint32_t i;
4677     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4678 
4679     for (i = env->vstart; i < vl; i++) {
4680         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4681         if (!vm && !vext_elem_mask(v0, i)) {
4682             continue;
4683         }
4684         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4685                          &env->fp_status);
4686     }
4687     *((uint32_t *)vd + H4(0)) = s1;
4688     env->vstart = 0;
4689     /* set tail elements to 1s */
4690     vext_set_elems_1s(vd, vta, esz, vlenb);
4691 }
4692 
4693 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4694                             void *vs2, CPURISCVState *env, uint32_t desc)
4695 {
4696     uint32_t vm = vext_vm(desc);
4697     uint32_t vl = env->vl;
4698     uint32_t esz = sizeof(uint64_t);
4699     uint32_t vlenb = simd_maxsz(desc);
4700     uint32_t vta = vext_vta(desc);
4701     uint32_t i;
4702     uint64_t s1 =  *((uint64_t *)vs1);
4703 
4704     for (i = env->vstart; i < vl; i++) {
4705         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4706         if (!vm && !vext_elem_mask(v0, i)) {
4707             continue;
4708         }
4709         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4710                          &env->fp_status);
4711     }
4712     *((uint64_t *)vd) = s1;
4713     env->vstart = 0;
4714     /* set tail elements to 1s */
4715     vext_set_elems_1s(vd, vta, esz, vlenb);
4716 }
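/*
 * Widening FP sum reduction: each SEW element of vs2 is first promoted to
 * 2*SEW (float16->float32 and float32->float64 respectively) and then
 * added to the 2*SEW accumulator taken from vs1[0]; only element 0 of vd
 * is written, the remainder of the register is tail.
 */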
4717 
4718 /*
4719  *** Vector Mask Operations
4720  */
4721 /* Vector Mask-Register Logical Instructions */
4722 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4723 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4724                   void *vs2, CPURISCVState *env,          \
4725                   uint32_t desc)                          \
4726 {                                                         \
4727     uint32_t vl = env->vl;                                \
4728     uint32_t total_elems = env_archcpu(env)->cfg.vlen;    \
4729     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4730     uint32_t i;                                           \
4731     int a, b;                                             \
4732                                                           \
4733     for (i = env->vstart; i < vl; i++) {                  \
4734         a = vext_elem_mask(vs1, i);                       \
4735         b = vext_elem_mask(vs2, i);                       \
4736         vext_set_elem_mask(vd, i, OP(b, a));              \
4737     }                                                     \
4738     env->vstart = 0;                                      \
4739     /* mask destination registers are always             \
4740      * tail-agnostic                                      \
4741      */                                                   \
4742     /* set tail elements to 1s */                         \
4743     if (vta_all_1s) {                                     \
4744         for (; i < total_elems; i++) {                    \
4745             vext_set_elem_mask(vd, i, 1);                 \
4746         }                                                 \
4747     }                                                     \
4748 }
4749 
4750 #define DO_NAND(N, M)  (!(N & M))
4751 #define DO_ANDNOT(N, M)  (N & !M)
4752 #define DO_NOR(N, M)  (!(N | M))
4753 #define DO_ORNOT(N, M)  (N | !M)
4754 #define DO_XNOR(N, M)  (!(N ^ M))
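/*
 * vext_elem_mask() returns 0 or 1, so the logical negations in the DO_*
 * helpers above are well defined; e.g. DO_ANDNOT(1, 1) == (1 & !1) == 0
 * and DO_XNOR(0, 0) == 1.
 */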
4755 
4756 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4757 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4758 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4759 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4760 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4761 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4762 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4763 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4764 
4765 /* Vector count population in mask vcpop */
4766 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4767                              uint32_t desc)
4768 {
4769     target_ulong cnt = 0;
4770     uint32_t vm = vext_vm(desc);
4771     uint32_t vl = env->vl;
4772     int i;
4773 
4774     for (i = env->vstart; i < vl; i++) {
4775         if (vm || vext_elem_mask(v0, i)) {
4776             if (vext_elem_mask(vs2, i)) {
4777                 cnt++;
4778             }
4779         }
4780     }
4781     env->vstart = 0;
4782     return cnt;
4783 }
4784 
4785 /* vfirst find-first-set mask bit */
4786 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4787                               uint32_t desc)
4788 {
4789     uint32_t vm = vext_vm(desc);
4790     uint32_t vl = env->vl;
4791     int i;
4792 
4793     for (i = env->vstart; i < vl; i++) {
4794         if (vm || vext_elem_mask(v0, i)) {
4795             if (vext_elem_mask(vs2, i)) {
4796                 return i;
4797             }
4798         }
4799     }
4800     env->vstart = 0;
4801     return -1LL;
4802 }
4803 
4804 enum set_mask_type {
4805     ONLY_FIRST = 1,
4806     INCLUDE_FIRST,
4807     BEFORE_FIRST,
4808 };
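/*
 * The three variants differ only in what is written at and before the
 * first set bit of vs2:
 *   BEFORE_FIRST  (vmsbf): 1s strictly before the first set bit
 *   INCLUDE_FIRST (vmsif): 1s up to and including the first set bit
 *   ONLY_FIRST    (vmsof): a single 1 at the first set bit
 * Everything after the first set bit is written as 0.  For example, with
 * active vs2 bits {0, 0, 1, 0, 1}: vmsbf -> {1, 1, 0, 0, 0},
 * vmsif -> {1, 1, 1, 0, 0}, vmsof -> {0, 0, 1, 0, 0}.
 */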
4809 
4810 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4811                    uint32_t desc, enum set_mask_type type)
4812 {
4813     uint32_t vm = vext_vm(desc);
4814     uint32_t vl = env->vl;
4815     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4816     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4817     int i;
4818     bool first_mask_bit = false;
4819 
4820     for (i = env->vstart; i < vl; i++) {
4821         if (!vm && !vext_elem_mask(v0, i)) {
4822             continue;
4823         }
4824         /* write a zero to all following active elements */
4825         if (first_mask_bit) {
4826             vext_set_elem_mask(vd, i, 0);
4827             continue;
4828         }
4829         if (vext_elem_mask(vs2, i)) {
4830             first_mask_bit = true;
4831             if (type == BEFORE_FIRST) {
4832                 vext_set_elem_mask(vd, i, 0);
4833             } else {
4834                 vext_set_elem_mask(vd, i, 1);
4835             }
4836         } else {
4837             if (type == ONLY_FIRST) {
4838                 vext_set_elem_mask(vd, i, 0);
4839             } else {
4840                 vext_set_elem_mask(vd, i, 1);
4841             }
4842         }
4843     }
4844     env->vstart = 0;
4845     /* mask destination registers are always tail-agnostic */
4846     /* set tail elements to 1s */
4847     if (vta_all_1s) {
4848         for (; i < total_elems; i++) {
4849             vext_set_elem_mask(vd, i, 1);
4850         }
4851     }
4852 }
4853 
4854 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4855                      uint32_t desc)
4856 {
4857     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4858 }
4859 
4860 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4861                      uint32_t desc)
4862 {
4863     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4864 }
4865 
4866 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4867                      uint32_t desc)
4868 {
4869     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4870 }
4871 
4872 /* Vector Iota Instruction */
4873 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4874 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4875                   uint32_t desc)                                          \
4876 {                                                                         \
4877     uint32_t vm = vext_vm(desc);                                          \
4878     uint32_t vl = env->vl;                                                \
4879     uint32_t esz = sizeof(ETYPE);                                         \
4880     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4881     uint32_t vta = vext_vta(desc);                                        \
4882     uint32_t sum = 0;                                                     \
4883     int i;                                                                \
4884                                                                           \
4885     for (i = env->vstart; i < vl; i++) {                                  \
4886         if (!vm && !vext_elem_mask(v0, i)) {                              \
4887             continue;                                                     \
4888         }                                                                 \
4889         *((ETYPE *)vd + H(i)) = sum;                                      \
4890         if (vext_elem_mask(vs2, i)) {                                     \
4891             sum++;                                                        \
4892         }                                                                 \
4893     }                                                                     \
4894     env->vstart = 0;                                                      \
4895     /* set tail elements to 1s */                                         \
4896     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4897 }
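/*
 * viota.m writes to each active destination element the number of set
 * mask bits of vs2 at positions below it.  For example, with vl = 5 and
 * vs2 bits {1, 0, 1, 1, 0}, vd becomes {0, 1, 1, 2, 3}.
 */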
4898 
4899 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4900 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4901 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4902 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4903 
4904 /* Vector Element Index Instruction */
4905 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4906 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4907 {                                                                         \
4908     uint32_t vm = vext_vm(desc);                                          \
4909     uint32_t vl = env->vl;                                                \
4910     uint32_t esz = sizeof(ETYPE);                                         \
4911     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4912     uint32_t vta = vext_vta(desc);                                        \
4913     int i;                                                                \
4914                                                                           \
4915     for (i = env->vstart; i < vl; i++) {                                  \
4916         if (!vm && !vext_elem_mask(v0, i)) {                              \
4917             continue;                                                     \
4918         }                                                                 \
4919         *((ETYPE *)vd + H(i)) = i;                                        \
4920     }                                                                     \
4921     env->vstart = 0;                                                      \
4922     /* set tail elements to 1s */                                         \
4923     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4924 }
4925 
4926 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4927 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4928 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4929 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4930 
4931 /*
4932  *** Vector Permutation Instructions
4933  */
4934 
4935 /* Vector Slide Instructions */
4936 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4937 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4938                   CPURISCVState *env, uint32_t desc)                      \
4939 {                                                                         \
4940     uint32_t vm = vext_vm(desc);                                          \
4941     uint32_t vl = env->vl;                                                \
4942     uint32_t esz = sizeof(ETYPE);                                         \
4943     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4944     uint32_t vta = vext_vta(desc);                                        \
4945     target_ulong offset = s1, i_min, i;                                   \
4946                                                                           \
4947     i_min = MAX(env->vstart, offset);                                     \
4948     for (i = i_min; i < vl; i++) {                                        \
4949         if (!vm && !vext_elem_mask(v0, i)) {                              \
4950             continue;                                                     \
4951         }                                                                 \
4952         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4953     }                                                                     \
    env->vstart = 0;                                                      \
4954     /* set tail elements to 1s */                                         \
4955     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4956 }
4957 
4958 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4959 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4960 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4961 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4962 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4963 
4964 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4965 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4966                   CPURISCVState *env, uint32_t desc)                      \
4967 {                                                                         \
4968     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4969     uint32_t vm = vext_vm(desc);                                          \
4970     uint32_t vl = env->vl;                                                \
4971     uint32_t esz = sizeof(ETYPE);                                         \
4972     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4973     uint32_t vta = vext_vta(desc);                                        \
4974     target_ulong i_max, i;                                                \
4975                                                                           \
4976     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4977     for (i = env->vstart; i < i_max; ++i) {                               \
4978         if (vm || vext_elem_mask(v0, i)) {                                \
4979             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4980         }                                                                 \
4981     }                                                                     \
4982                                                                           \
4983     for (i = i_max; i < vl; ++i) {                                        \
4984         if (vm || vext_elem_mask(v0, i)) {                                \
4985             *((ETYPE *)vd + H(i)) = 0;                                    \
4986         }                                                                 \
4987     }                                                                     \
4988                                                                           \
4989     env->vstart = 0;                                                      \
4990     /* set tail elements to 1s */                                         \
4991     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4992 }
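/*
 * For vslidedown, source elements at index i + s1 at or beyond VLMAX read
 * as zero, so i_max is the first destination index whose source would be
 * out of range (clamped to vl and bounded below by vstart); the second
 * loop then zeroes the remaining active elements up to vl.
 */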
4993 
4994 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4995 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4996 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4997 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4998 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4999 
5000 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5001 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5002                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5003 {                                                                           \
5004     typedef uint##BITWIDTH##_t ETYPE;                                       \
5005     uint32_t vm = vext_vm(desc);                                            \
5006     uint32_t vl = env->vl;                                                  \
5007     uint32_t esz = sizeof(ETYPE);                                           \
5008     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5009     uint32_t vta = vext_vta(desc);                                          \
5010     uint32_t i;                                                             \
5011                                                                             \
5012     for (i = env->vstart; i < vl; i++) {                                    \
5013         if (!vm && !vext_elem_mask(v0, i)) {                                \
5014             continue;                                                       \
5015         }                                                                   \
5016         if (i == 0) {                                                       \
5017             *((ETYPE *)vd + H(i)) = s1;                                     \
5018         } else {                                                            \
5019             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5020         }                                                                   \
5021     }                                                                       \
5022     env->vstart = 0;                                                        \
5023     /* set tail elements to 1s */                                           \
5024     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5025 }
5026 
5027 GEN_VEXT_VSLIE1UP(8,  H1)
5028 GEN_VEXT_VSLIE1UP(16, H2)
5029 GEN_VEXT_VSLIE1UP(32, H4)
5030 GEN_VEXT_VSLIE1UP(64, H8)
5031 
5032 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5033 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5034                   CPURISCVState *env, uint32_t desc)              \
5035 {                                                                 \
5036     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5037 }
5038 
5039 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5040 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5041 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5042 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5043 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5044 
5045 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5046 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5047                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5048 {                                                                             \
5049     typedef uint##BITWIDTH##_t ETYPE;                                         \
5050     uint32_t vm = vext_vm(desc);                                              \
5051     uint32_t vl = env->vl;                                                    \
5052     uint32_t esz = sizeof(ETYPE);                                             \
5053     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5054     uint32_t vta = vext_vta(desc);                                            \
5055     uint32_t i;                                                               \
5056                                                                               \
5057     for (i = env->vstart; i < vl; i++) {                                      \
5058         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5059             continue;                                                         \
5060         }                                                                     \
5061         if (i == vl - 1) {                                                    \
5062             *((ETYPE *)vd + H(i)) = s1;                                       \
5063         } else {                                                              \
5064             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5065         }                                                                     \
5066     }                                                                         \
5067     env->vstart = 0;                                                          \
5068     /* set tail elements to 1s */                                             \
5069     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5070 }
5071 
5072 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5073 GEN_VEXT_VSLIDE1DOWN(16, H2)
5074 GEN_VEXT_VSLIDE1DOWN(32, H4)
5075 GEN_VEXT_VSLIDE1DOWN(64, H8)
5076 
5077 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5078 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5079                   CPURISCVState *env, uint32_t desc)              \
5080 {                                                                 \
5081     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5082 }
5083 
5084 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5085 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5086 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5087 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5088 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5089 
5090 /* Vector Floating-Point Slide Instructions */
5091 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5092 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5093                   CPURISCVState *env, uint32_t desc)          \
5094 {                                                             \
5095     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5096 }
5097 
5098 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5099 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5100 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5101 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5102 
5103 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5104 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5105                   CPURISCVState *env, uint32_t desc)          \
5106 {                                                             \
5107     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5108 }
5109 
5110 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5111 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5112 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5113 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
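/*
 * The floating-point slide1 variants reuse the integer vslide1up/down
 * implementations unchanged; the only difference is that the scalar
 * operand is taken from the FP register file by the caller.
 */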
5114 
5115 /* Vector Register Gather Instruction */
5116 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5117 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5118                   CPURISCVState *env, uint32_t desc)                      \
5119 {                                                                         \
5120     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5121     uint32_t vm = vext_vm(desc);                                          \
5122     uint32_t vl = env->vl;                                                \
5123     uint32_t esz = sizeof(TS2);                                           \
5124     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5125     uint32_t vta = vext_vta(desc);                                        \
5126     uint64_t index;                                                       \
5127     uint32_t i;                                                           \
5128                                                                           \
5129     for (i = env->vstart; i < vl; i++) {                                  \
5130         if (!vm && !vext_elem_mask(v0, i)) {                              \
5131             continue;                                                     \
5132         }                                                                 \
5133         index = *((TS1 *)vs1 + HS1(i));                                   \
5134         if (index >= vlmax) {                                             \
5135             *((TS2 *)vd + HS2(i)) = 0;                                    \
5136         } else {                                                          \
5137             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5138         }                                                                 \
5139     }                                                                     \
5140     env->vstart = 0;                                                      \
5141     /* set tail elements to 1s */                                         \
5142     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5143 }
5144 
5145 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5146 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5147 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5148 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5149 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5150 
5151 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5152 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5153 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5154 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
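/*
 * vrgatherei16 always uses 16-bit indices (TS1 = uint16_t) regardless of
 * SEW, which is why it needs its own instantiations; in both forms an
 * index >= VLMAX writes 0 to the destination element.
 */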
5155 
5156 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5157 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5158                   CPURISCVState *env, uint32_t desc)                      \
5159 {                                                                         \
5160     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5161     uint32_t vm = vext_vm(desc);                                          \
5162     uint32_t vl = env->vl;                                                \
5163     uint32_t esz = sizeof(ETYPE);                                         \
5164     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5165     uint32_t vta = vext_vta(desc);                                        \
5166     uint64_t index = s1;                                                  \
5167     uint32_t i;                                                           \
5168                                                                           \
5169     for (i = env->vstart; i < vl; i++) {                                  \
5170         if (!vm && !vext_elem_mask(v0, i)) {                              \
5171             continue;                                                     \
5172         }                                                                 \
5173         if (index >= vlmax) {                                             \
5174             *((ETYPE *)vd + H(i)) = 0;                                    \
5175         } else {                                                          \
5176             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5177         }                                                                 \
5178     }                                                                     \
5179     env->vstart = 0;                                                      \
5180     /* set tail elements to 1s */                                         \
5181     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5182 }
5183 
5184 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5185 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5186 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5187 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5188 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
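/*
 * Documentation-only sketch (not compiled): vrgather.vx gathers the single
 * element selected by the scalar x[rs1] and writes it to every active
 * destination element, or 0 when the scalar index is >= vlmax.  With
 * s1 = 2 and vs2 = {10, 20, 30, 40}, every active vd[i] becomes 30.
 * Hypothetical name, flat arrays, no masking or tail handling.
 */
#if 0
static void vrgather_vx_u8_ref(uint8_t *vd, uint64_t s1, const uint8_t *vs2,
                               uint32_t vl, uint32_t vlmax)
{
    uint8_t value = (s1 >= vlmax) ? 0 : vs2[s1];

    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = value;
    }
}
#endif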
5189 
5190 /* Vector Compress Instruction */
5191 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5192 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5193                   CPURISCVState *env, uint32_t desc)                      \
5194 {                                                                         \
5195     uint32_t vl = env->vl;                                                \
5196     uint32_t esz = sizeof(ETYPE);                                         \
5197     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5198     uint32_t vta = vext_vta(desc);                                        \
5199     uint32_t num = 0, i;                                                  \
5200                                                                           \
5201     for (i = env->vstart; i < vl; i++) {                                  \
5202         if (!vext_elem_mask(vs1, i)) {                                    \
5203             continue;                                                     \
5204         }                                                                 \
5205         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5206         num++;                                                            \
5207     }                                                                     \
5208     env->vstart = 0;                                                      \
5209     /* set tail elements to 1s */                                         \
5210     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5211 }
5212 
5213 /* Compress into vd the elements of vs2 whose corresponding bit in vs1 is set */
5214 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5215 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5216 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5217 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
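/*
 * Documentation-only sketch (not compiled): vcompress.vm packs the elements
 * of vs2 whose bit is set in the vs1 mask contiguously from element 0 of vd.
 * With vs2 = {10, 20, 30, 40} and mask bits {0, 1, 1, 0}, the packed result
 * is vd = {20, 30, ...}, the rest being left to tail handling.  Hypothetical
 * name; the mask is modelled as one byte per element instead of the packed
 * mask register read by vext_elem_mask().
 */
#if 0
static uint32_t vcompress_u8_ref(uint8_t *vd, const uint8_t *mask,
                                 const uint8_t *vs2, uint32_t vl)
{
    uint32_t num = 0;

    for (uint32_t i = 0; i < vl; i++) {
        if (mask[i]) {
            vd[num++] = vs2[i];
        }
    }
    return num; /* number of elements written */
}
#endif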
5218 
5219 /* Vector Whole Register Move */
void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    /* EEW = SEW */
    uint32_t maxsz = simd_maxsz(desc);
    uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
    uint32_t startb = env->vstart * sewb;
    uint32_t i = startb;

    /*
     * On a big-endian host the bytes of each 64-bit chunk are stored
     * swizzled (see the H*() macros), so a byte-wise copy must not start
     * at a swizzled offset: copy any leading partial chunk byte-exactly
     * first, then copy the remaining whole chunks with plain offsets,
     * which preserves the identical layout of source and destination.
     */
    if (HOST_BIG_ENDIAN && i % 8 != 0) {
        uint32_t j = ROUND_UP(i, 8);

        memcpy((uint8_t *)vd + H1(j - 1),
               (uint8_t *)vs2 + H1(j - 1),
               j - i);
        i = j;
    }

    memcpy((uint8_t *)vd + i,
           (uint8_t *)vs2 + i,
           maxsz - i);

    env->vstart = 0;
}
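/*
 * Worked example for the big-endian path above: with the big-endian H1()
 * mapping (k ^ 7 within each 8-byte chunk), logical bytes k = 3..7 of the
 * first chunk live at host offsets 4, 3, 2, 1, 0 -- a contiguous run of
 * j - i = 5 bytes whose lowest host address is H1(j - 1) = H1(7) = 0.
 * That is why the partial-chunk copy starts at H1(j - 1) with length
 * j - i before the bulk memcpy takes over at the chunk boundary.
 */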
5234 
5235 /* Vector Integer Extension */
5236 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5237 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5238                   CPURISCVState *env, uint32_t desc)             \
5239 {                                                                \
5240     uint32_t vl = env->vl;                                       \
5241     uint32_t vm = vext_vm(desc);                                 \
5242     uint32_t esz = sizeof(ETYPE);                                \
5243     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5244     uint32_t vta = vext_vta(desc);                               \
5245     uint32_t i;                                                  \
5246                                                                  \
5247     for (i = env->vstart; i < vl; i++) {                         \
5248         if (!vm && !vext_elem_mask(v0, i)) {                     \
5249             continue;                                            \
5250         }                                                        \
5251         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5252     }                                                            \
5253     env->vstart = 0;                                             \
5254     /* set tail elements to 1s */                                \
5255     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5256 }
5257 
5258 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5259 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5260 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5261 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5262 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5263 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5264 
5265 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5266 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5267 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5268 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5269 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5270 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
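/*
 * Documentation-only sketch (not compiled): the vf2 variants widen each
 * source element to twice its width, vf4 to four times, vf8 to eight.
 * vzext zero-extends and vsext sign-extends, e.g. a source byte 0x80
 * becomes 0x0080 under vzext.vf2 but 0xff80 under vsext.vf2.  Hypothetical
 * names, flat arrays, no masking or tail handling.
 */
#if 0
static void vzext_vf2_h_ref(uint16_t *vd, const uint8_t *vs2, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = vs2[i];             /* zero-extend u8 -> u16 */
    }
}

static void vsext_vf2_h_ref(int16_t *vd, const int8_t *vs2, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = vs2[i];             /* sign-extend i8 -> i16 */
    }
}
#endif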
5271