xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 5f88dd43)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg-gvec-desc.h"
29 #include "internals.h"
30 #include <math.h>
31 
/*
 * vsetvl/vsetvli helper: validate the requested vtype (s2) and set the
 * new vector configuration.
 *
 * @s1: requested application vector length (AVL)
 * @s2: requested vtype value
 *
 * Returns the new vl.  On an illegal vtype (unsupported SEW/LMUL
 * combination, EDIV set, reserved bits set, or vill requested) only the
 * vill bit is raised, the rest of the state is zeroed, and 0 is returned.
 */
target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
                            target_ulong s2)
{
    int vlmax, vl;
    RISCVCPU *cpu = env_archcpu(env);
    uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
    uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);    /* SEW in bits */
    uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
    int xlen = riscv_cpu_xlen(env);
    /* vill is the most-significant bit of vtype (bit xlen-1) */
    bool vill = (s2 >> (xlen - 1)) & 0x1;
    /* any bit set between the defined fields and vill is reserved */
    target_ulong reserved = s2 &
                            MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
                                            xlen - 1 - R_VTYPE_RESERVED_SHIFT);

    if (lmul & 4) {
        /* Fractional LMUL - check LMUL * VLEN >= SEW */
        if (lmul == 4 ||
            cpu->cfg.vlen >> (8 - lmul) < sew) {
            vill = true;
        }
    }

    if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
        /* only set vill bit. */
        env->vill = 1;
        env->vtype = 0;
        env->vl = 0;
        env->vstart = 0;
        return 0;
    }

    /* clamp the requested AVL to VLMAX for the new vtype */
    vlmax = vext_get_vlmax(cpu, s2);
    if (s1 <= vlmax) {
        vl = s1;
    } else {
        vl = vlmax;
    }
    env->vl = vl;
    env->vtype = s2;
    env->vstart = 0;
    env->vill = 0;
    return vl;
}
75 
76 /*
77  * Note that vector data is stored in host-endian 64-bit chunks,
78  * so addressing units smaller than that needs a host-endian fixup.
79  */
80 #if HOST_BIG_ENDIAN
81 #define H1(x)   ((x) ^ 7)
82 #define H1_2(x) ((x) ^ 6)
83 #define H1_4(x) ((x) ^ 4)
84 #define H2(x)   ((x) ^ 3)
85 #define H4(x)   ((x) ^ 1)
86 #define H8(x)   ((x))
87 #else
88 #define H1(x)   (x)
89 #define H1_2(x) (x)
90 #define H1_4(x) (x)
91 #define H2(x)   (x)
92 #define H4(x)   (x)
93 #define H8(x)   (x)
94 #endif
95 
96 static inline uint32_t vext_nf(uint32_t desc)
97 {
98     return FIELD_EX32(simd_data(desc), VDATA, NF);
99 }
100 
101 static inline uint32_t vext_vm(uint32_t desc)
102 {
103     return FIELD_EX32(simd_data(desc), VDATA, VM);
104 }
105 
106 /*
107  * Encode LMUL to lmul as following:
108  *     LMUL    vlmul    lmul
109  *      1       000       0
110  *      2       001       1
111  *      4       010       2
112  *      8       011       3
113  *      -       100       -
114  *     1/8      101      -3
115  *     1/4      110      -2
116  *     1/2      111      -1
117  */
118 static inline int32_t vext_lmul(uint32_t desc)
119 {
120     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
121 }
122 
123 static inline uint32_t vext_vta(uint32_t desc)
124 {
125     return FIELD_EX32(simd_data(desc), VDATA, VTA);
126 }
127 
128 static inline uint32_t vext_vma(uint32_t desc)
129 {
130     return FIELD_EX32(simd_data(desc), VDATA, VMA);
131 }
132 
133 static inline uint32_t vext_vta_all_1s(uint32_t desc)
134 {
135     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
136 }
137 
138 /*
139  * Get the maximum number of elements can be operated.
140  *
141  * log2_esz: log2 of element size in bytes.
142  */
143 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
144 {
145     /*
146      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
147      * so vlen in bytes (vlenb) is encoded as maxsz.
148      */
149     uint32_t vlenb = simd_maxsz(desc);
150 
151     /* Return VLMAX */
152     int scale = vext_lmul(desc) - log2_esz;
153     return scale < 0 ? vlenb >> -scale : vlenb << scale;
154 }
155 
156 /*
157  * Get number of total elements, including prestart, body and tail elements.
158  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
159  * are held in the same vector register.
160  */
161 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
162                                             uint32_t esz)
163 {
164     uint32_t vlenb = simd_maxsz(desc);
165     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
166     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
167                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
168     return (vlenb << emul) / esz;
169 }
170 
171 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
172 {
173     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
174 }
175 
176 /*
177  * This function checks watchpoint before real load operation.
178  *
179  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
180  * In user mode, there is no watchpoint support now.
181  *
182  * It will trigger an exception if there is no mapping in TLB
183  * and page table walk can't fill the TLB entry. Then the guest
184  * software can return here after process the exception or never return.
185  */
186 static void probe_pages(CPURISCVState *env, target_ulong addr,
187                         target_ulong len, uintptr_t ra,
188                         MMUAccessType access_type)
189 {
190     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
191     target_ulong curlen = MIN(pagelen, len);
192 
193     probe_access(env, adjust_addr(env, addr), curlen, access_type,
194                  cpu_mmu_index(env, false), ra);
195     if (len > curlen) {
196         addr += curlen;
197         curlen = len - curlen;
198         probe_access(env, adjust_addr(env, addr), curlen, access_type,
199                      cpu_mmu_index(env, false), ra);
200     }
201 }
202 
203 /* set agnostic elements to 1s */
204 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
205                               uint32_t tot)
206 {
207     if (is_agnostic == 0) {
208         /* policy undisturbed */
209         return;
210     }
211     if (tot - cnt == 0) {
212         return;
213     }
214     memset(base + cnt, -1, tot - cnt);
215 }
216 
217 static inline void vext_set_elem_mask(void *v0, int index,
218                                       uint8_t value)
219 {
220     int idx = index / 64;
221     int pos = index % 64;
222     uint64_t old = ((uint64_t *)v0)[idx];
223     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
224 }
225 
226 /*
227  * Earlier designs (pre-0.9) had a varying number of bits
228  * per mask value (MLEN). In the 0.9 design, MLEN=1.
229  * (Section 4.5)
230  */
231 static inline int vext_elem_mask(void *v0, int index)
232 {
233     int idx = index / 64;
234     int pos = index  % 64;
235     return (((uint64_t *)v0)[idx] >> pos) & 1;
236 }
237 
/* elements operations for load and store */

/*
 * Callback type: transfer one element between guest memory at @addr and
 * element @idx of the vector register data at @vd.  @retaddr is the host
 * return address used for precise exception unwinding.
 */
typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
                               uint32_t idx, void *vd, uintptr_t retaddr);

/* Generate a load-element function: guest memory -> vector register. */
#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
}                                                          \

/* one loader per element width (8/16/32/64-bit) */
GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
254 
/* Generate a store-element function: vector register -> guest memory. */
#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE data = *((ETYPE *)vd + H(idx));                  \
    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
}

/* one storer per element width (8/16/32/64-bit) */
GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
267 
268 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
269                                    uint32_t desc, uint32_t nf,
270                                    uint32_t esz, uint32_t max_elems)
271 {
272     uint32_t vta = vext_vta(desc);
273     int k;
274 
275     if (vta == 0) {
276         return;
277     }
278 
279     for (k = 0; k < nf; ++k) {
280         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
281                           (k * max_elems + max_elems) * esz);
282     }
283 }
284 
285 /*
286  * stride: access vector element from strided memory
287  */
288 static void
289 vext_ldst_stride(void *vd, void *v0, target_ulong base,
290                  target_ulong stride, CPURISCVState *env,
291                  uint32_t desc, uint32_t vm,
292                  vext_ldst_elem_fn *ldst_elem,
293                  uint32_t log2_esz, uintptr_t ra)
294 {
295     uint32_t i, k;
296     uint32_t nf = vext_nf(desc);
297     uint32_t max_elems = vext_max_elems(desc, log2_esz);
298     uint32_t esz = 1 << log2_esz;
299     uint32_t vma = vext_vma(desc);
300 
301     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
302         k = 0;
303         while (k < nf) {
304             if (!vm && !vext_elem_mask(v0, i)) {
305                 /* set masked-off elements to 1s */
306                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
307                                   (i + k * max_elems + 1) * esz);
308                 k++;
309                 continue;
310             }
311             target_ulong addr = base + stride * i + (k << log2_esz);
312             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
313             k++;
314         }
315     }
316     env->vstart = 0;
317 
318     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
319 }
320 
/*
 * Generate a helper for a masked strided load; VM is taken from the
 * descriptor and GETPC() provides the unwind return address.
 */
#define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
                  target_ulong stride, CPURISCVState *env,              \
                  uint32_t desc)                                        \
{                                                                       \
    uint32_t vm = vext_vm(desc);                                        \
    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}

GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
335 
/* Generate a helper for a masked strided store (mirror of the load case). */
#define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  target_ulong stride, CPURISCVState *env,              \
                  uint32_t desc)                                        \
{                                                                       \
    uint32_t vm = vext_vm(desc);                                        \
    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}

GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
350 
351 /*
352  * unit-stride: access elements stored contiguously in memory
353  */
354 
355 /* unmasked unit-stride load and store operation */
356 static void
357 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
358              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
359              uintptr_t ra)
360 {
361     uint32_t i, k;
362     uint32_t nf = vext_nf(desc);
363     uint32_t max_elems = vext_max_elems(desc, log2_esz);
364     uint32_t esz = 1 << log2_esz;
365 
366     /* load bytes from guest memory */
367     for (i = env->vstart; i < evl; i++, env->vstart++) {
368         k = 0;
369         while (k < nf) {
370             target_ulong addr = base + ((i * nf + k) << log2_esz);
371             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
372             k++;
373         }
374     }
375     env->vstart = 0;
376 
377     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
378 }
379 
380 /*
381  * masked unit-stride load and store operation will be a special case of
382  * stride, stride = NF * sizeof (ETYPE)
383  */
384 
385 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
386 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
387                          CPURISCVState *env, uint32_t desc)             \
388 {                                                                       \
389     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
390     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
391                      ctzl(sizeof(ETYPE)), GETPC());                     \
392 }                                                                       \
393                                                                         \
394 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
395                   CPURISCVState *env, uint32_t desc)                    \
396 {                                                                       \
397     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
398                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
399 }
400 
401 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
402 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
403 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
404 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
405 
/* Generate masked and unmasked unit-stride store helpers (see loads). */
#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
                         CPURISCVState *env, uint32_t desc)              \
{                                                                        \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                      \
}                                                                        \
                                                                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
}

GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
426 
427 /*
428  * unit stride mask load and store, EEW = 1
429  */
430 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
431                     CPURISCVState *env, uint32_t desc)
432 {
433     /* evl = ceil(vl/8) */
434     uint8_t evl = (env->vl + 7) >> 3;
435     vext_ldst_us(vd, base, env, desc, lde_b,
436                  0, evl, GETPC());
437 }
438 
/* vsm.v: store ceil(vl/8) bytes of mask data, always unmasked. */
void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
                    CPURISCVState *env, uint32_t desc)
{
    /* evl = ceil(vl/8) */
    uint8_t evl = (env->vl + 7) >> 3;
    vext_ldst_us(vd, base, env, desc, ste_b,
                 0, evl, GETPC());
}
447 
448 /*
449  * index: access vector element from indexed memory
450  */
451 typedef target_ulong vext_get_index_addr(target_ulong base,
452         uint32_t idx, void *vs2);
453 
454 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
455 static target_ulong NAME(target_ulong base,            \
456                          uint32_t idx, void *vs2)      \
457 {                                                      \
458     return (base + *((ETYPE *)vs2 + H(idx)));          \
459 }
460 
461 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
462 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
463 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
464 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
465 
/*
 * Common body of the indexed (gather/scatter) load/store helpers.
 * The address of element i is base + vs2[i] + k * esz; everything else
 * mirrors vext_ldst_stride, including vstart restart and the
 * mask-agnostic / tail-agnostic fill policies.
 */
static inline void
vext_ldst_index(void *vd, void *v0, target_ulong base,
                void *vs2, CPURISCVState *env, uint32_t desc,
                vext_get_index_addr get_index_addr,
                vext_ldst_elem_fn *ldst_elem,
                uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);

    /* load bytes from guest memory */
    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
500 
/*
 * Generate an indexed-load helper; naming is vlxei<index EEW>_<data EEW>_v
 * so all 4x4 combinations of index and data widths are emitted below.
 */
#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
                  void *vs2, CPURISCVState *env, uint32_t desc)            \
{                                                                          \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
                    LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
}

GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
525 
/* Generate an indexed-store helper: vsxei<index EEW>_<data EEW>_v. */
#define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
                  void *vs2, CPURISCVState *env, uint32_t desc)  \
{                                                                \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
                    STORE_FN, ctzl(sizeof(ETYPE)),               \
                    GETPC());                                    \
}

GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
551 
552 /*
553  * unit-stride fault-only-fisrt load instructions
554  */
555 static inline void
556 vext_ldff(void *vd, void *v0, target_ulong base,
557           CPURISCVState *env, uint32_t desc,
558           vext_ldst_elem_fn *ldst_elem,
559           uint32_t log2_esz, uintptr_t ra)
560 {
561     void *host;
562     uint32_t i, k, vl = 0;
563     uint32_t nf = vext_nf(desc);
564     uint32_t vm = vext_vm(desc);
565     uint32_t max_elems = vext_max_elems(desc, log2_esz);
566     uint32_t esz = 1 << log2_esz;
567     uint32_t vma = vext_vma(desc);
568     target_ulong addr, offset, remain;
569 
570     /* probe every access */
571     for (i = env->vstart; i < env->vl; i++) {
572         if (!vm && !vext_elem_mask(v0, i)) {
573             continue;
574         }
575         addr = adjust_addr(env, base + i * (nf << log2_esz));
576         if (i == 0) {
577             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
578         } else {
579             /* if it triggers an exception, no need to check watchpoint */
580             remain = nf << log2_esz;
581             while (remain > 0) {
582                 offset = -(addr | TARGET_PAGE_MASK);
583                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
584                                          cpu_mmu_index(env, false));
585                 if (host) {
586 #ifdef CONFIG_USER_ONLY
587                     if (page_check_range(addr, offset, PAGE_READ)) {
588                         vl = i;
589                         goto ProbeSuccess;
590                     }
591 #else
592                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
593 #endif
594                 } else {
595                     vl = i;
596                     goto ProbeSuccess;
597                 }
598                 if (remain <=  offset) {
599                     break;
600                 }
601                 remain -= offset;
602                 addr = adjust_addr(env, addr + offset);
603             }
604         }
605     }
606 ProbeSuccess:
607     /* load bytes from guest memory */
608     if (vl != 0) {
609         env->vl = vl;
610     }
611     for (i = env->vstart; i < env->vl; i++) {
612         k = 0;
613         while (k < nf) {
614             if (!vm && !vext_elem_mask(v0, i)) {
615                 /* set masked-off elements to 1s */
616                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
617                                   (i + k * max_elems + 1) * esz);
618                 k++;
619                 continue;
620             }
621             target_ulong addr = base + ((i * nf + k) << log2_esz);
622             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
623             k++;
624         }
625     }
626     env->vstart = 0;
627 
628     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
629 }
630 
/* Generate a fault-only-first load helper for one element width. */
#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
                  CPURISCVState *env, uint32_t desc)      \
{                                                         \
    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
              ctzl(sizeof(ETYPE)), GETPC());              \
}

GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
643 
/* Scalar binary-op primitives plugged into the macro-generated helpers. */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  (N & M)
#define DO_XOR(N, M)  (N ^ M)
#define DO_OR(N, M)   (N | M)
#define DO_ADD(N, M)  (N + M)

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
653 
654 /*
655  * load and store whole register instructions
656  */
657 static void
658 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
659                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
660 {
661     uint32_t i, k, off, pos;
662     uint32_t nf = vext_nf(desc);
663     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
664     uint32_t max_elems = vlenb >> log2_esz;
665 
666     k = env->vstart / max_elems;
667     off = env->vstart % max_elems;
668 
669     if (off) {
670         /* load/store rest of elements of current segment pointed by vstart */
671         for (pos = off; pos < max_elems; pos++, env->vstart++) {
672             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
673             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
674                       ra);
675         }
676         k++;
677     }
678 
679     /* load/store elements for rest of segments */
680     for (; k < nf; k++) {
681         for (i = 0; i < max_elems; i++, env->vstart++) {
682             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
683             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
684         }
685     }
686 
687     env->vstart = 0;
688 }
689 
/* Generate a whole-register load helper (vl<NF>re<EEW>.v). */
#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
714 
/* Generate a whole-register store helper; stores are always byte-wise. */
#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
727 
728 /*
729  * Vector Integer Arithmetic Instructions
730  */
731 
/* expand macro args before macro */
#define RVVCALL(macro, ...)  macro(__VA_ARGS__)

/*
 * (TD, T1, T2, TX1, TX2)
 *
 * Type tuples used by the op-generator macros.  Naming scheme:
 * OP_<dst><src1><src2> with S = signed, U = unsigned; the W/N prefixes
 * denote widening (destination twice as wide) and narrowing (one source
 * twice as wide) variants.  TX1/TX2 are the extension types the sources
 * are converted to before the operation.
 */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
#define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
#define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
#define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
766 
/* operation of two vector elements */
typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);

/*
 * Generate a per-element vector-vector op: vd[i] = OP(vs2[i], vs1[i]).
 * Note the operand order: vs2 is the first OP argument, matching the
 * vd = vs2 <op> vs1 convention of the ISA.
 */
#define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
{                                                               \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
    *((TD *)vd + HD(i)) = OP(s2, s1);                           \
}
#define DO_SUB(N, M) (N - M)
#define DO_RSUB(N, M) (M - N)

RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
788 
789 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
790                        CPURISCVState *env, uint32_t desc,
791                        opivv2_fn *fn, uint32_t esz)
792 {
793     uint32_t vm = vext_vm(desc);
794     uint32_t vl = env->vl;
795     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
796     uint32_t vta = vext_vta(desc);
797     uint32_t vma = vext_vma(desc);
798     uint32_t i;
799 
800     for (i = env->vstart; i < vl; i++) {
801         if (!vm && !vext_elem_mask(v0, i)) {
802             /* set masked-off elements to 1s */
803             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
804             continue;
805         }
806         fn(vd, vs1, vs2, i);
807     }
808     env->vstart = 0;
809     /* set tail elements to 1s */
810     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
811 }
812 
/*
 * Generate the TCG-visible helper for an OPIVV worker: it simply
 * forwards to do_vext_vv() with the matching do_<NAME> worker and
 * the element size ESZ in bytes.
 */
#define GEN_VEXT_VV(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
               do_##NAME, ESZ);                           \
}

GEN_VEXT_VV(vadd_vv_b, 1)
GEN_VEXT_VV(vadd_vv_h, 2)
GEN_VEXT_VV(vadd_vv_w, 4)
GEN_VEXT_VV(vadd_vv_d, 8)
GEN_VEXT_VV(vsub_vv_b, 1)
GEN_VEXT_VV(vsub_vv_h, 2)
GEN_VEXT_VV(vsub_vv_w, 4)
GEN_VEXT_VV(vsub_vv_d, 8)
831 
/* operation of one vector element and the scalar operand s1 */
typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);

/*
 * (T1)s1 gives the real operator type.
 * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
 *
 * OPIVX2 is the vector-scalar counterpart of OPIVV2: element i of vs2
 * is combined with the (suitably truncated/extended) scalar s1.
 */
#define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
}

/* Per-element workers for vadd.vx / vsub.vx / vrsub.vx */
RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
857 
858 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
859                        CPURISCVState *env, uint32_t desc,
860                        opivx2_fn fn, uint32_t esz)
861 {
862     uint32_t vm = vext_vm(desc);
863     uint32_t vl = env->vl;
864     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
865     uint32_t vta = vext_vta(desc);
866     uint32_t vma = vext_vma(desc);
867     uint32_t i;
868 
869     for (i = env->vstart; i < vl; i++) {
870         if (!vm && !vext_elem_mask(v0, i)) {
871             /* set masked-off elements to 1s */
872             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
873             continue;
874         }
875         fn(vd, s1, vs2, i);
876     }
877     env->vstart = 0;
878     /* set tail elements to 1s */
879     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
880 }
881 
/*
 * Generate the TCG-visible helper for an OPIVX worker: forwards to
 * do_vext_vx() with the matching do_<NAME> worker and the element
 * size ESZ in bytes.
 */
#define GEN_VEXT_VX(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    do_vext_vx(vd, v0, s1, vs2, env, desc,                \
               do_##NAME, ESZ);                           \
}

GEN_VEXT_VX(vadd_vx_b, 1)
GEN_VEXT_VX(vadd_vx_h, 2)
GEN_VEXT_VX(vadd_vx_w, 4)
GEN_VEXT_VX(vadd_vx_d, 8)
GEN_VEXT_VX(vsub_vx_b, 1)
GEN_VEXT_VX(vsub_vx_h, 2)
GEN_VEXT_VX(vsub_vx_w, 4)
GEN_VEXT_VX(vsub_vx_d, 8)
GEN_VEXT_VX(vrsub_vx_b, 1)
GEN_VEXT_VX(vrsub_vx_h, 2)
GEN_VEXT_VX(vrsub_vx_w, 4)
GEN_VEXT_VX(vrsub_vx_d, 8)
904 
905 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
906 {
907     intptr_t oprsz = simd_oprsz(desc);
908     intptr_t i;
909 
910     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
911         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
912     }
913 }
914 
915 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
916 {
917     intptr_t oprsz = simd_oprsz(desc);
918     intptr_t i;
919 
920     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
921         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
922     }
923 }
924 
925 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
926 {
927     intptr_t oprsz = simd_oprsz(desc);
928     intptr_t i;
929 
930     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
931         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
932     }
933 }
934 
935 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
936 {
937     intptr_t oprsz = simd_oprsz(desc);
938     intptr_t i;
939 
940     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
941         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
942     }
943 }
944 
/* Vector Widening Integer Add/Subtract */
/*
 * Type tuples for the widening forms (see the (TD, T1, T2, TX1, TX2)
 * convention documented with the earlier WOP_/NOP_ definitions).
 * The WUUU/WSSS variants are for the .wv/.wx forms where vs2 is
 * already at the wide (2*SEW) element size.
 * NOTE(review): WOP_SSS_H/W below repeat token-identical definitions
 * from earlier in this file; redefining a macro with an identical
 * replacement list is well-defined C, so this duplication is benign.
 */
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
/* Per-element workers: destination H index is the wide element size */
RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
/* Helpers: ESZ is the destination (wide) element size in bytes */
GEN_VEXT_VV(vwaddu_vv_b, 2)
GEN_VEXT_VV(vwaddu_vv_h, 4)
GEN_VEXT_VV(vwaddu_vv_w, 8)
GEN_VEXT_VV(vwsubu_vv_b, 2)
GEN_VEXT_VV(vwsubu_vv_h, 4)
GEN_VEXT_VV(vwsubu_vv_w, 8)
GEN_VEXT_VV(vwadd_vv_b, 2)
GEN_VEXT_VV(vwadd_vv_h, 4)
GEN_VEXT_VV(vwadd_vv_w, 8)
GEN_VEXT_VV(vwsub_vv_b, 2)
GEN_VEXT_VV(vwsub_vv_h, 4)
GEN_VEXT_VV(vwsub_vv_w, 8)
GEN_VEXT_VV(vwaddu_wv_b, 2)
GEN_VEXT_VV(vwaddu_wv_h, 4)
GEN_VEXT_VV(vwaddu_wv_w, 8)
GEN_VEXT_VV(vwsubu_wv_b, 2)
GEN_VEXT_VV(vwsubu_wv_h, 4)
GEN_VEXT_VV(vwsubu_wv_w, 8)
GEN_VEXT_VV(vwadd_wv_b, 2)
GEN_VEXT_VV(vwadd_wv_h, 4)
GEN_VEXT_VV(vwadd_wv_w, 8)
GEN_VEXT_VV(vwsub_wv_b, 2)
GEN_VEXT_VV(vwsub_wv_h, 4)
GEN_VEXT_VV(vwsub_wv_w, 8)

/* Vector-scalar widening forms (same type tuples, scalar operand s1) */
RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
GEN_VEXT_VX(vwaddu_vx_b, 2)
GEN_VEXT_VX(vwaddu_vx_h, 4)
GEN_VEXT_VX(vwaddu_vx_w, 8)
GEN_VEXT_VX(vwsubu_vx_b, 2)
GEN_VEXT_VX(vwsubu_vx_h, 4)
GEN_VEXT_VX(vwsubu_vx_w, 8)
GEN_VEXT_VX(vwadd_vx_b, 2)
GEN_VEXT_VX(vwadd_vx_h, 4)
GEN_VEXT_VX(vwadd_vx_w, 8)
GEN_VEXT_VX(vwsub_vx_b, 2)
GEN_VEXT_VX(vwsub_vx_h, 4)
GEN_VEXT_VX(vwsub_vx_w, 8)
GEN_VEXT_VX(vwaddu_wx_b, 2)
GEN_VEXT_VX(vwaddu_wx_h, 4)
GEN_VEXT_VX(vwaddu_wx_w, 8)
GEN_VEXT_VX(vwsubu_wx_b, 2)
GEN_VEXT_VX(vwsubu_wx_h, 4)
GEN_VEXT_VX(vwsubu_wx_w, 8)
GEN_VEXT_VX(vwadd_wx_b, 2)
GEN_VEXT_VX(vwadd_wx_h, 4)
GEN_VEXT_VX(vwadd_wx_w, 8)
GEN_VEXT_VX(vwsub_wx_b, 2)
GEN_VEXT_VX(vwsub_wx_h, 4)
GEN_VEXT_VX(vwsub_wx_w, 8)
1055 
/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
/* C is the per-element carry/borrow-in taken from the v0 mask bit */
#define DO_VADC(N, M, C) (N + M + C)
#define DO_VSBC(N, M, C) (N - M - C)

/*
 * vadc.vvm / vsbc.vvm: v0's mask bit supplies the carry/borrow-in for
 * every element; note the mask is read unconditionally (no vm check),
 * so every element in [vstart, vl) is processed.
 */
#define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = vext_elem_mask(v0, i);                  \
                                                              \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}

GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)

/* Vector-scalar form: s1 is sign-extended then truncated to ETYPE */
#define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    uint32_t vl = env->vl;                                               \
    uint32_t esz = sizeof(ETYPE);                                        \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
    uint32_t vta = vext_vta(desc);                                       \
    uint32_t i;                                                          \
                                                                         \
    for (i = env->vstart; i < vl; i++) {                                 \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
        ETYPE carry = vext_elem_mask(v0, i);                             \
                                                                         \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
    }                                                                    \
    env->vstart = 0;                                                     \
    /* set tail elements to 1s */                                        \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
}

GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1123 
/*
 * Carry/borrow-out predicates.  All instantiations below use unsigned
 * ETYPE, so N + M wraps with defined behavior: with carry-in, the sum
 * wrapped iff (N + M + 1) <= N; without, iff (N + M) < N.
 */
#define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
                          (__typeof(N))(N + M) < N)
#define DO_MSBC(N, M, C) (C ? N <= M : N < M)

/*
 * vmadc/vmsbc: write the per-element carry/borrow-out into mask
 * register vd.  When vm is clear, v0 supplies the carry-in.
 * NOTE: the un-continued comment lines inside the macro body are
 * legal - comments are removed before the #define's extent is
 * determined, so the embedded newlines do not end the macro.
 */
#define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = !vm && vext_elem_mask(v0, i);           \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination registers are always tail-agnostic;
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)

/* Vector-scalar form: s1 is sign-extended then truncated to ETYPE */
#define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
                  void *vs2, CPURISCVState *env, uint32_t desc) \
{                                                               \
    uint32_t vl = env->vl;                                      \
    uint32_t vm = vext_vm(desc);                                \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
    uint32_t i;                                                 \
                                                                \
    for (i = env->vstart; i < vl; i++) {                        \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
        ETYPE carry = !vm && vext_elem_mask(v0, i);             \
        vext_set_elem_mask(vd, i,                               \
                DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
    }                                                           \
    env->vstart = 0;                                            \
    /*
     * mask destination registers are always tail-agnostic;
     * set tail elements to 1s
     */                                                         \
    if (vta_all_1s) {                                           \
        for (; i < total_elems; i++) {                          \
            vext_set_elem_mask(vd, i, 1);                       \
        }                                                       \
    }                                                           \
}

GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1203 
/* Vector Bitwise Logical Instructions */
/* Signedness is irrelevant for bitwise ops; OP_SSS_* tuples are reused */
RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
GEN_VEXT_VV(vand_vv_b, 1)
GEN_VEXT_VV(vand_vv_h, 2)
GEN_VEXT_VV(vand_vv_w, 4)
GEN_VEXT_VV(vand_vv_d, 8)
GEN_VEXT_VV(vor_vv_b, 1)
GEN_VEXT_VV(vor_vv_h, 2)
GEN_VEXT_VV(vor_vv_w, 4)
GEN_VEXT_VV(vor_vv_d, 8)
GEN_VEXT_VV(vxor_vv_b, 1)
GEN_VEXT_VV(vxor_vv_h, 2)
GEN_VEXT_VV(vxor_vv_w, 4)
GEN_VEXT_VV(vxor_vv_d, 8)

/* Vector-scalar bitwise forms */
RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
GEN_VEXT_VX(vand_vx_b, 1)
GEN_VEXT_VX(vand_vx_h, 2)
GEN_VEXT_VX(vand_vx_w, 4)
GEN_VEXT_VX(vand_vx_d, 8)
GEN_VEXT_VX(vor_vx_b, 1)
GEN_VEXT_VX(vor_vx_h, 2)
GEN_VEXT_VX(vor_vx_w, 4)
GEN_VEXT_VX(vor_vx_d, 8)
GEN_VEXT_VX(vxor_vx_b, 1)
GEN_VEXT_VX(vxor_vx_h, 2)
GEN_VEXT_VX(vxor_vx_w, 4)
GEN_VEXT_VX(vxor_vx_d, 8)
1254 
/* Vector Single-Width Bit Shift Instructions */
#define DO_SLL(N, M)  (N << (M))
#define DO_SRL(N, M)  (N >> (M))

/* generate the helpers for shift instructions with two vector operators */
/*
 * TS1 is the shift-amount/destination element type, TS2 the shifted
 * operand type.  MASK truncates the shift amount to SEW-1 bits, which
 * also keeps the C shift well-defined (never >= the operand width).
 */
#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS1);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
        *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)

GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

/*
 * vsra reuses DO_SRL with a signed TS2; this relies on the compiler
 * implementing signed >> as an arithmetic shift (true for GCC/Clang,
 * which QEMU requires; it is implementation-defined in ISO C).
 */
GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1301 
1302 /*
1303  * generate the helpers for shift instructions with one vector and one scalar
1304  */
1305 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1306 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1307                   void *vs2, CPURISCVState *env,            \
1308                   uint32_t desc)                            \
1309 {                                                           \
1310     uint32_t vm = vext_vm(desc);                            \
1311     uint32_t vl = env->vl;                                  \
1312     uint32_t esz = sizeof(TD);                              \
1313     uint32_t total_elems =                                  \
1314         vext_get_total_elems(env, desc, esz);               \
1315     uint32_t vta = vext_vta(desc);                          \
1316     uint32_t vma = vext_vma(desc);                          \
1317     uint32_t i;                                             \
1318                                                             \
1319     for (i = env->vstart; i < vl; i++) {                    \
1320         if (!vm && !vext_elem_mask(v0, i)) {                \
1321             /* set masked-off elements to 1s */             \
1322             vext_set_elems_1s(vd, vma, i * esz,             \
1323                               (i + 1) * esz);               \
1324             continue;                                       \
1325         }                                                   \
1326         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1327         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1328     }                                                       \
1329     env->vstart = 0;                                        \
1330     /* set tail elements to 1s */                           \
1331     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1332 }
1333 
1334 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1335 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1336 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1337 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1338 
1339 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1341 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1342 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1343 
1344 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1345 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1346 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1347 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1348 
/* Vector Narrowing Integer Right Shift Instructions */
/*
 * Source (TS2) is the 2*SEW-wide operand, destination the SEW-wide
 * type; the shift-amount MASK is therefore based on the wide width
 * (0xf for 16-bit sources, 0x1f for 32-bit, 0x3f for 64-bit).
 */
GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1362 
/* Vector Integer Comparison Instructions */
/* N is the vs2 element, M the vs1/rs1 operand (called as DO_OP(s2, s1)) */
#define DO_MSEQ(N, M) (N == M)
#define DO_MSNE(N, M) (N != M)
#define DO_MSLT(N, M) (N < M)
#define DO_MSLE(N, M) (N <= M)
#define DO_MSGT(N, M) (N > M)

/*
 * Compare two vectors element-wise and write the result bits into
 * mask register vd.  Signed vs. unsigned comparison is selected by
 * the ETYPE each instantiation passes in.
 */
#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            /* set masked-off elements to 1s */               \
            if (vma) {                                        \
                vext_set_elem_mask(vd, i, 1);                 \
            }                                                 \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination registers are always tail-agnostic;
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)

GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1434 
/*
 * Generate a vector-scalar integer compare helper NAME: vd is a mask
 * register receiving DO_OP(vs2[i], s1) per element; the scalar s1 is
 * sign-extended to ETYPE before the compare.  Masked-off elements follow
 * the mask-agnostic policy (set to 1 when vma); the mask-register tail
 * is always treated as tail-agnostic (set to 1s when vta_all_1s).
 *
 * Fix: the multi-line comment below previously had no trailing
 * backslashes, so the macro only stayed intact because the comment's
 * embedded newlines are removed with the comment itself (translation
 * phase 3) -- fragile and inconsistent with the rest of the macro.
 */
#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            /* set masked-off elements to 1s */                     \
            if (vma) {                                              \
                vext_set_elem_mask(vd, i, 1);                       \
            }                                                       \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                DO_OP(s2, (ETYPE)(target_long)s1));                 \
    }                                                               \
    env->vstart = 0;                                                \
    /*                                                              \
     * mask destination register is always tail-agnostic            \
     * set tail elements to 1s                                      \
     */                                                             \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}
1469 
/* vmseq.vx / vmsne.vx: equality against the (sign-extended) scalar */
GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)

/* vmsltu.vx / vmslt.vx: less-than; signedness chosen by ETYPE */
GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)

/* vmsleu.vx / vmsle.vx: less-than-or-equal */
GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)

/* vmsgtu.vx / vmsgt.vx: greater-than (vector-scalar form only) */
GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)

GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1509 
/* Vector Integer Min/Max Instructions */
/* OP_UUU_*: unsigned compare; OP_SSS_*: signed compare */
RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
GEN_VEXT_VV(vminu_vv_b, 1)
GEN_VEXT_VV(vminu_vv_h, 2)
GEN_VEXT_VV(vminu_vv_w, 4)
GEN_VEXT_VV(vminu_vv_d, 8)
GEN_VEXT_VV(vmin_vv_b, 1)
GEN_VEXT_VV(vmin_vv_h, 2)
GEN_VEXT_VV(vmin_vv_w, 4)
GEN_VEXT_VV(vmin_vv_d, 8)
GEN_VEXT_VV(vmaxu_vv_b, 1)
GEN_VEXT_VV(vmaxu_vv_h, 2)
GEN_VEXT_VV(vmaxu_vv_w, 4)
GEN_VEXT_VV(vmaxu_vv_d, 8)
GEN_VEXT_VV(vmax_vv_b, 1)
GEN_VEXT_VV(vmax_vv_h, 2)
GEN_VEXT_VV(vmax_vv_w, 4)
GEN_VEXT_VV(vmax_vv_d, 8)

/* Vector-scalar forms of min/max */
RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
GEN_VEXT_VX(vminu_vx_b, 1)
GEN_VEXT_VX(vminu_vx_h, 2)
GEN_VEXT_VX(vminu_vx_w, 4)
GEN_VEXT_VX(vminu_vx_d, 8)
GEN_VEXT_VX(vmin_vx_b, 1)
GEN_VEXT_VX(vmin_vx_h, 2)
GEN_VEXT_VX(vmin_vx_w, 4)
GEN_VEXT_VX(vmin_vx_d, 8)
GEN_VEXT_VX(vmaxu_vx_b, 1)
GEN_VEXT_VX(vmaxu_vx_h, 2)
GEN_VEXT_VX(vmaxu_vx_w, 4)
GEN_VEXT_VX(vmaxu_vx_d, 8)
GEN_VEXT_VX(vmax_vx_b, 1)
GEN_VEXT_VX(vmax_vx_h, 2)
GEN_VEXT_VX(vmax_vx_w, 4)
GEN_VEXT_VX(vmax_vx_d, 8)
1576 
/* Vector Single-Width Integer Multiply Instructions */
/*
 * Low-half multiply.  Arguments are only ever plain variables here, so
 * the unparenthesized expansion is safe.
 * NOTE(review): for the _d variants N * M can overflow int64_t, which is
 * UB in standard C -- presumably the build uses wrapping semantics
 * (-fwrapv); confirm against the build flags.
 */
#define DO_MUL(N, M) (N * M)
RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
GEN_VEXT_VV(vmul_vv_b, 1)
GEN_VEXT_VV(vmul_vv_h, 2)
GEN_VEXT_VV(vmul_vv_w, 4)
GEN_VEXT_VV(vmul_vv_d, 8)
1587 
/* Signed 8x8 multiply, returning the high 8 bits of the 16-bit product. */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    int16_t prod = (int16_t)s2 * (int16_t)s1;

    return (int8_t)(prod >> 8);
}
1592 
/* Signed 16x16 multiply, returning the high 16 bits of the 32-bit product. */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t prod = (int32_t)s2 * (int32_t)s1;

    return (int16_t)(prod >> 16);
}
1597 
/* Signed 32x32 multiply, returning the high 32 bits of the 64-bit product. */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t prod = (int64_t)s2 * (int64_t)s1;

    return (int32_t)(prod >> 32);
}
1602 
/* Signed 64x64 multiply, high half, via the 128-bit multiply helper. */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t hi, lo;

    muls64(&lo, &hi, s1, s2);
    /* lo is the low half of the product and is deliberately discarded */
    return hi;
}
1610 
/* Unsigned 8x8 multiply, returning the high 8 bits of the 16-bit product. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    uint16_t prod = (uint16_t)s2 * (uint16_t)s1;

    return (uint8_t)(prod >> 8);
}
1615 
/* Unsigned 16x16 multiply, returning the high 16 bits of the product. */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t prod = (uint32_t)s2 * (uint32_t)s1;

    return (uint16_t)(prod >> 16);
}
1620 
/* Unsigned 32x32 multiply, returning the high 32 bits of the product. */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t prod = (uint64_t)s2 * (uint64_t)s1;

    return (uint32_t)(prod >> 32);
}
1625 
/* Unsigned 64x64 multiply, high half, via the 128-bit multiply helper. */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t hi, lo;

    mulu64(&lo, &hi, s2, s1);
    /* lo is the low half of the product and is deliberately discarded */
    return hi;
}
1633 
/* Signed x unsigned 8x8 multiply, returning the high 8 bits. */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    int32_t prod = (int16_t)s2 * (uint16_t)s1;

    return (int8_t)(prod >> 8);
}
1638 
/*
 * Signed x unsigned 16x16 multiply, high 16 bits.  The product is
 * computed in uint32_t (the signed operand wraps modulo 2**32); since
 * only the low 16 bits of the shifted value survive the truncation,
 * the result still equals the high half of the true signed product.
 */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    uint32_t prod = (int32_t)s2 * (uint32_t)s1;

    return (int16_t)(prod >> 16);
}
1643 
/*
 * Signed x unsigned 32x32 multiply, high 32 bits.  Computed in uint64_t
 * (signed operand wraps modulo 2**64); the final truncation to int32_t
 * makes the result identical to the true signed-product high half.
 */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    uint64_t prod = (int64_t)s2 * (uint64_t)s1;

    return (int32_t)(prod >> 32);
}
1648 
/*
 * Signed x unsigned 64x64 high-half multiply, derived from the
 * unsigned product.
 *
 * Let  A = signed operand,
 *      B = unsigned operand,
 *      P = mulu64(A, B), the 128-bit unsigned product, where A is
 *          reinterpreted as the unsigned value A mod 2**64.
 *
 * IF A >= 0, the reinterpretation changes nothing, so the signed
 * product SP equals P.
 * IF A < 0, A is reinterpreted as A + 2**64, hence
 *      SP = A * B
 *         = (A + 2**64) * B - 2**64 * B
 *         = P - 2**64 * B
 * so only the high 64 bits need correcting:
 *      HI_P -= (A < 0 ? B : 0)
 */
1667 
/* Signed x unsigned 64x64 multiply, high half (see derivation above). */
static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t hi, lo;

    /* Unsigned product first ... */
    mulu64(&lo, &hi, s2, s1);
    /* ... then correct the high half: SP = P - 2**64 * B when A < 0. */
    if (s2 < 0) {
        hi -= s1;
    }
    return (int64_t)hi;
}
1677 
/* vmulh / vmulhu / vmulhsu: high-half multiplies, vector-vector forms */
RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
GEN_VEXT_VV(vmulh_vv_b, 1)
GEN_VEXT_VV(vmulh_vv_h, 2)
GEN_VEXT_VV(vmulh_vv_w, 4)
GEN_VEXT_VV(vmulh_vv_d, 8)
GEN_VEXT_VV(vmulhu_vv_b, 1)
GEN_VEXT_VV(vmulhu_vv_h, 2)
GEN_VEXT_VV(vmulhu_vv_w, 4)
GEN_VEXT_VV(vmulhu_vv_d, 8)
GEN_VEXT_VV(vmulhsu_vv_b, 1)
GEN_VEXT_VV(vmulhsu_vv_h, 2)
GEN_VEXT_VV(vmulhsu_vv_w, 4)
GEN_VEXT_VV(vmulhsu_vv_d, 8)

/* Vector-scalar forms of the single-width multiplies */
RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
GEN_VEXT_VX(vmul_vx_b, 1)
GEN_VEXT_VX(vmul_vx_h, 2)
GEN_VEXT_VX(vmul_vx_w, 4)
GEN_VEXT_VX(vmul_vx_d, 8)
GEN_VEXT_VX(vmulh_vx_b, 1)
GEN_VEXT_VX(vmulh_vx_h, 2)
GEN_VEXT_VX(vmulh_vx_w, 4)
GEN_VEXT_VX(vmulh_vx_d, 8)
GEN_VEXT_VX(vmulhu_vx_b, 1)
GEN_VEXT_VX(vmulhu_vx_h, 2)
GEN_VEXT_VX(vmulhu_vx_w, 4)
GEN_VEXT_VX(vmulhu_vx_d, 8)
GEN_VEXT_VX(vmulhsu_vx_b, 1)
GEN_VEXT_VX(vmulhsu_vx_h, 2)
GEN_VEXT_VX(vmulhsu_vx_w, 4)
GEN_VEXT_VX(vmulhsu_vx_d, 8)
1735 
/* Vector Integer Divide Instructions */
/* Unsigned divide: division by zero yields all-1s. */
#define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
/* Unsigned remainder: remainder by zero yields the dividend. */
#define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
/*
 * Signed divide: x/0 -> -1; overflow (MIN / -1) -> MIN (returns N).
 * (N == -N) holds only for 0 and the type's minimum value, so combined
 * with M == -1 it detects the overflow case without computing MIN / -1.
 * NOTE(review): -N on the minimum value assumes wrapping negation
 * (presumably -fwrapv) -- confirm against the build flags.
 */
#define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
/* Signed remainder: x%0 -> x; MIN % -1 -> 0 (avoids the UB division). */
#define DO_REM(N, M)  (unlikely(M == 0) ? N : \
        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1743 
/* vdivu / vdiv / vremu / vrem: vector-vector forms */
RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
GEN_VEXT_VV(vdivu_vv_b, 1)
GEN_VEXT_VV(vdivu_vv_h, 2)
GEN_VEXT_VV(vdivu_vv_w, 4)
GEN_VEXT_VV(vdivu_vv_d, 8)
GEN_VEXT_VV(vdiv_vv_b, 1)
GEN_VEXT_VV(vdiv_vv_h, 2)
GEN_VEXT_VV(vdiv_vv_w, 4)
GEN_VEXT_VV(vdiv_vv_d, 8)
GEN_VEXT_VV(vremu_vv_b, 1)
GEN_VEXT_VV(vremu_vv_h, 2)
GEN_VEXT_VV(vremu_vv_w, 4)
GEN_VEXT_VV(vremu_vv_d, 8)
GEN_VEXT_VV(vrem_vv_b, 1)
GEN_VEXT_VV(vrem_vv_h, 2)
GEN_VEXT_VV(vrem_vv_w, 4)
GEN_VEXT_VV(vrem_vv_d, 8)

/* Vector-scalar forms of the divide/remainder instructions */
RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
GEN_VEXT_VX(vdivu_vx_b, 1)
GEN_VEXT_VX(vdivu_vx_h, 2)
GEN_VEXT_VX(vdivu_vx_w, 4)
GEN_VEXT_VX(vdivu_vx_d, 8)
GEN_VEXT_VX(vdiv_vx_b, 1)
GEN_VEXT_VX(vdiv_vx_h, 2)
GEN_VEXT_VX(vdiv_vx_w, 4)
GEN_VEXT_VX(vdiv_vx_d, 8)
GEN_VEXT_VX(vremu_vx_b, 1)
GEN_VEXT_VX(vremu_vx_h, 2)
GEN_VEXT_VX(vremu_vx_w, 4)
GEN_VEXT_VX(vremu_vx_d, 8)
GEN_VEXT_VX(vrem_vx_b, 1)
GEN_VEXT_VX(vrem_vx_h, 2)
GEN_VEXT_VX(vrem_vx_w, 4)
GEN_VEXT_VX(vrem_vx_d, 8)
1809 
/* Vector Widening Integer Multiply Instructions */
/* WOP_*: destination is 2*SEW wide, so no _d (no 128-bit destination) */
RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
GEN_VEXT_VV(vwmul_vv_b, 2)
GEN_VEXT_VV(vwmul_vv_h, 4)
GEN_VEXT_VV(vwmul_vv_w, 8)
GEN_VEXT_VV(vwmulu_vv_b, 2)
GEN_VEXT_VV(vwmulu_vv_h, 4)
GEN_VEXT_VV(vwmulu_vv_w, 8)
GEN_VEXT_VV(vwmulsu_vv_b, 2)
GEN_VEXT_VV(vwmulsu_vv_h, 4)
GEN_VEXT_VV(vwmulsu_vv_w, 8)

/* Vector-scalar widening multiplies */
RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
GEN_VEXT_VX(vwmul_vx_b, 2)
GEN_VEXT_VX(vwmul_vx_h, 4)
GEN_VEXT_VX(vwmul_vx_w, 8)
GEN_VEXT_VX(vwmulu_vx_b, 2)
GEN_VEXT_VX(vwmulu_vx_h, 4)
GEN_VEXT_VX(vwmulu_vx_w, 8)
GEN_VEXT_VX(vwmulsu_vx_b, 2)
GEN_VEXT_VX(vwmulsu_vx_h, 4)
GEN_VEXT_VX(vwmulsu_vx_w, 8)
1848 
/* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * OPIVV3: per-element ternary (multiply-add) op.  Reads element i of
 * vs1, vs2 and the current destination element, then stores
 * OP(s2, s1, d) back into vd[i].
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
}
1858 
/* With OP(s2, s1, d): N = s2, M = s1, D = current vd element. */
#define DO_MACC(N, M, D) (M * N + D)     /* vd = vs1 * vs2 + vd */
#define DO_NMSAC(N, M, D) (-(M * N) + D) /* vd = -(vs1 * vs2) + vd */
#define DO_MADD(N, M, D) (M * D + N)     /* vd = vs1 * vd + vs2 */
#define DO_NMSUB(N, M, D) (-(M * D) + N) /* vd = -(vs1 * vd) + vs2 */
RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
GEN_VEXT_VV(vmacc_vv_b, 1)
GEN_VEXT_VV(vmacc_vv_h, 2)
GEN_VEXT_VV(vmacc_vv_w, 4)
GEN_VEXT_VV(vmacc_vv_d, 8)
GEN_VEXT_VV(vnmsac_vv_b, 1)
GEN_VEXT_VV(vnmsac_vv_h, 2)
GEN_VEXT_VV(vnmsac_vv_w, 4)
GEN_VEXT_VV(vnmsac_vv_d, 8)
GEN_VEXT_VV(vmadd_vv_b, 1)
GEN_VEXT_VV(vmadd_vv_h, 2)
GEN_VEXT_VV(vmadd_vv_w, 4)
GEN_VEXT_VV(vmadd_vv_d, 8)
GEN_VEXT_VV(vnmsub_vv_b, 1)
GEN_VEXT_VV(vnmsub_vv_h, 2)
GEN_VEXT_VV(vnmsub_vv_w, 4)
GEN_VEXT_VV(vnmsub_vv_d, 8)
1895 
/*
 * OPIVX3: vector-scalar variant of OPIVV3.  The scalar s1 is truncated
 * to T1 and then converted to TX1 (sign/zero extension per the type
 * macros) before OP(s2, s1, d) is stored into vd[i].
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    TD d = *((TD *)vd + HD(i));                                     \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
}
1903 
/* Vector-scalar forms of the single-width multiply-adds */
RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
GEN_VEXT_VX(vmacc_vx_b, 1)
GEN_VEXT_VX(vmacc_vx_h, 2)
GEN_VEXT_VX(vmacc_vx_w, 4)
GEN_VEXT_VX(vmacc_vx_d, 8)
GEN_VEXT_VX(vnmsac_vx_b, 1)
GEN_VEXT_VX(vnmsac_vx_h, 2)
GEN_VEXT_VX(vnmsac_vx_w, 4)
GEN_VEXT_VX(vnmsac_vx_d, 8)
GEN_VEXT_VX(vmadd_vx_b, 1)
GEN_VEXT_VX(vmadd_vx_h, 2)
GEN_VEXT_VX(vmadd_vx_w, 4)
GEN_VEXT_VX(vmadd_vx_d, 8)
GEN_VEXT_VX(vnmsub_vx_b, 1)
GEN_VEXT_VX(vnmsub_vx_h, 2)
GEN_VEXT_VX(vnmsub_vx_w, 4)
GEN_VEXT_VX(vnmsub_vx_d, 8)
1936 
/* Vector Widening Integer Multiply-Add Instructions */
/* Destination element is 2*SEW wide, so _d is not generated. */
RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
GEN_VEXT_VV(vwmaccu_vv_b, 2)
GEN_VEXT_VV(vwmaccu_vv_h, 4)
GEN_VEXT_VV(vwmaccu_vv_w, 8)
GEN_VEXT_VV(vwmacc_vv_b, 2)
GEN_VEXT_VV(vwmacc_vv_h, 4)
GEN_VEXT_VV(vwmacc_vv_w, 8)
GEN_VEXT_VV(vwmaccsu_vv_b, 2)
GEN_VEXT_VV(vwmaccsu_vv_h, 4)
GEN_VEXT_VV(vwmaccsu_vv_w, 8)

/* Vector-scalar widening multiply-adds (vwmaccus exists only as .vx) */
RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
GEN_VEXT_VX(vwmaccu_vx_b, 2)
GEN_VEXT_VX(vwmaccu_vx_h, 4)
GEN_VEXT_VX(vwmaccu_vx_w, 8)
GEN_VEXT_VX(vwmacc_vx_b, 2)
GEN_VEXT_VX(vwmacc_vx_h, 4)
GEN_VEXT_VX(vwmacc_vx_w, 8)
GEN_VEXT_VX(vwmaccsu_vx_b, 2)
GEN_VEXT_VX(vwmaccsu_vx_h, 4)
GEN_VEXT_VX(vwmaccsu_vx_w, 8)
GEN_VEXT_VX(vwmaccus_vx_b, 2)
GEN_VEXT_VX(vwmaccus_vx_h, 4)
GEN_VEXT_VX(vwmaccus_vx_w, 8)
1981 
/* Vector Integer Merge and Move Instructions */
/*
 * vmv.v.v helper generator: copies vs1 element-wise into vd over
 * [vstart, vl), then applies tail-agnostic handling (1s) when vta.
 */
#define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
                  uint32_t desc)                                     \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
        *((ETYPE *)vd + H(i)) = s1;                                  \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2006 
/*
 * vmv.v.x / vmv.v.i helper generator: splats the (already extended)
 * scalar s1, truncated to ETYPE, across vd over [vstart, vl), then
 * applies tail-agnostic handling (1s) when vta.
 */
#define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
                  uint32_t desc)                                     \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2029 
/*
 * vmerge.vvm helper generator: per element, select vs1[i] where the
 * v0 mask bit is set, else vs2[i]; tail handled as agnostic (1s) when
 * vta.
 */
#define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
                  CPURISCVState *env, uint32_t desc)                 \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
        *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2053 
/*
 * vmerge.vxm / vmerge.vim helper generator: per element, select the
 * sign-extended scalar s1 where the v0 mask bit is set, else vs2[i];
 * tail handled as agnostic (1s) when vta.
 */
#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
                  void *vs2, CPURISCVState *env, uint32_t desc)      \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
        ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
                   (ETYPE)(target_long)s1);                          \
        *((ETYPE *)vd + H(i)) = d;                                   \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2079 
/*
 * Vector Fixed-Point Arithmetic Instructions
 */

/* Vector Single-Width Saturating Add and Subtract */

/*
 * Fixed-point instructions share a rounding mode (vxrm) and may
 * saturate, so common per-element helper plumbing is defined here.
 */
/* Per-element op: (vd, vs1, vs2, element index, env, rounding mode). */
typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2092 
/*
 * OPIVV2_RM: per-element binary op with rounding mode.  Generates an
 * opivv2_rm_fn that loads element i of vs1/vs2 and stores
 * OP(env, vxrm, s2, s1) into vd[i].
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
}
2102 
2103 static inline void
2104 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2105              CPURISCVState *env,
2106              uint32_t vl, uint32_t vm, int vxrm,
2107              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2108 {
2109     for (uint32_t i = env->vstart; i < vl; i++) {
2110         if (!vm && !vext_elem_mask(v0, i)) {
2111             /* set masked-off elements to 1s */
2112             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2113             continue;
2114         }
2115         fn(vd, vs1, vs2, i, env, vxrm);
2116     }
2117     env->vstart = 0;
2118 }
2119 
/*
 * Decode desc, run the element loop under the current fixed-point
 * rounding mode, then apply the tail-agnostic policy.
 */
static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
             CPURISCVState *env,
             uint32_t desc,
             opivv2_rm_fn *fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);

    /*
     * Each case passes vxrm as a literal constant rather than forwarding
     * env->vxrm directly; presumably this lets the compiler specialize the
     * inlined element loop per rounding mode -- keep the switch as-is.
     */
    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 0, fn, vma, esz);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 1, fn, vma, esz);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 2, fn, vma, esz);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 3, fn, vma, esz);
        break;
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}

/*
 * generate helpers for fixed point instructions with OPIVV format;
 * ESZ is the element size in bytes (drives tail/mask byte ranges).
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
                 do_##NAME, ESZ);                               \
}
2162 
2163 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2164                              uint8_t b)
2165 {
2166     uint8_t res = a + b;
2167     if (res < a) {
2168         res = UINT8_MAX;
2169         env->vxsat = 0x1;
2170     }
2171     return res;
2172 }
2173 
2174 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2175                                uint16_t b)
2176 {
2177     uint16_t res = a + b;
2178     if (res < a) {
2179         res = UINT16_MAX;
2180         env->vxsat = 0x1;
2181     }
2182     return res;
2183 }
2184 
2185 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2186                                uint32_t b)
2187 {
2188     uint32_t res = a + b;
2189     if (res < a) {
2190         res = UINT32_MAX;
2191         env->vxsat = 0x1;
2192     }
2193     return res;
2194 }
2195 
2196 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2197                                uint64_t b)
2198 {
2199     uint64_t res = a + b;
2200     if (res < a) {
2201         res = UINT64_MAX;
2202         env->vxsat = 0x1;
2203     }
2204     return res;
2205 }
2206 
2207 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2208 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2209 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2210 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2211 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2212 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2213 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2214 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2215 
/* Per-element worker signature for OPIVX (vector-scalar) fixed-point ops */
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);

/*
 * OPIVX2_RM - expand a per-element worker do_NAME() combining element i
 * of vs2 with the scalar s1 (first truncated to the element type T1,
 * then widened to TX1), under rounding mode vxrm.
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
}
2227 
2228 static inline void
2229 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2230              CPURISCVState *env,
2231              uint32_t vl, uint32_t vm, int vxrm,
2232              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2233 {
2234     for (uint32_t i = env->vstart; i < vl; i++) {
2235         if (!vm && !vext_elem_mask(v0, i)) {
2236             /* set masked-off elements to 1s */
2237             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2238             continue;
2239         }
2240         fn(vd, s1, vs2, i, env, vxrm);
2241     }
2242     env->vstart = 0;
2243 }
2244 
/*
 * Decode desc, run the vector-scalar element loop under the current
 * fixed-point rounding mode, then apply the tail-agnostic policy.
 */
static inline void
vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
             CPURISCVState *env,
             uint32_t desc,
             opivx2_rm_fn *fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);

    /*
     * Each case passes vxrm as a literal constant; presumably this lets
     * the compiler specialize the inlined loop per rounding mode.
     */
    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 0, fn, vma, esz);
        break;
    case 1: /* rne */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 1, fn, vma, esz);
        break;
    case 2: /* rdn */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 2, fn, vma, esz);
        break;
    default: /* rod */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 3, fn, vma, esz);
        break;
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}

/* generate helpers for fixed point instructions with OPIVX format */
#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
                 do_##NAME, ESZ);                         \
}

/* vsaddu.vx: unsigned saturating add, vector-scalar form */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2297 
2298 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2299 {
2300     int8_t res = a + b;
2301     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2302         res = a > 0 ? INT8_MAX : INT8_MIN;
2303         env->vxsat = 0x1;
2304     }
2305     return res;
2306 }
2307 
2308 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2309                              int16_t b)
2310 {
2311     int16_t res = a + b;
2312     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2313         res = a > 0 ? INT16_MAX : INT16_MIN;
2314         env->vxsat = 0x1;
2315     }
2316     return res;
2317 }
2318 
2319 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2320                              int32_t b)
2321 {
2322     int32_t res = a + b;
2323     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2324         res = a > 0 ? INT32_MAX : INT32_MIN;
2325         env->vxsat = 0x1;
2326     }
2327     return res;
2328 }
2329 
2330 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2331                              int64_t b)
2332 {
2333     int64_t res = a + b;
2334     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2335         res = a > 0 ? INT64_MAX : INT64_MIN;
2336         env->vxsat = 0x1;
2337     }
2338     return res;
2339 }
2340 
2341 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2342 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2343 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2344 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2345 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2346 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2347 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2348 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2349 
2350 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2351 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2352 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2353 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2354 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2355 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2356 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2357 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2358 
2359 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2360                              uint8_t b)
2361 {
2362     uint8_t res = a - b;
2363     if (res > a) {
2364         res = 0;
2365         env->vxsat = 0x1;
2366     }
2367     return res;
2368 }
2369 
2370 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2371                                uint16_t b)
2372 {
2373     uint16_t res = a - b;
2374     if (res > a) {
2375         res = 0;
2376         env->vxsat = 0x1;
2377     }
2378     return res;
2379 }
2380 
2381 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2382                                uint32_t b)
2383 {
2384     uint32_t res = a - b;
2385     if (res > a) {
2386         res = 0;
2387         env->vxsat = 0x1;
2388     }
2389     return res;
2390 }
2391 
2392 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2393                                uint64_t b)
2394 {
2395     uint64_t res = a - b;
2396     if (res > a) {
2397         res = 0;
2398         env->vxsat = 0x1;
2399     }
2400     return res;
2401 }
2402 
2403 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2404 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2405 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2406 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2407 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2408 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2409 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2410 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2411 
2412 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2413 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2414 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2415 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2416 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2417 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2418 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2419 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2420 
2421 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2422 {
2423     int8_t res = a - b;
2424     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2425         res = a >= 0 ? INT8_MAX : INT8_MIN;
2426         env->vxsat = 0x1;
2427     }
2428     return res;
2429 }
2430 
2431 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2432                              int16_t b)
2433 {
2434     int16_t res = a - b;
2435     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2436         res = a >= 0 ? INT16_MAX : INT16_MIN;
2437         env->vxsat = 0x1;
2438     }
2439     return res;
2440 }
2441 
2442 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2443                              int32_t b)
2444 {
2445     int32_t res = a - b;
2446     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2447         res = a >= 0 ? INT32_MAX : INT32_MIN;
2448         env->vxsat = 0x1;
2449     }
2450     return res;
2451 }
2452 
2453 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2454                              int64_t b)
2455 {
2456     int64_t res = a - b;
2457     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2458         res = a >= 0 ? INT64_MAX : INT64_MIN;
2459         env->vxsat = 0x1;
2460     }
2461     return res;
2462 }
2463 
2464 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2465 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2466 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2467 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2468 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2469 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2470 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2471 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2472 
2473 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2474 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2475 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2476 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2477 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2478 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2479 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2480 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2481 
2482 /* Vector Single-Width Averaging Add and Subtract */
2483 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2484 {
2485     uint8_t d = extract64(v, shift, 1);
2486     uint8_t d1;
2487     uint64_t D1, D2;
2488 
2489     if (shift == 0 || shift > 64) {
2490         return 0;
2491     }
2492 
2493     d1 = extract64(v, shift - 1, 1);
2494     D1 = extract64(v, 0, shift);
2495     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2496         return d1;
2497     } else if (vxrm == 1) { /* round-to-nearest-even */
2498         if (shift > 1) {
2499             D2 = extract64(v, 0, shift - 1);
2500             return d1 & ((D2 != 0) | d);
2501         } else {
2502             return d1 & d;
2503         }
2504     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2505         return !d & (D1 != 0);
2506     }
2507     return 0; /* round-down (truncate) */
2508 }
2509 
2510 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2511                              int32_t b)
2512 {
2513     int64_t res = (int64_t)a + b;
2514     uint8_t round = get_round(vxrm, res, 1);
2515 
2516     return (res >> 1) + round;
2517 }
2518 
2519 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2520                              int64_t b)
2521 {
2522     int64_t res = a + b;
2523     uint8_t round = get_round(vxrm, res, 1);
2524     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2525 
2526     /* With signed overflow, bit 64 is inverse of bit 63. */
2527     return ((res >> 1) ^ over) + round;
2528 }
2529 
2530 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2531 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2532 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2533 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2534 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2535 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2536 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2537 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2538 
2539 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2540 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2541 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2542 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2543 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2544 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2545 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2546 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2547 
2548 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2549                                uint32_t a, uint32_t b)
2550 {
2551     uint64_t res = (uint64_t)a + b;
2552     uint8_t round = get_round(vxrm, res, 1);
2553 
2554     return (res >> 1) + round;
2555 }
2556 
2557 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2558                                uint64_t a, uint64_t b)
2559 {
2560     uint64_t res = a + b;
2561     uint8_t round = get_round(vxrm, res, 1);
2562     uint64_t over = (uint64_t)(res < a) << 63;
2563 
2564     return ((res >> 1) | over) + round;
2565 }
2566 
2567 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2568 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2569 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2570 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2571 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2572 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2573 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2574 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2575 
2576 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2577 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2578 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2579 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2580 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2581 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2582 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2583 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2584 
2585 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2586                              int32_t b)
2587 {
2588     int64_t res = (int64_t)a - b;
2589     uint8_t round = get_round(vxrm, res, 1);
2590 
2591     return (res >> 1) + round;
2592 }
2593 
2594 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2595                              int64_t b)
2596 {
2597     int64_t res = (int64_t)a - b;
2598     uint8_t round = get_round(vxrm, res, 1);
2599     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2600 
2601     /* With signed overflow, bit 64 is inverse of bit 63. */
2602     return ((res >> 1) ^ over) + round;
2603 }
2604 
2605 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2606 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2607 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2608 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2609 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2610 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2611 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2612 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2613 
2614 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2615 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2616 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2617 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2618 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2619 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2620 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2621 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2622 
2623 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2624                                uint32_t a, uint32_t b)
2625 {
2626     int64_t res = (int64_t)a - b;
2627     uint8_t round = get_round(vxrm, res, 1);
2628 
2629     return (res >> 1) + round;
2630 }
2631 
2632 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2633                                uint64_t a, uint64_t b)
2634 {
2635     uint64_t res = (uint64_t)a - b;
2636     uint8_t round = get_round(vxrm, res, 1);
2637     uint64_t over = (uint64_t)(res > a) << 63;
2638 
2639     return ((res >> 1) | over) + round;
2640 }
2641 
2642 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2643 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2644 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2645 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2646 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2647 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2648 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2649 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2650 
2651 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2652 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2653 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2654 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2655 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2656 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2657 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2658 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2659 
2660 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2661 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2662 {
2663     uint8_t round;
2664     int16_t res;
2665 
2666     res = (int16_t)a * (int16_t)b;
2667     round = get_round(vxrm, res, 7);
2668     res = (res >> 7) + round;
2669 
2670     if (res > INT8_MAX) {
2671         env->vxsat = 0x1;
2672         return INT8_MAX;
2673     } else if (res < INT8_MIN) {
2674         env->vxsat = 0x1;
2675         return INT8_MIN;
2676     } else {
2677         return res;
2678     }
2679 }
2680 
2681 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2682 {
2683     uint8_t round;
2684     int32_t res;
2685 
2686     res = (int32_t)a * (int32_t)b;
2687     round = get_round(vxrm, res, 15);
2688     res = (res >> 15) + round;
2689 
2690     if (res > INT16_MAX) {
2691         env->vxsat = 0x1;
2692         return INT16_MAX;
2693     } else if (res < INT16_MIN) {
2694         env->vxsat = 0x1;
2695         return INT16_MIN;
2696     } else {
2697         return res;
2698     }
2699 }
2700 
2701 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2702 {
2703     uint8_t round;
2704     int64_t res;
2705 
2706     res = (int64_t)a * (int64_t)b;
2707     round = get_round(vxrm, res, 31);
2708     res = (res >> 31) + round;
2709 
2710     if (res > INT32_MAX) {
2711         env->vxsat = 0x1;
2712         return INT32_MAX;
2713     } else if (res < INT32_MIN) {
2714         env->vxsat = 0x1;
2715         return INT32_MIN;
2716     } else {
2717         return res;
2718     }
2719 }
2720 
/*
 * 64-bit fractional multiply: form the full 128-bit product with
 * muls64(), round at bit 63, and return bits [126:63].
 */
static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
{
    uint8_t round;
    uint64_t hi_64, lo_64;
    int64_t res;

    /*
     * (-1.0) * (-1.0) = +1.0 is not representable in the signed
     * fixed-point range; clamp to INT64_MAX with vxsat set.
     */
    if (a == INT64_MIN && b == INT64_MIN) {
        env->vxsat = 1;
        return INT64_MAX;
    }

    muls64(&lo_64, &hi_64, a, b);
    round = get_round(vxrm, lo_64, 63);
    /*
     * Cannot overflow, as there are always
     * 2 sign bits after multiply.
     */
    res = (hi_64 << 1) | (lo_64 >> 63);
    if (round) {
        if (res == INT64_MAX) {
            /* Rounding up would overflow: saturate instead. */
            env->vxsat = 1;
        } else {
            res += 1;
        }
    }
    return res;
}

/* vsmul: fractional multiply instantiations for each SEW */
RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
GEN_VEXT_VV_RM(vsmul_vv_b, 1)
GEN_VEXT_VV_RM(vsmul_vv_h, 2)
GEN_VEXT_VV_RM(vsmul_vv_w, 4)
GEN_VEXT_VV_RM(vsmul_vv_d, 8)

RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
GEN_VEXT_VX_RM(vsmul_vx_b, 1)
GEN_VEXT_VX_RM(vsmul_vx_h, 2)
GEN_VEXT_VX_RM(vsmul_vx_w, 4)
GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2766 
2767 /* Vector Single-Width Scaling Shift Instructions */
2768 static inline uint8_t
2769 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2770 {
2771     uint8_t round, shift = b & 0x7;
2772     uint8_t res;
2773 
2774     round = get_round(vxrm, a, shift);
2775     res = (a >> shift) + round;
2776     return res;
2777 }
2778 static inline uint16_t
2779 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2780 {
2781     uint8_t round, shift = b & 0xf;
2782 
2783     round = get_round(vxrm, a, shift);
2784     return (a >> shift) + round;
2785 }
2786 static inline uint32_t
2787 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2788 {
2789     uint8_t round, shift = b & 0x1f;
2790 
2791     round = get_round(vxrm, a, shift);
2792     return (a >> shift) + round;
2793 }
2794 static inline uint64_t
2795 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2796 {
2797     uint8_t round, shift = b & 0x3f;
2798 
2799     round = get_round(vxrm, a, shift);
2800     return (a >> shift) + round;
2801 }
2802 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2803 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2804 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2805 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2806 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2807 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2808 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2809 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2810 
2811 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2812 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2813 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2814 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2815 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2816 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2817 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2818 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2819 
2820 static inline int8_t
2821 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2822 {
2823     uint8_t round, shift = b & 0x7;
2824 
2825     round = get_round(vxrm, a, shift);
2826     return (a >> shift) + round;
2827 }
2828 static inline int16_t
2829 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2830 {
2831     uint8_t round, shift = b & 0xf;
2832 
2833     round = get_round(vxrm, a, shift);
2834     return (a >> shift) + round;
2835 }
2836 static inline int32_t
2837 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2838 {
2839     uint8_t round, shift = b & 0x1f;
2840 
2841     round = get_round(vxrm, a, shift);
2842     return (a >> shift) + round;
2843 }
2844 static inline int64_t
2845 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2846 {
2847     uint8_t round, shift = b & 0x3f;
2848 
2849     round = get_round(vxrm, a, shift);
2850     return (a >> shift) + round;
2851 }
2852 
2853 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2854 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2855 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2856 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2857 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2858 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2859 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2860 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2861 
2862 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2863 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2864 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2865 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2866 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2867 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2868 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2869 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2870 
2871 /* Vector Narrowing Fixed-Point Clip Instructions */
2872 static inline int8_t
2873 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2874 {
2875     uint8_t round, shift = b & 0xf;
2876     int16_t res;
2877 
2878     round = get_round(vxrm, a, shift);
2879     res = (a >> shift) + round;
2880     if (res > INT8_MAX) {
2881         env->vxsat = 0x1;
2882         return INT8_MAX;
2883     } else if (res < INT8_MIN) {
2884         env->vxsat = 0x1;
2885         return INT8_MIN;
2886     } else {
2887         return res;
2888     }
2889 }
2890 
2891 static inline int16_t
2892 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2893 {
2894     uint8_t round, shift = b & 0x1f;
2895     int32_t res;
2896 
2897     round = get_round(vxrm, a, shift);
2898     res = (a >> shift) + round;
2899     if (res > INT16_MAX) {
2900         env->vxsat = 0x1;
2901         return INT16_MAX;
2902     } else if (res < INT16_MIN) {
2903         env->vxsat = 0x1;
2904         return INT16_MIN;
2905     } else {
2906         return res;
2907     }
2908 }
2909 
2910 static inline int32_t
2911 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2912 {
2913     uint8_t round, shift = b & 0x3f;
2914     int64_t res;
2915 
2916     round = get_round(vxrm, a, shift);
2917     res = (a >> shift) + round;
2918     if (res > INT32_MAX) {
2919         env->vxsat = 0x1;
2920         return INT32_MAX;
2921     } else if (res < INT32_MIN) {
2922         env->vxsat = 0x1;
2923         return INT32_MIN;
2924     } else {
2925         return res;
2926     }
2927 }
2928 
2929 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2930 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2931 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2932 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2933 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2934 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2935 
2936 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2937 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2938 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2939 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2940 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2941 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2942 
2943 static inline uint8_t
2944 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2945 {
2946     uint8_t round, shift = b & 0xf;
2947     uint16_t res;
2948 
2949     round = get_round(vxrm, a, shift);
2950     res = (a >> shift) + round;
2951     if (res > UINT8_MAX) {
2952         env->vxsat = 0x1;
2953         return UINT8_MAX;
2954     } else {
2955         return res;
2956     }
2957 }
2958 
2959 static inline uint16_t
2960 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2961 {
2962     uint8_t round, shift = b & 0x1f;
2963     uint32_t res;
2964 
2965     round = get_round(vxrm, a, shift);
2966     res = (a >> shift) + round;
2967     if (res > UINT16_MAX) {
2968         env->vxsat = 0x1;
2969         return UINT16_MAX;
2970     } else {
2971         return res;
2972     }
2973 }
2974 
2975 static inline uint32_t
2976 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2977 {
2978     uint8_t round, shift = b & 0x3f;
2979     uint64_t res;
2980 
2981     round = get_round(vxrm, a, shift);
2982     res = (a >> shift) + round;
2983     if (res > UINT32_MAX) {
2984         env->vxsat = 0x1;
2985         return UINT32_MAX;
2986     } else {
2987         return res;
2988     }
2989 }
2990 
2991 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2992 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2993 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2994 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2995 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2996 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2997 
2998 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2999 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3000 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3001 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3002 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3003 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3004 
3005 /*
3006  * Vector Float Point Arithmetic Instructions
3007  */
3008 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3009 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3010 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3011                       CPURISCVState *env)                      \
3012 {                                                              \
3013     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3014     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3015     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3016 }
3017 
/*
 * GEN_VEXT_VV_ENV: emit the HELPER entry point for a vector-vector FP op
 * whose per-element body (do_<NAME>) needs the CPU env for fp_status.
 * Honors masking (masked-off elements are set to 1s per vma agnostic
 * policy), vstart, and sets tail elements to 1s per the vta policy.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, vs1, vs2, i, env);                  \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3045 
/* vfadd.vv for SEW = 16/32/64 */
RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3052 
/*
 * OPFVF2: emit do_<NAME>, the per-element body of a vector-scalar FP op.
 * The scalar rs1 value arrives as a uint64_t and is narrowed to T1;
 * result is OP(vs2[i], scalar, &env->fp_status).
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
}
3060 
/*
 * GEN_VEXT_VF: emit the HELPER entry point for a vector-scalar FP op.
 * Same masking / vstart / tail-agnostic handling as GEN_VEXT_VV_ENV,
 * but the second operand is the scalar s1 instead of a vector.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, s1, vs2, i, env);                   \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3088 
/* vfadd.vf */
RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
GEN_VEXT_VF(vfadd_vf_h, 2)
GEN_VEXT_VF(vfadd_vf_w, 4)
GEN_VEXT_VF(vfadd_vf_d, 8)

/* vfsub.vv / vfsub.vf */
RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
GEN_VEXT_VF(vfsub_vf_h, 2)
GEN_VEXT_VF(vfsub_vf_w, 4)
GEN_VEXT_VF(vfsub_vf_d, 8)
3108 
/*
 * vfrsub.vf helpers: operands swapped relative to float*_sub so the
 * result is b - a (scalar minus vector element).
 */
static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
{
    return float16_sub(b, a, s);
}

static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
{
    return float32_sub(b, a, s);
}

static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
{
    return float64_sub(b, a, s);
}

RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
GEN_VEXT_VF(vfrsub_vf_h, 2)
GEN_VEXT_VF(vfrsub_vf_w, 4)
GEN_VEXT_VF(vfrsub_vf_d, 8)
3130 
/* Vector Widening Floating-Point Add/Subtract Instructions */
/* vfwadd: widen both sources to 2*SEW, then add in the wider format */
static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
{
    return float32_add(float16_to_float32(a, true, s),
                       float16_to_float32(b, true, s), s);
}

static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
{
    return float64_add(float32_to_float64(a, s),
                       float32_to_float64(b, s), s);

}

RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
GEN_VEXT_VF(vfwadd_vf_h, 4)
GEN_VEXT_VF(vfwadd_vf_w, 8)
3153 
/* vfwsub: widen both sources to 2*SEW, then subtract in the wider format */
static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
{
    return float32_sub(float16_to_float32(a, true, s),
                       float16_to_float32(b, true, s), s);
}

static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
{
    return float64_sub(float32_to_float64(a, s),
                       float32_to_float64(b, s), s);

}

RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
GEN_VEXT_VF(vfwsub_vf_h, 4)
GEN_VEXT_VF(vfwsub_vf_w, 8)
3175 
/* vfwadd.w: first operand is already 2*SEW wide; only widen the second */
static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
{
    return float32_add(a, float16_to_float32(b, true, s), s);
}

static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
{
    return float64_add(a, float32_to_float64(b, s), s);
}

RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
GEN_VEXT_VF(vfwadd_wf_h, 4)
GEN_VEXT_VF(vfwadd_wf_w, 8)
3194 
/* vfwsub.w: first operand is already 2*SEW wide; only widen the second */
static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
{
    return float32_sub(a, float16_to_float32(b, true, s), s);
}

static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
{
    return float64_sub(a, float32_to_float64(b, s), s);
}

RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
GEN_VEXT_VF(vfwsub_wf_h, 4)
GEN_VEXT_VF(vfwsub_wf_w, 8)
3213 
/* Vector Single-Width Floating-Point Multiply/Divide Instructions */
/* vfmul.vv / vfmul.vf */
RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
GEN_VEXT_VF(vfmul_vf_h, 2)
GEN_VEXT_VF(vfmul_vf_w, 4)
GEN_VEXT_VF(vfmul_vf_d, 8)

/* vfdiv.vv / vfdiv.vf */
RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
GEN_VEXT_VF(vfdiv_vf_h, 2)
GEN_VEXT_VF(vfdiv_vf_w, 4)
GEN_VEXT_VF(vfdiv_vf_d, 8)
3240 
/*
 * vfrdiv.vf helpers: operands swapped relative to float*_div so the
 * result is b / a (scalar divided by vector element).
 */
static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
{
    return float16_div(b, a, s);
}

static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
{
    return float32_div(b, a, s);
}

static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
{
    return float64_div(b, a, s);
}

RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
GEN_VEXT_VF(vfrdiv_vf_h, 2)
GEN_VEXT_VF(vfrdiv_vf_w, 4)
GEN_VEXT_VF(vfrdiv_vf_d, 8)
3262 
/* Vector Widening Floating-Point Multiply */
/* vfwmul: widen both sources to 2*SEW, then multiply in the wider format */
static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
{
    return float32_mul(float16_to_float32(a, true, s),
                       float16_to_float32(b, true, s), s);
}

static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
{
    return float64_mul(float32_to_float64(a, s),
                       float32_to_float64(b, s), s);

}
RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
GEN_VEXT_VF(vfwmul_vf_h, 4)
GEN_VEXT_VF(vfwmul_vf_w, 8)
3284 
/* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3: emit do_<NAME>, the per-element body of a three-operand FP
 * vector-vector op.  The current destination element d is read and
 * passed to OP as the accumulator: OP(vs2[i], vs1[i], vd[i], fp_status).
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
                      CPURISCVState *env)                          \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
}
3295 
/* vfmacc: fused (a * b) + d with no negation flags */
static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(a, b, d, 0, s);
}

static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(a, b, d, 0, s);
}

static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(a, b, d, 0, s);
}

RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3317 
/*
 * OPFVF3: emit do_<NAME>, the per-element body of a three-operand FP
 * vector-scalar op: OP(vs2[i], scalar, vd[i], fp_status).
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
                      CPURISCVState *env)                         \
{                                                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
    TD d = *((TD *)vd + HD(i));                                   \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
}

RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
GEN_VEXT_VF(vfmacc_vf_h, 2)
GEN_VEXT_VF(vfmacc_vf_w, 4)
GEN_VEXT_VF(vfmacc_vf_d, 8)
3333 
/* vfnmacc: -(a * b) - d (both product and addend negated) */
static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(a, b, d, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(a, b, d, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(a, b, d, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
GEN_VEXT_VF(vfnmacc_vf_h, 2)
GEN_VEXT_VF(vfnmacc_vf_w, 4)
GEN_VEXT_VF(vfnmacc_vf_d, 8)
3364 
/* vfmsac: (a * b) - d (addend negated) */
static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(a, b, d, float_muladd_negate_c, s);
}

static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(a, b, d, float_muladd_negate_c, s);
}

static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(a, b, d, float_muladd_negate_c, s);
}

RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
GEN_VEXT_VF(vfmsac_vf_h, 2)
GEN_VEXT_VF(vfmsac_vf_w, 4)
GEN_VEXT_VF(vfmsac_vf_d, 8)
3392 
/* vfnmsac: -(a * b) + d (product negated) */
static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(a, b, d, float_muladd_negate_product, s);
}

static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(a, b, d, float_muladd_negate_product, s);
}

static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(a, b, d, float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
GEN_VEXT_VF(vfnmsac_vf_h, 2)
GEN_VEXT_VF(vfnmsac_vf_w, 4)
GEN_VEXT_VF(vfnmsac_vf_d, 8)
3420 
/* vfmadd: (d * b) + a — multiplies the destination, adds the source */
static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, 0, s);
}

static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, 0, s);
}

static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, 0, s);
}

RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
GEN_VEXT_VF(vfmadd_vf_h, 2)
GEN_VEXT_VF(vfmadd_vf_w, 4)
GEN_VEXT_VF(vfmadd_vf_d, 8)
3448 
/* vfnmadd: -(d * b) - a */
static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
GEN_VEXT_VF(vfnmadd_vf_h, 2)
GEN_VEXT_VF(vfnmadd_vf_w, 4)
GEN_VEXT_VF(vfnmadd_vf_d, 8)
3479 
/* vfmsub: (d * b) - a */
static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, float_muladd_negate_c, s);
}

static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, float_muladd_negate_c, s);
}

static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, float_muladd_negate_c, s);
}

RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
GEN_VEXT_VF(vfmsub_vf_h, 2)
GEN_VEXT_VF(vfmsub_vf_w, 4)
GEN_VEXT_VF(vfmsub_vf_d, 8)
3507 
/* vfnmsub: -(d * b) + a */
static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, float_muladd_negate_product, s);
}

static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, float_muladd_negate_product, s);
}

static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
GEN_VEXT_VF(vfnmsub_vf_h, 2)
GEN_VEXT_VF(vfnmsub_vf_w, 4)
GEN_VEXT_VF(vfnmsub_vf_d, 8)
3535 
/* Vector Widening Floating-Point Fused Multiply-Add Instructions */
/* vfwmacc: widen sources to 2*SEW, then (a * b) + d in the wider format */
static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
{
    return float32_muladd(float16_to_float32(a, true, s),
                          float16_to_float32(b, true, s), d, 0, s);
}

static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
{
    return float64_muladd(float32_to_float64(a, s),
                          float32_to_float64(b, s), d, 0, s);
}

RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
GEN_VEXT_VF(vfwmacc_vf_h, 4)
GEN_VEXT_VF(vfwmacc_vf_w, 8)
3557 
3558 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3559 {
3560     return float32_muladd(bfloat16_to_float32(a, s),
3561                           bfloat16_to_float32(b, s), d, 0, s);
3562 }
3563 
3564 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3565 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3566 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmacc16)
3567 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3568 
/* vfwnmacc: widen sources, then -(a * b) - d in the wider format */
static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
{
    return float32_muladd(float16_to_float32(a, true, s),
                          float16_to_float32(b, true, s), d,
                          float_muladd_negate_c | float_muladd_negate_product,
                          s);
}

static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
{
    return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
                          d, float_muladd_negate_c |
                             float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
GEN_VEXT_VF(vfwnmacc_vf_h, 4)
GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3592 
/* vfwmsac: widen sources, then (a * b) - d in the wider format */
static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
{
    return float32_muladd(float16_to_float32(a, true, s),
                          float16_to_float32(b, true, s), d,
                          float_muladd_negate_c, s);
}

static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
{
    return float64_muladd(float32_to_float64(a, s),
                          float32_to_float64(b, s), d,
                          float_muladd_negate_c, s);
}

RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
GEN_VEXT_VF(vfwmsac_vf_h, 4)
GEN_VEXT_VF(vfwmsac_vf_w, 8)
3615 
/* vfwnmsac: widen sources, then -(a * b) + d in the wider format */
static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
{
    return float32_muladd(float16_to_float32(a, true, s),
                          float16_to_float32(b, true, s), d,
                          float_muladd_negate_product, s);
}

static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
{
    return float64_muladd(float32_to_float64(a, s),
                          float32_to_float64(b, s), d,
                          float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
GEN_VEXT_VF(vfwnmsac_vf_h, 4)
GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3638 
/* Vector Floating-Point Square-Root Instruction */
/* (TD, T2, TX2) */
#define OP_UU_H uint16_t, uint16_t, uint16_t
#define OP_UU_W uint32_t, uint32_t, uint32_t
#define OP_UU_D uint64_t, uint64_t, uint64_t

/*
 * OPFVV1: emit do_<NAME>, the per-element body of a single-source FP
 * vector op: vd[i] = OP(vs2[i], &env->fp_status).
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i,      \
                      CPURISCVState *env)              \
{                                                      \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
    *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
}
3651 }
3652 
3653 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3654 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3655                   CPURISCVState *env, uint32_t desc)   \
3656 {                                                      \
3657     uint32_t vm = vext_vm(desc);                       \
3658     uint32_t vl = env->vl;                             \
3659     uint32_t total_elems =                             \
3660         vext_get_total_elems(env, desc, ESZ);          \
3661     uint32_t vta = vext_vta(desc);                     \
3662     uint32_t vma = vext_vma(desc);                     \
3663     uint32_t i;                                        \
3664                                                        \
3665     if (vl == 0) {                                     \
3666         return;                                        \
3667     }                                                  \
3668     for (i = env->vstart; i < vl; i++) {               \
3669         if (!vm && !vext_elem_mask(v0, i)) {           \
3670             /* set masked-off elements to 1s */        \
3671             vext_set_elems_1s(vd, vma, i * ESZ,        \
3672                               (i + 1) * ESZ);          \
3673             continue;                                  \
3674         }                                              \
3675         do_##NAME(vd, vs2, i, env);                    \
3676     }                                                  \
3677     env->vstart = 0;                                   \
3678     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3679                       total_elems * ESZ);              \
3680 }
3681 
/* vfsqrt.v */
RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3688 
3689 /*
3690  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3691  *
3692  * Adapted from riscv-v-spec recip.c:
3693  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3694  */
/*
 * Core of vfrsqrt7.v: compute a 7-bit-accurate reciprocal square-root
 * estimate of an FP value given its exponent and fraction field widths.
 * Callers have already filtered out NaN / infinity / zero / negative
 * inputs, so only positive normals and subnormals reach this point.
 */
static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
{
    uint64_t sign = extract64(f, frac_size + exp_size, 1);
    uint64_t exp = extract64(f, frac_size, exp_size);
    uint64_t frac = extract64(f, 0, frac_size);

    /* 128-entry estimate table indexed by {exp LSB, top 6 fraction bits} */
    const uint8_t lookup_table[] = {
        52, 51, 50, 48, 47, 46, 44, 43,
        42, 41, 40, 39, 38, 36, 35, 34,
        33, 32, 31, 30, 30, 29, 28, 27,
        26, 25, 24, 23, 23, 22, 21, 20,
        19, 19, 18, 17, 16, 16, 15, 14,
        14, 13, 12, 12, 11, 10, 10, 9,
        9, 8, 7, 7, 6, 6, 5, 4,
        4, 3, 3, 2, 2, 1, 1, 0,
        127, 125, 123, 121, 119, 118, 116, 114,
        113, 111, 109, 108, 106, 105, 103, 102,
        100, 99, 97, 96, 95, 93, 92, 91,
        90, 88, 87, 86, 85, 84, 83, 82,
        80, 79, 78, 77, 76, 75, 74, 73,
        72, 71, 70, 70, 69, 68, 67, 66,
        65, 64, 63, 63, 62, 61, 60, 59,
        59, 58, 57, 56, 56, 55, 54, 53
    };
    const int precision = 7;

    if (exp == 0 && frac != 0) { /* subnormal */
        /*
         * Normalize the subnormal.  exp starts at 0 and is decremented,
         * wrapping as an unsigned value; the out_exp formula below uses
         * ~exp, which is consistent with this modular representation.
         */
        while (extract64(frac, frac_size - 1, 1) == 0) {
            exp--;
            frac <<= 1;
        }

        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
    }

    /* table index: exponent parity in the top bit, then MSBs of frac */
    int idx = ((exp & 1) << (precision - 1)) |
              (frac >> (frac_size - precision + 1));
    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
                        (frac_size - precision);
    /* estimate exponent: (3 * bias - 1 - exp) / 2, with ~exp == -exp - 1 */
    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;

    /* reassemble sign | out_exp | out_frac into an FP bit pattern */
    uint64_t val = 0;
    val = deposit64(val, 0, frac_size, out_frac);
    val = deposit64(val, frac_size, exp_size, out_exp);
    val = deposit64(val, frac_size + exp_size, 1, sign);
    return val;
}
3743 
/*
 * vfrsqrt7.v for float16: handle the IEEE special cases mandated by the
 * spec, then defer to frsqrt7() for positive normals and subnormals.
 * The special-case checks must stay in this order (sNaN before qNaN).
 */
static float16 frsqrt7_h(float16 f, float_status *s)
{
    int exp_size = 5, frac_size = 10;
    bool sign = float16_is_neg(f);

    /*
     * frsqrt7(sNaN) = canonical NaN
     * frsqrt7(-inf) = canonical NaN
     * frsqrt7(-normal) = canonical NaN
     * frsqrt7(-subnormal) = canonical NaN
     */
    if (float16_is_signaling_nan(f, s) ||
        (float16_is_infinity(f) && sign) ||
        (float16_is_normal(f) && sign) ||
        (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
        s->float_exception_flags |= float_flag_invalid;
        return float16_default_nan(s);
    }

    /* frsqrt7(qNaN) = canonical NaN */
    if (float16_is_quiet_nan(f, s)) {
        return float16_default_nan(s);
    }

    /* frsqrt7(+-0) = +-inf */
    if (float16_is_zero(f)) {
        s->float_exception_flags |= float_flag_divbyzero;
        return float16_set_sign(float16_infinity, sign);
    }

    /* frsqrt7(+inf) = +0 */
    if (float16_is_infinity(f) && !sign) {
        return float16_set_sign(float16_zero, sign);
    }

    /* +normal, +subnormal */
    uint64_t val = frsqrt7(f, exp_size, frac_size);
    return make_float16(val);
}
3783 
3784 static float32 frsqrt7_s(float32 f, float_status *s)
3785 {
3786     int exp_size = 8, frac_size = 23;
3787     bool sign = float32_is_neg(f);
3788 
3789     /*
3790      * frsqrt7(sNaN) = canonical NaN
3791      * frsqrt7(-inf) = canonical NaN
3792      * frsqrt7(-normal) = canonical NaN
3793      * frsqrt7(-subnormal) = canonical NaN
3794      */
3795     if (float32_is_signaling_nan(f, s) ||
3796         (float32_is_infinity(f) && sign) ||
3797         (float32_is_normal(f) && sign) ||
3798         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3799         s->float_exception_flags |= float_flag_invalid;
3800         return float32_default_nan(s);
3801     }
3802 
3803     /* frsqrt7(qNaN) = canonical NaN */
3804     if (float32_is_quiet_nan(f, s)) {
3805         return float32_default_nan(s);
3806     }
3807 
3808     /* frsqrt7(+-0) = +-inf */
3809     if (float32_is_zero(f)) {
3810         s->float_exception_flags |= float_flag_divbyzero;
3811         return float32_set_sign(float32_infinity, sign);
3812     }
3813 
3814     /* frsqrt7(+inf) = +0 */
3815     if (float32_is_infinity(f) && !sign) {
3816         return float32_set_sign(float32_zero, sign);
3817     }
3818 
3819     /* +normal, +subnormal */
3820     uint64_t val = frsqrt7(f, exp_size, frac_size);
3821     return make_float32(val);
3822 }
3823 
3824 static float64 frsqrt7_d(float64 f, float_status *s)
3825 {
3826     int exp_size = 11, frac_size = 52;
3827     bool sign = float64_is_neg(f);
3828 
3829     /*
3830      * frsqrt7(sNaN) = canonical NaN
3831      * frsqrt7(-inf) = canonical NaN
3832      * frsqrt7(-normal) = canonical NaN
3833      * frsqrt7(-subnormal) = canonical NaN
3834      */
3835     if (float64_is_signaling_nan(f, s) ||
3836         (float64_is_infinity(f) && sign) ||
3837         (float64_is_normal(f) && sign) ||
3838         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3839         s->float_exception_flags |= float_flag_invalid;
3840         return float64_default_nan(s);
3841     }
3842 
3843     /* frsqrt7(qNaN) = canonical NaN */
3844     if (float64_is_quiet_nan(f, s)) {
3845         return float64_default_nan(s);
3846     }
3847 
3848     /* frsqrt7(+-0) = +-inf */
3849     if (float64_is_zero(f)) {
3850         s->float_exception_flags |= float_flag_divbyzero;
3851         return float64_set_sign(float64_infinity, sign);
3852     }
3853 
3854     /* frsqrt7(+inf) = +0 */
3855     if (float64_is_infinity(f) && !sign) {
3856         return float64_set_sign(float64_zero, sign);
3857     }
3858 
3859     /* +normal, +subnormal */
3860     uint64_t val = frsqrt7(f, exp_size, frac_size);
3861     return make_float64(val);
3862 }
3863 
/* vfrsqrt7.v vd, vs2, vm # 7-bit reciprocal square-root estimate. */
RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3870 
3871 /*
3872  * Vector Floating-Point Reciprocal Estimate Instruction
3873  *
3874  * Adapted from riscv-v-spec recip.c:
3875  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3876  */
3877 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3878                       float_status *s)
3879 {
3880     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3881     uint64_t exp = extract64(f, frac_size, exp_size);
3882     uint64_t frac = extract64(f, 0, frac_size);
3883 
3884     const uint8_t lookup_table[] = {
3885         127, 125, 123, 121, 119, 117, 116, 114,
3886         112, 110, 109, 107, 105, 104, 102, 100,
3887         99, 97, 96, 94, 93, 91, 90, 88,
3888         87, 85, 84, 83, 81, 80, 79, 77,
3889         76, 75, 74, 72, 71, 70, 69, 68,
3890         66, 65, 64, 63, 62, 61, 60, 59,
3891         58, 57, 56, 55, 54, 53, 52, 51,
3892         50, 49, 48, 47, 46, 45, 44, 43,
3893         42, 41, 40, 40, 39, 38, 37, 36,
3894         35, 35, 34, 33, 32, 31, 31, 30,
3895         29, 28, 28, 27, 26, 25, 25, 24,
3896         23, 23, 22, 21, 21, 20, 19, 19,
3897         18, 17, 17, 16, 15, 15, 14, 14,
3898         13, 12, 12, 11, 11, 10, 9, 9,
3899         8, 8, 7, 7, 6, 5, 5, 4,
3900         4, 3, 3, 2, 2, 1, 1, 0
3901     };
3902     const int precision = 7;
3903 
3904     if (exp == 0 && frac != 0) { /* subnormal */
3905         /* Normalize the subnormal. */
3906         while (extract64(frac, frac_size - 1, 1) == 0) {
3907             exp--;
3908             frac <<= 1;
3909         }
3910 
3911         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3912 
3913         if (exp != 0 && exp != UINT64_MAX) {
3914             /*
3915              * Overflow to inf or max value of same sign,
3916              * depending on sign and rounding mode.
3917              */
3918             s->float_exception_flags |= (float_flag_inexact |
3919                                          float_flag_overflow);
3920 
3921             if ((s->float_rounding_mode == float_round_to_zero) ||
3922                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3923                 ((s->float_rounding_mode == float_round_up) && sign)) {
3924                 /* Return greatest/negative finite value. */
3925                 return (sign << (exp_size + frac_size)) |
3926                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3927             } else {
3928                 /* Return +-inf. */
3929                 return (sign << (exp_size + frac_size)) |
3930                        MAKE_64BIT_MASK(frac_size, exp_size);
3931             }
3932         }
3933     }
3934 
3935     int idx = frac >> (frac_size - precision);
3936     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3937                         (frac_size - precision);
3938     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3939 
3940     if (out_exp == 0 || out_exp == UINT64_MAX) {
3941         /*
3942          * The result is subnormal, but don't raise the underflow exception,
3943          * because there's no additional loss of precision.
3944          */
3945         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3946         if (out_exp == UINT64_MAX) {
3947             out_frac >>= 1;
3948             out_exp = 0;
3949         }
3950     }
3951 
3952     uint64_t val = 0;
3953     val = deposit64(val, 0, frac_size, out_frac);
3954     val = deposit64(val, frac_size, exp_size, out_exp);
3955     val = deposit64(val, frac_size + exp_size, 1, sign);
3956     return val;
3957 }
3958 
3959 static float16 frec7_h(float16 f, float_status *s)
3960 {
3961     int exp_size = 5, frac_size = 10;
3962     bool sign = float16_is_neg(f);
3963 
3964     /* frec7(+-inf) = +-0 */
3965     if (float16_is_infinity(f)) {
3966         return float16_set_sign(float16_zero, sign);
3967     }
3968 
3969     /* frec7(+-0) = +-inf */
3970     if (float16_is_zero(f)) {
3971         s->float_exception_flags |= float_flag_divbyzero;
3972         return float16_set_sign(float16_infinity, sign);
3973     }
3974 
3975     /* frec7(sNaN) = canonical NaN */
3976     if (float16_is_signaling_nan(f, s)) {
3977         s->float_exception_flags |= float_flag_invalid;
3978         return float16_default_nan(s);
3979     }
3980 
3981     /* frec7(qNaN) = canonical NaN */
3982     if (float16_is_quiet_nan(f, s)) {
3983         return float16_default_nan(s);
3984     }
3985 
3986     /* +-normal, +-subnormal */
3987     uint64_t val = frec7(f, exp_size, frac_size, s);
3988     return make_float16(val);
3989 }
3990 
3991 static float32 frec7_s(float32 f, float_status *s)
3992 {
3993     int exp_size = 8, frac_size = 23;
3994     bool sign = float32_is_neg(f);
3995 
3996     /* frec7(+-inf) = +-0 */
3997     if (float32_is_infinity(f)) {
3998         return float32_set_sign(float32_zero, sign);
3999     }
4000 
4001     /* frec7(+-0) = +-inf */
4002     if (float32_is_zero(f)) {
4003         s->float_exception_flags |= float_flag_divbyzero;
4004         return float32_set_sign(float32_infinity, sign);
4005     }
4006 
4007     /* frec7(sNaN) = canonical NaN */
4008     if (float32_is_signaling_nan(f, s)) {
4009         s->float_exception_flags |= float_flag_invalid;
4010         return float32_default_nan(s);
4011     }
4012 
4013     /* frec7(qNaN) = canonical NaN */
4014     if (float32_is_quiet_nan(f, s)) {
4015         return float32_default_nan(s);
4016     }
4017 
4018     /* +-normal, +-subnormal */
4019     uint64_t val = frec7(f, exp_size, frac_size, s);
4020     return make_float32(val);
4021 }
4022 
4023 static float64 frec7_d(float64 f, float_status *s)
4024 {
4025     int exp_size = 11, frac_size = 52;
4026     bool sign = float64_is_neg(f);
4027 
4028     /* frec7(+-inf) = +-0 */
4029     if (float64_is_infinity(f)) {
4030         return float64_set_sign(float64_zero, sign);
4031     }
4032 
4033     /* frec7(+-0) = +-inf */
4034     if (float64_is_zero(f)) {
4035         s->float_exception_flags |= float_flag_divbyzero;
4036         return float64_set_sign(float64_infinity, sign);
4037     }
4038 
4039     /* frec7(sNaN) = canonical NaN */
4040     if (float64_is_signaling_nan(f, s)) {
4041         s->float_exception_flags |= float_flag_invalid;
4042         return float64_default_nan(s);
4043     }
4044 
4045     /* frec7(qNaN) = canonical NaN */
4046     if (float64_is_quiet_nan(f, s)) {
4047         return float64_default_nan(s);
4048     }
4049 
4050     /* +-normal, +-subnormal */
4051     uint64_t val = frec7(f, exp_size, frac_size, s);
4052     return make_float64(val);
4053 }
4054 
/* vfrec7.v vd, vs2, vm # 7-bit reciprocal estimate. */
RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
GEN_VEXT_V_ENV(vfrec7_v_h, 2)
GEN_VEXT_V_ENV(vfrec7_v_w, 4)
GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4061 
/* Vector Floating-Point MIN/MAX Instructions */
/*
 * NOTE(review): these use the softfloat *_minimum_number/_maximum_number
 * helpers, which by name correspond to IEEE 754-2019 minimumNumber /
 * maximumNumber (a number is preferred over a quiet NaN) -- confirm
 * against fpu/softfloat.h.
 */
RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
GEN_VEXT_VF(vfmin_vf_h, 2)
GEN_VEXT_VF(vfmin_vf_w, 4)
GEN_VEXT_VF(vfmin_vf_d, 8)

RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
GEN_VEXT_VF(vfmax_vf_h, 2)
GEN_VEXT_VF(vfmax_vf_w, 4)
GEN_VEXT_VF(vfmax_vf_d, 8)
4088 
4089 /* Vector Floating-Point Sign-Injection Instructions */
4090 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4091 {
4092     return deposit64(b, 0, 15, a);
4093 }
4094 
4095 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4096 {
4097     return deposit64(b, 0, 31, a);
4098 }
4099 
4100 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4101 {
4102     return deposit64(b, 0, 63, a);
4103 }
4104 
/* vfsgnj.{vv,vf}: sign-injection, see fsgnj16/32/64 above. */
RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
GEN_VEXT_VF(vfsgnj_vf_h, 2)
GEN_VEXT_VF(vfsgnj_vf_w, 4)
GEN_VEXT_VF(vfsgnj_vf_d, 8)
4117 
4118 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4119 {
4120     return deposit64(~b, 0, 15, a);
4121 }
4122 
4123 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4124 {
4125     return deposit64(~b, 0, 31, a);
4126 }
4127 
4128 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4129 {
4130     return deposit64(~b, 0, 63, a);
4131 }
4132 
/* vfsgnjn.{vv,vf}: negated sign-injection, see fsgnjn16/32/64 above. */
RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
GEN_VEXT_VF(vfsgnjn_vf_h, 2)
GEN_VEXT_VF(vfsgnjn_vf_w, 4)
GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4145 
4146 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4147 {
4148     return deposit64(b ^ a, 0, 15, a);
4149 }
4150 
4151 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4152 {
4153     return deposit64(b ^ a, 0, 31, a);
4154 }
4155 
4156 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4157 {
4158     return deposit64(b ^ a, 0, 63, a);
4159 }
4160 
/* vfsgnjx.{vv,vf}: XOR sign-injection, see fsgnjx16/32/64 above. */
RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
GEN_VEXT_VF(vfsgnjx_vf_h, 2)
GEN_VEXT_VF(vfsgnjx_vf_w, 4)
GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4173 
/* Vector Floating-Point Compare Instructions */
/*
 * Generate a helper for a vector-vector FP compare: each active element
 * pair (vs2[i], vs1[i]) is compared with DO_OP and the boolean result
 * is written to mask bit i of vd.  Masked-off elements follow the
 * mask-agnostic policy (vma); tail mask bits are set to 1 when the
 * tail-agnostic all-1s policy is in effect.
 */
#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            /* set masked-off elements to 1s */               \
            if (vma) {                                        \
                vext_set_elem_mask(vd, i, 1);                 \
            }                                                 \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i,                             \
                           DO_OP(s2, s1, &env->fp_status));   \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4214 
/*
 * Generate a helper for a vector-scalar FP compare: each active element
 * vs2[i] is compared against the scalar s1 with DO_OP and the boolean
 * result is written to mask bit i of vd.  Mask/tail policies match
 * GEN_VEXT_CMP_VV_ENV.
 */
#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            /* set masked-off elements to 1s */                     \
            if (vma) {                                              \
                vext_set_elem_mask(vd, i, 1);                       \
            }                                                       \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                           DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
    }                                                               \
    env->vstart = 0;                                                \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                             \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}

GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4253 
4254 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4255 {
4256     FloatRelation compare = float16_compare_quiet(a, b, s);
4257     return compare != float_relation_equal;
4258 }
4259 
4260 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4261 {
4262     FloatRelation compare = float32_compare_quiet(a, b, s);
4263     return compare != float_relation_equal;
4264 }
4265 
4266 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4267 {
4268     FloatRelation compare = float64_compare_quiet(a, b, s);
4269     return compare != float_relation_equal;
4270 }
4271 
/* vmfne/vmflt/vmfle, expanded for 16/32/64-bit elements. */
GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)

GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)

GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4292 
4293 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4294 {
4295     FloatRelation compare = float16_compare(a, b, s);
4296     return compare == float_relation_greater;
4297 }
4298 
4299 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4300 {
4301     FloatRelation compare = float32_compare(a, b, s);
4302     return compare == float_relation_greater;
4303 }
4304 
4305 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4306 {
4307     FloatRelation compare = float64_compare(a, b, s);
4308     return compare == float_relation_greater;
4309 }
4310 
/* vmfgt: only the vector-scalar form is generated. */
GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4314 
4315 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4316 {
4317     FloatRelation compare = float16_compare(a, b, s);
4318     return compare == float_relation_greater ||
4319            compare == float_relation_equal;
4320 }
4321 
4322 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4323 {
4324     FloatRelation compare = float32_compare(a, b, s);
4325     return compare == float_relation_greater ||
4326            compare == float_relation_equal;
4327 }
4328 
4329 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4330 {
4331     FloatRelation compare = float64_compare(a, b, s);
4332     return compare == float_relation_greater ||
4333            compare == float_relation_equal;
4334 }
4335 
/* vmfge: only the vector-scalar form is generated. */
GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4339 
/* Vector Floating-Point Classify Instruction */
/*
 * OPIVV1: per-element body for a single-source op with no FP status:
 * read element i of vs2, apply OP, store to element i of vd.
 */
#define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i)      \
{                                                      \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
    *((TD *)vd + HD(i)) = OP(s2);                      \
}
4347 
/*
 * GEN_VEXT_V: helper wrapper for a single-source vector op that does not
 * touch FP status.  Applies do_NAME to each active element, fills
 * masked-off elements with 1s per the mask-agnostic policy, and fills
 * tail elements with 1s per the tail-agnostic policy.
 */
#define GEN_VEXT_V(NAME, ESZ)                          \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)   \
{                                                      \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t vma = vext_vma(desc);                     \
    uint32_t i;                                        \
                                                       \
    for (i = env->vstart; i < vl; i++) {               \
        if (!vm && !vext_elem_mask(v0, i)) {           \
            /* set masked-off elements to 1s */        \
            vext_set_elems_1s(vd, vma, i * ESZ,        \
                              (i + 1) * ESZ);          \
            continue;                                  \
        }                                              \
        do_##NAME(vd, vs2, i);                         \
    }                                                  \
    env->vstart = 0;                                   \
    /* set tail elements to 1s */                      \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
4374 
/*
 * Classify a half-precision value into the RISC-V fclass bit mask:
 *  bit 0: -inf               bit 1: negative normal
 *  bit 2: negative subnormal bit 3: -0
 *  bit 4: +0                 bit 5: positive subnormal
 *  bit 6: positive normal    bit 7: +inf
 *  bit 8: signaling NaN      bit 9: quiet NaN
 * Exactly one bit is set in the result.
 */
target_ulong fclass_h(uint64_t frs1)
{
    float16 f = frs1;
    bool sign = float16_is_neg(f);

    if (float16_is_infinity(f)) {
        return sign ? 1 << 0 : 1 << 7;
    } else if (float16_is_zero(f)) {
        return sign ? 1 << 3 : 1 << 4;
    } else if (float16_is_zero_or_denormal(f)) {
        /* Zero was handled above, so this is a subnormal. */
        return sign ? 1 << 2 : 1 << 5;
    } else if (float16_is_any_nan(f)) {
        float_status s = { }; /* for snan_bit_is_one */
        return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
    } else {
        return sign ? 1 << 1 : 1 << 6;
    }
}
4393 
/*
 * Classify a single-precision value into the RISC-V fclass bit mask;
 * see fclass_h for the meaning of each bit.
 */
target_ulong fclass_s(uint64_t frs1)
{
    float32 f = frs1;
    bool sign = float32_is_neg(f);

    if (float32_is_infinity(f)) {
        return sign ? 1 << 0 : 1 << 7;
    } else if (float32_is_zero(f)) {
        return sign ? 1 << 3 : 1 << 4;
    } else if (float32_is_zero_or_denormal(f)) {
        /* Zero was handled above, so this is a subnormal. */
        return sign ? 1 << 2 : 1 << 5;
    } else if (float32_is_any_nan(f)) {
        float_status s = { }; /* for snan_bit_is_one */
        return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
    } else {
        return sign ? 1 << 1 : 1 << 6;
    }
}
4412 
/*
 * Classify a double-precision value into the RISC-V fclass bit mask;
 * see fclass_h for the meaning of each bit.
 */
target_ulong fclass_d(uint64_t frs1)
{
    float64 f = frs1;
    bool sign = float64_is_neg(f);

    if (float64_is_infinity(f)) {
        return sign ? 1 << 0 : 1 << 7;
    } else if (float64_is_zero(f)) {
        return sign ? 1 << 3 : 1 << 4;
    } else if (float64_is_zero_or_denormal(f)) {
        /* Zero was handled above, so this is a subnormal. */
        return sign ? 1 << 2 : 1 << 5;
    } else if (float64_is_any_nan(f)) {
        float_status s = { }; /* for snan_bit_is_one */
        return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
    } else {
        return sign ? 1 << 1 : 1 << 6;
    }
}
4431 
/* vfclass.v vd, vs2, vm # classify each element, see fclass_h/s/d. */
RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
GEN_VEXT_V(vfclass_v_h, 2)
GEN_VEXT_V(vfclass_v_w, 4)
GEN_VEXT_V(vfclass_v_d, 8)
4438 
/* Vector Floating-Point Merge Instruction */

/*
 * GEN_VFMERGE_VF: for each element up to vl, an element whose mask bit
 * is clear (with vm == 0) keeps the vs2 value, while an active element
 * receives the scalar s1.  Tail elements follow the vta policy.
 */
#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        *((ETYPE *)vd + H(i)) =                               \
            (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}

GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4466 
/* Single-Width Floating-Point/Integer Type-Convert Instructions */
/* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)

/* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)

/* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)

/* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)

/* Widening Floating-Point/Integer Type-Convert Instructions */
/* (TD, T2, TX2): destination type is double the source element width. */
#define WOP_UU_B uint16_t, uint8_t,  uint8_t
#define WOP_UU_H uint32_t, uint16_t, uint16_t
#define WOP_UU_W uint64_t, uint32_t, uint32_t
/*
 * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
 */
RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)

/* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)

/*
 * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
 */
RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)

/* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4536 
4537 /*
4538  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4539  */
4540 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4541 {
4542     return float16_to_float32(a, true, s);
4543 }
4544 
4545 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4546 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4547 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4548 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4549 
4550 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4551 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4552 
/* Narrowing Floating-Point/Integer Type-Convert Instructions */
/*
 * (TD, T2, TX2): the source type T2 is double the destination width,
 * so the source index macro is one step wider than the destination's.
 */
#define NOP_UU_B uint8_t,  uint16_t, uint32_t
#define NOP_UU_H uint16_t, uint32_t, uint32_t
#define NOP_UU_W uint32_t, uint64_t, uint64_t
/* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)

/* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)

/*
 * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
 */
RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)

/* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4587 
/* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
/*
 * Adapter: softfloat's float32_to_float16() takes an extra flag, so it
 * cannot be used directly as the OPFVV1 callback.  The 'true' argument
 * selects IEEE half-precision output.
 */
static uint16_t vfncvtffv16(uint32_t a, float_status *s)
{
    return float32_to_float16(a, true, s);
}

RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)

/* vfncvtbf16.f.f.w: narrowing convert to a bfloat16 result (f32 -> bf16). */
RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4601 
4602 /*
4603  * Vector Reduction Operations
4604  */
4605 /* Vector Single-Width Integer Reduction Instructions */
4606 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4607 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4608                   void *vs2, CPURISCVState *env,          \
4609                   uint32_t desc)                          \
4610 {                                                         \
4611     uint32_t vm = vext_vm(desc);                          \
4612     uint32_t vl = env->vl;                                \
4613     uint32_t esz = sizeof(TD);                            \
4614     uint32_t vlenb = simd_maxsz(desc);                    \
4615     uint32_t vta = vext_vta(desc);                        \
4616     uint32_t i;                                           \
4617     TD s1 =  *((TD *)vs1 + HD(0));                        \
4618                                                           \
4619     for (i = env->vstart; i < vl; i++) {                  \
4620         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4621         if (!vm && !vext_elem_mask(v0, i)) {              \
4622             continue;                                     \
4623         }                                                 \
4624         s1 = OP(s1, (TD)s2);                              \
4625     }                                                     \
4626     *((TD *)vd + HD(0)) = s1;                             \
4627     env->vstart = 0;                                      \
4628     /* set tail elements to 1s */                         \
4629     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4630 }
4631 
/* vd[0] = sum(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)

/* vd[0] = maxu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)

/* vd[0] = max(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)

/* vd[0] = minu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)

/* vd[0] = min(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)

/* vd[0] = and(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)

/* vd[0] = or(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)

/* vd[0] = xor(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)

/* Vector Widening Integer Reduction Instructions */
/*
 * The widening forms use a destination/accumulator type (TD) twice the
 * width of the source element type (TS2); DO_ADD then operates at the
 * wider type because s2 is cast to TD inside GEN_VEXT_RED.
 */
/* signed sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)

/* Unsigned sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4690 
/* Vector Single-Width Floating-Point Reduction Instructions */
/*
 * Same shape as GEN_VEXT_RED, but OP is a softfloat operation that
 * takes &env->fp_status, so rounding mode and accrued exception flags
 * come from (and accumulate into) the CPU state.  The loop folds
 * elements strictly in index order; the unordered-sum variants reuse
 * the same sequential loop.
 */
#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
                  void *vs2, CPURISCVState *env,           \
                  uint32_t desc)                           \
{                                                          \
    uint32_t vm = vext_vm(desc);                           \
    uint32_t vl = env->vl;                                 \
    uint32_t esz = sizeof(TD);                             \
    uint32_t vlenb = simd_maxsz(desc);                     \
    uint32_t vta = vext_vta(desc);                         \
    uint32_t i;                                            \
    TD s1 =  *((TD *)vs1 + HD(0));                         \
                                                           \
    for (i = env->vstart; i < vl; i++) {                   \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
        if (!vm && !vext_elem_mask(v0, i)) {               \
            continue;                                      \
        }                                                  \
        s1 = OP(s1, (TD)s2, &env->fp_status);              \
    }                                                      \
    *((TD *)vd + HD(0)) = s1;                              \
    env->vstart = 0;                                       \
    /* set tail elements to 1s */                          \
    vext_set_elems_1s(vd, vta, esz, vlenb);                \
}
4717 
/* Unordered sum (implemented with the same in-order loop) */
GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Ordered sum */
GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Maximum value (softfloat's maximumNumber operation) */
GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
              float16_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
              float32_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
              float64_maximum_number)

/* Minimum value (softfloat's minimumNumber operation) */
GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
              float16_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
              float32_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
              float64_minimum_number)
4743 
/* Vector Widening Floating-Point Add Instructions */
/*
 * Widening accumulate step: promote the f16 source element to f32
 * (ieee half format) and add it into the f32 accumulator.
 */
static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
{
    return float32_add(a, float16_to_float32(b, true, s), s);
}
4749 
/*
 * Widening accumulate step: promote the f32 source element to f64 and
 * add it into the f64 accumulator.
 */
static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
{
    return float64_add(a, float32_to_float64(b, s), s);
}
4754 
/* Vector Widening Floating-Point Reduction Instructions */
/* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4761 
4762 /*
4763  * Vector Mask Operations
4764  */
4765 /* Vector Mask-Register Logical Instructions */
4766 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4767 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4768                   void *vs2, CPURISCVState *env,          \
4769                   uint32_t desc)                          \
4770 {                                                         \
4771     uint32_t vl = env->vl;                                \
4772     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4773     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4774     uint32_t i;                                           \
4775     int a, b;                                             \
4776                                                           \
4777     for (i = env->vstart; i < vl; i++) {                  \
4778         a = vext_elem_mask(vs1, i);                       \
4779         b = vext_elem_mask(vs2, i);                       \
4780         vext_set_elem_mask(vd, i, OP(b, a));              \
4781     }                                                     \
4782     env->vstart = 0;                                      \
4783     /*
4784      * mask destination register are always tail-agnostic
4785      * set tail elements to 1s
4786      */                                                   \
4787     if (vta_all_1s) {                                     \
4788         for (; i < total_elems; i++) {                    \
4789             vext_set_elem_mask(vd, i, 1);                 \
4790         }                                                 \
4791     }                                                     \
4792 }
4793 
/*
 * The operands are mask bits (0 or 1, as produced by vext_elem_mask),
 * so logical NOT (!) serves as a one-bit bitwise NOT in these ops.
 */
#define DO_NAND(N, M)  (!(N & M))
#define DO_ANDNOT(N, M)  (N & !M)
#define DO_NOR(N, M)  (!(N | M))
#define DO_ORNOT(N, M)  (N | !M)
#define DO_XNOR(N, M)  (!(N ^ M))

GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4808 
4809 /* Vector count population in mask vcpop */
4810 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4811                              uint32_t desc)
4812 {
4813     target_ulong cnt = 0;
4814     uint32_t vm = vext_vm(desc);
4815     uint32_t vl = env->vl;
4816     int i;
4817 
4818     for (i = env->vstart; i < vl; i++) {
4819         if (vm || vext_elem_mask(v0, i)) {
4820             if (vext_elem_mask(vs2, i)) {
4821                 cnt++;
4822             }
4823         }
4824     }
4825     env->vstart = 0;
4826     return cnt;
4827 }
4828 
4829 /* vfirst find-first-set mask bit */
4830 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4831                               uint32_t desc)
4832 {
4833     uint32_t vm = vext_vm(desc);
4834     uint32_t vl = env->vl;
4835     int i;
4836 
4837     for (i = env->vstart; i < vl; i++) {
4838         if (vm || vext_elem_mask(v0, i)) {
4839             if (vext_elem_mask(vs2, i)) {
4840                 return i;
4841             }
4842         }
4843     }
4844     env->vstart = 0;
4845     return -1LL;
4846 }
4847 
/* Variants of the set-mask operation implemented by vmsetm() below. */
enum set_mask_type {
    ONLY_FIRST = 1,     /* vmsof: only the first set bit remains set */
    INCLUDE_FIRST,      /* vmsif: set up to and including the first bit */
    BEFORE_FIRST,       /* vmsbf: set strictly before the first bit */
};
4853 
4854 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4855                    uint32_t desc, enum set_mask_type type)
4856 {
4857     uint32_t vm = vext_vm(desc);
4858     uint32_t vl = env->vl;
4859     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4860     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4861     uint32_t vma = vext_vma(desc);
4862     int i;
4863     bool first_mask_bit = false;
4864 
4865     for (i = env->vstart; i < vl; i++) {
4866         if (!vm && !vext_elem_mask(v0, i)) {
4867             /* set masked-off elements to 1s */
4868             if (vma) {
4869                 vext_set_elem_mask(vd, i, 1);
4870             }
4871             continue;
4872         }
4873         /* write a zero to all following active elements */
4874         if (first_mask_bit) {
4875             vext_set_elem_mask(vd, i, 0);
4876             continue;
4877         }
4878         if (vext_elem_mask(vs2, i)) {
4879             first_mask_bit = true;
4880             if (type == BEFORE_FIRST) {
4881                 vext_set_elem_mask(vd, i, 0);
4882             } else {
4883                 vext_set_elem_mask(vd, i, 1);
4884             }
4885         } else {
4886             if (type == ONLY_FIRST) {
4887                 vext_set_elem_mask(vd, i, 0);
4888             } else {
4889                 vext_set_elem_mask(vd, i, 1);
4890             }
4891         }
4892     }
4893     env->vstart = 0;
4894     /*
4895      * mask destination register are always tail-agnostic
4896      * set tail elements to 1s
4897      */
4898     if (vta_all_1s) {
4899         for (; i < total_elems; i++) {
4900             vext_set_elem_mask(vd, i, 1);
4901         }
4902     }
4903 }
4904 
/* vmsbf.m: set-before-first -- 1s strictly before the first set bit of vs2. */
void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
}
4910 
/* vmsif.m: set-including-first -- 1s up to and including the first set bit. */
void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
}
4916 
/* vmsof.m: set-only-first -- only the first set bit of vs2 remains set. */
void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
}
4922 
/* Vector Iota Instruction */
/*
 * viota.m: each active destination element receives the count of set
 * mask bits in vs2 at indices below it (an exclusive running prefix
 * count).  Masked-off elements follow the mask-agnostic policy; the
 * tail follows the tail-agnostic policy.
 */
#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t sum = 0;                                                     \
    int i;                                                                \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = sum;                                      \
        if (vext_elem_mask(vs2, i)) {                                     \
            sum++;                                                        \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4957 
/* Vector Element Index Instruction */
/*
 * vid.v: writes its own index i into each active destination element.
 * The instruction has no vector source, hence no vs2 parameter.
 */
#define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    int i;                                                                \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = i;                                        \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4987 
4988 /*
4989  * Vector Permutation Instructions
4990  */
4991 
4992 /* Vector Slide Instructions */
/*
 * vslideup generator: vd[i] = vs2[i - offset] for the active elements
 * with i >= max(vstart, offset); elements below the offset are left
 * untouched.
 * Fix: clear env->vstart on completion, as every other helper in this
 * file does (see the slidedown/slide1up/slide1down/viota/vid helpers)
 * and as the RVV spec requires of instructions that complete without
 * trapping.  Previously a nonzero vstart would leak into the next
 * instruction.
 */
#define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    target_ulong offset = s1, i_min, i;                                   \
                                                                          \
    i_min = MAX(env->vstart, offset);                                     \
    for (i = i_min; i < vl; i++) {                                        \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
5017 
/* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5023 
/*
 * vslidedown generator: vd[i] = vs2[i + s1].  i_max clamps the copy
 * loop so vs2 is never indexed at or beyond vlmax; the second loop
 * zeroes the active elements whose source would be out of range.
 */
#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    target_ulong i_max, i;                                                \
                                                                          \
    i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
    for (i = env->vstart; i < i_max; ++i) {                               \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
    }                                                                     \
                                                                          \
    for (i = i_max; i < vl; ++i) {                                        \
        if (vm || vext_elem_mask(v0, i)) {                                \
            *((ETYPE *)vd + H(i)) = 0;                                    \
        }                                                                 \
    }                                                                     \
                                                                          \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

/* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5063 
5064 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5065 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5066                                  void *vs2, CPURISCVState *env,             \
5067                                  uint32_t desc)                             \
5068 {                                                                           \
5069     typedef uint##BITWIDTH##_t ETYPE;                                       \
5070     uint32_t vm = vext_vm(desc);                                            \
5071     uint32_t vl = env->vl;                                                  \
5072     uint32_t esz = sizeof(ETYPE);                                           \
5073     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5074     uint32_t vta = vext_vta(desc);                                          \
5075     uint32_t vma = vext_vma(desc);                                          \
5076     uint32_t i;                                                             \
5077                                                                             \
5078     for (i = env->vstart; i < vl; i++) {                                    \
5079         if (!vm && !vext_elem_mask(v0, i)) {                                \
5080             /* set masked-off elements to 1s */                             \
5081             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5082             continue;                                                       \
5083         }                                                                   \
5084         if (i == 0) {                                                       \
5085             *((ETYPE *)vd + H(i)) = s1;                                     \
5086         } else {                                                            \
5087             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5088         }                                                                   \
5089     }                                                                       \
5090     env->vstart = 0;                                                        \
5091     /* set tail elements to 1s */                                           \
5092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5093 }
5094 
5095 GEN_VEXT_VSLIE1UP(8,  H1)
5096 GEN_VEXT_VSLIE1UP(16, H2)
5097 GEN_VEXT_VSLIE1UP(32, H4)
5098 GEN_VEXT_VSLIE1UP(64, H8)
5099 
/*
 * Public helper wrapper: forwards the target_ulong scalar to the
 * width-specific vslide1up_<N>() implementation above.
 */
#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
}

/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5112 
/*
 * vslide1down core: vd[vl - 1] = s1 (the scalar operand),
 * vd[i] = vs2[i + 1] for the other active elements.
 */
#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                                   void *vs2, CPURISCVState *env,             \
                                   uint32_t desc)                             \
{                                                                             \
    typedef uint##BITWIDTH##_t ETYPE;                                         \
    uint32_t vm = vext_vm(desc);                                              \
    uint32_t vl = env->vl;                                                    \
    uint32_t esz = sizeof(ETYPE);                                             \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
    uint32_t vta = vext_vta(desc);                                            \
    uint32_t vma = vext_vma(desc);                                            \
    uint32_t i;                                                               \
                                                                              \
    for (i = env->vstart; i < vl; i++) {                                      \
        if (!vm && !vext_elem_mask(v0, i)) {                                  \
            /* set masked-off elements to 1s */                               \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
            continue;                                                         \
        }                                                                     \
        if (i == vl - 1) {                                                    \
            *((ETYPE *)vd + H(i)) = s1;                                       \
        } else {                                                              \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
        }                                                                     \
    }                                                                         \
    env->vstart = 0;                                                          \
    /* set tail elements to 1s */                                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
}

GEN_VEXT_VSLIDE1DOWN(8,  H1)
GEN_VEXT_VSLIDE1DOWN(16, H2)
GEN_VEXT_VSLIDE1DOWN(32, H4)
GEN_VEXT_VSLIDE1DOWN(64, H8)
5148 
/*
 * Emit the helper entry point for vslide1down.vx: the scalar x[rs1] in s1
 * is forwarded unmodified to the shared vslide1down_<BITWIDTH>() worker.
 */
#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
}

/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5161 
/* Vector Floating-Point Slide Instructions */
/*
 * vfslide1up.vf reuses the integer vslide1up_<BITWIDTH>() worker: s1
 * carries the FP scalar's raw bits in a uint64_t, which the worker
 * copies into vd[0] unmodified.
 */
#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
}

/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5174 
/*
 * vfslide1down.vf reuses the integer vslide1down_<BITWIDTH>() worker: s1
 * carries the FP scalar's raw bits in a uint64_t, which the worker copies
 * into the last body element unmodified.
 */
#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
}

/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5186 
5187 /* Vector Register Gather Instruction */
5188 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5189 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5190                   CPURISCVState *env, uint32_t desc)                      \
5191 {                                                                         \
5192     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5193     uint32_t vm = vext_vm(desc);                                          \
5194     uint32_t vl = env->vl;                                                \
5195     uint32_t esz = sizeof(TS2);                                           \
5196     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5197     uint32_t vta = vext_vta(desc);                                        \
5198     uint32_t vma = vext_vma(desc);                                        \
5199     uint64_t index;                                                       \
5200     uint32_t i;                                                           \
5201                                                                           \
5202     for (i = env->vstart; i < vl; i++) {                                  \
5203         if (!vm && !vext_elem_mask(v0, i)) {                              \
5204             /* set masked-off elements to 1s */                           \
5205             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5206             continue;                                                     \
5207         }                                                                 \
5208         index = *((TS1 *)vs1 + HS1(i));                                   \
5209         if (index >= vlmax) {                                             \
5210             *((TS2 *)vd + HS2(i)) = 0;                                    \
5211         } else {                                                          \
5212             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5213         }                                                                 \
5214     }                                                                     \
5215     env->vstart = 0;                                                      \
5216     /* set tail elements to 1s */                                         \
5217     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5218 }
5219 
5220 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5221 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5222 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5223 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5224 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5225 
5226 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5227 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5228 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5229 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5230 
5231 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5232 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5233                   CPURISCVState *env, uint32_t desc)                      \
5234 {                                                                         \
5235     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5236     uint32_t vm = vext_vm(desc);                                          \
5237     uint32_t vl = env->vl;                                                \
5238     uint32_t esz = sizeof(ETYPE);                                         \
5239     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5240     uint32_t vta = vext_vta(desc);                                        \
5241     uint32_t vma = vext_vma(desc);                                        \
5242     uint64_t index = s1;                                                  \
5243     uint32_t i;                                                           \
5244                                                                           \
5245     for (i = env->vstart; i < vl; i++) {                                  \
5246         if (!vm && !vext_elem_mask(v0, i)) {                              \
5247             /* set masked-off elements to 1s */                           \
5248             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5249             continue;                                                     \
5250         }                                                                 \
5251         if (index >= vlmax) {                                             \
5252             *((ETYPE *)vd + H(i)) = 0;                                    \
5253         } else {                                                          \
5254             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5255         }                                                                 \
5256     }                                                                     \
5257     env->vstart = 0;                                                      \
5258     /* set tail elements to 1s */                                         \
5259     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5260 }
5261 
5262 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5263 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5264 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5265 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5266 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5267 
5268 /* Vector Compress Instruction */
5269 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5270 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5271                   CPURISCVState *env, uint32_t desc)                      \
5272 {                                                                         \
5273     uint32_t vl = env->vl;                                                \
5274     uint32_t esz = sizeof(ETYPE);                                         \
5275     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5276     uint32_t vta = vext_vta(desc);                                        \
5277     uint32_t num = 0, i;                                                  \
5278                                                                           \
5279     for (i = env->vstart; i < vl; i++) {                                  \
5280         if (!vext_elem_mask(vs1, i)) {                                    \
5281             continue;                                                     \
5282         }                                                                 \
5283         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5284         num++;                                                            \
5285     }                                                                     \
5286     env->vstart = 0;                                                      \
5287     /* set tail elements to 1s */                                         \
5288     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5289 }
5290 
5291 /* Compress into vd elements of vs2 where vs1 is enabled */
5292 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5293 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5294 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5295 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5296 
5297 /* Vector Whole Register Move */
5298 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5299 {
5300     /* EEW = SEW */
5301     uint32_t maxsz = simd_maxsz(desc);
5302     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5303     uint32_t startb = env->vstart * sewb;
5304     uint32_t i = startb;
5305 
5306     memcpy((uint8_t *)vd + H1(i),
5307            (uint8_t *)vs2 + H1(i),
5308            maxsz - startb);
5309 
5310     env->vstart = 0;
5311 }
5312 
5313 /* Vector Integer Extension */
5314 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5315 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5316                   CPURISCVState *env, uint32_t desc)             \
5317 {                                                                \
5318     uint32_t vl = env->vl;                                       \
5319     uint32_t vm = vext_vm(desc);                                 \
5320     uint32_t esz = sizeof(ETYPE);                                \
5321     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5322     uint32_t vta = vext_vta(desc);                               \
5323     uint32_t vma = vext_vma(desc);                               \
5324     uint32_t i;                                                  \
5325                                                                  \
5326     for (i = env->vstart; i < vl; i++) {                         \
5327         if (!vm && !vext_elem_mask(v0, i)) {                     \
5328             /* set masked-off elements to 1s */                  \
5329             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5330             continue;                                            \
5331         }                                                        \
5332         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5333     }                                                            \
5334     env->vstart = 0;                                             \
5335     /* set tail elements to 1s */                                \
5336     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5337 }
5338 
5339 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5340 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5341 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5342 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5343 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5344 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5345 
5346 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5347 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5348 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5349 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5350 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5351 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5352