xref: /openbmc/qemu/target/riscv/vector_helper.c (revision c45eff30)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
54         /* only set vill bit. */
55         env->vill = 1;
56         env->vtype = 0;
57         env->vl = 0;
58         env->vstart = 0;
59         return 0;
60     }
61 
62     vlmax = vext_get_vlmax(cpu, s2);
63     if (s1 <= vlmax) {
64         vl = s1;
65     } else {
66         vl = vlmax;
67     }
68     env->vl = vl;
69     env->vtype = s2;
70     env->vstart = 0;
71     env->vill = 0;
72     return vl;
73 }
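/*
 * Worked example (assuming VLEN = 128): for a vtype with SEW = 32
 * (vsew = 010) and LMUL = 2 (vlmul = 001), vlmax = LMUL * VLEN / SEW =
 * 2 * 128 / 32 = 8.  A requested AVL of s1 = 10 is therefore clamped to
 * vl = 8, while s1 = 5 gives vl = 5; vtype is latched into env->vtype
 * either way.
 */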
74 
75 /*
76  * Note that vector data is stored in host-endian 64-bit chunks,
77  * so addressing units smaller than that needs a host-endian fixup.
78  */
79 #if HOST_BIG_ENDIAN
80 #define H1(x)   ((x) ^ 7)
81 #define H1_2(x) ((x) ^ 6)
82 #define H1_4(x) ((x) ^ 4)
83 #define H2(x)   ((x) ^ 3)
84 #define H4(x)   ((x) ^ 1)
85 #define H8(x)   ((x))
86 #else
87 #define H1(x)   (x)
88 #define H1_2(x) (x)
89 #define H1_4(x) (x)
90 #define H2(x)   (x)
91 #define H4(x)   (x)
92 #define H8(x)   (x)
93 #endif
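/*
 * For example, on a big-endian host the byte element with logical index 0
 * lives in the most-significant byte of the first 64-bit chunk, so
 * H1(0) = 0 ^ 7 = 7 gives the host byte offset used to access it; on a
 * little-endian host H1(0) = 0 and no fixup is needed.
 */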
94 
95 static inline uint32_t vext_nf(uint32_t desc)
96 {
97     return FIELD_EX32(simd_data(desc), VDATA, NF);
98 }
99 
100 static inline uint32_t vext_vm(uint32_t desc)
101 {
102     return FIELD_EX32(simd_data(desc), VDATA, VM);
103 }
104 
105 /*
106  * Encode LMUL to lmul as following:
107  *     LMUL    vlmul    lmul
108  *      1       000       0
109  *      2       001       1
110  *      4       010       2
111  *      8       011       3
112  *      -       100       -
113  *     1/8      101      -3
114  *     1/4      110      -2
115  *     1/2      111      -1
116  */
117 static inline int32_t vext_lmul(uint32_t desc)
118 {
119     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
120 }
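/*
 * For example, vlmul = 0b111 (LMUL = 1/2) sign-extends to -1, so a shift
 * by vext_lmul(desc) in the callers below halves the element count, while
 * vlmul = 0b011 (LMUL = 8) gives +3 and multiplies it by eight.
 */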
121 
122 static inline uint32_t vext_vta(uint32_t desc)
123 {
124     return FIELD_EX32(simd_data(desc), VDATA, VTA);
125 }
126 
127 static inline uint32_t vext_vma(uint32_t desc)
128 {
129     return FIELD_EX32(simd_data(desc), VDATA, VMA);
130 }
131 
132 static inline uint32_t vext_vta_all_1s(uint32_t desc)
133 {
134     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
135 }
136 
137 /*
138  * Get the maximum number of elements that can be operated on.
139  *
140  * log2_esz: log2 of element size in bytes.
141  */
142 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
143 {
144     /*
145      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
146      * so vlen in bytes (vlenb) is encoded as maxsz.
147      */
148     uint32_t vlenb = simd_maxsz(desc);
149 
150     /* Return VLMAX */
151     int scale = vext_lmul(desc) - log2_esz;
152     return scale < 0 ? vlenb >> -scale : vlenb << scale;
153 }
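/*
 * For example, with vlenb = 16 (VLEN = 128), LMUL = 8 (lmul = 3) and
 * 16-bit elements (log2_esz = 1), scale = 3 - 1 = 2 and
 * VLMAX = 16 << 2 = 64, i.e. 8 registers * 128 bits / 16 bits per element.
 */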
154 
155 /*
156  * Get the total number of elements, including prestart, body and tail elements.
157  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
158  * are held in the same vector register.
159  */
160 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
161                                             uint32_t esz)
162 {
163     uint32_t vlenb = simd_maxsz(desc);
164     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
165     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
166                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
167     return (vlenb << emul) / esz;
168 }
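/*
 * For example, with vlenb = 16, vtype SEW = 16 (sew = 2 bytes), LMUL = 2
 * (lmul = 1) and an effective element size esz = 4 (EEW = 32),
 * emul = ctzl(4) - ctzl(2) + 1 = 2, so the operation spans EMUL = 4
 * registers and total_elems = (16 << 2) / 4 = 16.
 */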
169 
170 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
171 {
172     return (addr & env->cur_pmmask) | env->cur_pmbase;
173 }
174 
175 /*
176  * This function checks the watchpoint before the real load operation.
177  *
178  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
179  * In user mode, there is no watchpoint support now.
180  *
181  * It will trigger an exception if there is no mapping in the TLB
182  * and the page table walk can't fill the TLB entry. The guest software
183  * can then return here after processing the exception, or never return.
184  */
185 static void probe_pages(CPURISCVState *env, target_ulong addr,
186                         target_ulong len, uintptr_t ra,
187                         MMUAccessType access_type)
188 {
189     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
190     target_ulong curlen = MIN(pagelen, len);
191 
192     probe_access(env, adjust_addr(env, addr), curlen, access_type,
193                  cpu_mmu_index(env, false), ra);
194     if (len > curlen) {
195         addr += curlen;
196         curlen = len - curlen;
197         probe_access(env, adjust_addr(env, addr), curlen, access_type,
198                      cpu_mmu_index(env, false), ra);
199     }
200 }
201 
202 /* set agnostic elements to 1s */
203 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
204                               uint32_t tot)
205 {
206     if (is_agnostic == 0) {
207         /* policy undisturbed */
208         return;
209     }
210     if (tot - cnt == 0) {
211         return;
212     }
213     memset(base + cnt, -1, tot - cnt);
214 }
215 
216 static inline void vext_set_elem_mask(void *v0, int index,
217                                       uint8_t value)
218 {
219     int idx = index / 64;
220     int pos = index % 64;
221     uint64_t old = ((uint64_t *)v0)[idx];
222     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
223 }
224 
225 /*
226  * Earlier designs (pre-0.9) had a varying number of bits
227  * per mask value (MLEN). In the 0.9 design, MLEN=1.
228  * (Section 4.5)
229  */
230 static inline int vext_elem_mask(void *v0, int index)
231 {
232     int idx = index / 64;
233     int pos = index % 64;
234     return (((uint64_t *)v0)[idx] >> pos) & 1;
235 }
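/*
 * With MLEN = 1 the mask for element i is simply bit (i % 64) of the
 * (i / 64)-th 64-bit word of v0; e.g. element 70 is bit 6 of word 1.
 */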
236 
237 /* elements operations for load and store */
238 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
239                                uint32_t idx, void *vd, uintptr_t retaddr);
240 
241 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
242 static void NAME(CPURISCVState *env, abi_ptr addr,         \
243                  uint32_t idx, void *vd, uintptr_t retaddr)\
244 {                                                          \
245     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
246     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
247 }                                                          \
248 
249 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
250 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
251 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
252 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
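/*
 * As an illustration, GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl) roughly
 * expands to:
 *
 *   static void lde_w(CPURISCVState *env, abi_ptr addr,
 *                     uint32_t idx, void *vd, uintptr_t retaddr)
 *   {
 *       int32_t *cur = ((int32_t *)vd + H4(idx));
 *       *cur = cpu_ldl_data_ra(env, addr, retaddr);
 *   }
 */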
253 
254 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
255 static void NAME(CPURISCVState *env, abi_ptr addr,         \
256                  uint32_t idx, void *vd, uintptr_t retaddr)\
257 {                                                          \
258     ETYPE data = *((ETYPE *)vd + H(idx));                  \
259     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
260 }
261 
262 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
263 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
264 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
265 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
266 
267 static void vext_set_tail_elems_1s(CPURISCVState *env, target_ulong vl,
268                                    void *vd, uint32_t desc, uint32_t nf,
269                                    uint32_t esz, uint32_t max_elems)
270 {
271     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
272     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
273     uint32_t vta = vext_vta(desc);
274     uint32_t registers_used;
275     int k;
276 
277     for (k = 0; k < nf; ++k) {
278         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
279                           (k * max_elems + max_elems) * esz);
280     }
281 
282     if (nf * max_elems % total_elems != 0) {
283         registers_used = ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
284         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
285                           registers_used * vlenb);
286     }
287 }
288 
289 /*
290  *** stride: access vector elements from strided memory
291  */
292 static void
293 vext_ldst_stride(void *vd, void *v0, target_ulong base,
294                  target_ulong stride, CPURISCVState *env,
295                  uint32_t desc, uint32_t vm,
296                  vext_ldst_elem_fn *ldst_elem,
297                  uint32_t log2_esz, uintptr_t ra)
298 {
299     uint32_t i, k;
300     uint32_t nf = vext_nf(desc);
301     uint32_t max_elems = vext_max_elems(desc, log2_esz);
302     uint32_t esz = 1 << log2_esz;
303     uint32_t vma = vext_vma(desc);
304 
305     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
306         k = 0;
307         while (k < nf) {
308             if (!vm && !vext_elem_mask(v0, i)) {
309                 /* set masked-off elements to 1s */
310                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
311                                   (i + k * max_elems + 1) * esz);
312                 k++;
313                 continue;
314             }
315             target_ulong addr = base + stride * i + (k << log2_esz);
316             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
317             k++;
318         }
319     }
320     env->vstart = 0;
321 
322     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
323 }
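/*
 * For example, a two-field strided segment load (nf = 2) of 32-bit elements
 * with stride = 64 reads field k of element i from base + 64 * i + 4 * k,
 * i.e. the stride separates segments while the fields of one segment stay
 * contiguous in memory.
 */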
324 
325 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
326 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
327                   target_ulong stride, CPURISCVState *env,              \
328                   uint32_t desc)                                        \
329 {                                                                       \
330     uint32_t vm = vext_vm(desc);                                        \
331     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
332                      ctzl(sizeof(ETYPE)), GETPC());                     \
333 }
334 
335 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
336 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
337 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
338 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
339 
340 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
341 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
342                   target_ulong stride, CPURISCVState *env,              \
343                   uint32_t desc)                                        \
344 {                                                                       \
345     uint32_t vm = vext_vm(desc);                                        \
346     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
347                      ctzl(sizeof(ETYPE)), GETPC());                     \
348 }
349 
350 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
351 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
352 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
353 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
354 
355 /*
356  *** unit-stride: access elements stored contiguously in memory
357  */
358 
359 /* unmasked unit-stride load and store operation */
360 static void
361 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
362              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
363              uintptr_t ra)
364 {
365     uint32_t i, k;
366     uint32_t nf = vext_nf(desc);
367     uint32_t max_elems = vext_max_elems(desc, log2_esz);
368     uint32_t esz = 1 << log2_esz;
369 
370     /* load bytes from guest memory */
371     for (i = env->vstart; i < evl; i++, env->vstart++) {
372         k = 0;
373         while (k < nf) {
374             target_ulong addr = base + ((i * nf + k) << log2_esz);
375             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
376             k++;
377         }
378     }
379     env->vstart = 0;
380 
381     vext_set_tail_elems_1s(env, evl, vd, desc, nf, esz, max_elems);
382 }
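/*
 * For example, a three-field segment load (nf = 3) of 16-bit elements
 * fetches field k of element i from base + (3 * i + k) * 2, so the fields
 * of one segment are adjacent in memory while field k of consecutive
 * segments lands in register group k.
 */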
383 
384 /*
385  * A masked unit-stride load or store operation is a special case of the
386  * strided operation, with stride = NF * sizeof(ETYPE)
387  */
388 
389 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
390 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
391                          CPURISCVState *env, uint32_t desc)             \
392 {                                                                       \
393     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
394     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
395                      ctzl(sizeof(ETYPE)), GETPC());                     \
396 }                                                                       \
397                                                                         \
398 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
399                   CPURISCVState *env, uint32_t desc)                    \
400 {                                                                       \
401     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
402                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
403 }
404 
405 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
406 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
407 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
408 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
409 
410 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
411 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
412                          CPURISCVState *env, uint32_t desc)              \
413 {                                                                        \
414     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
415     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
416                      ctzl(sizeof(ETYPE)), GETPC());                      \
417 }                                                                        \
418                                                                          \
419 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
420                   CPURISCVState *env, uint32_t desc)                     \
421 {                                                                        \
422     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
423                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
424 }
425 
426 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
427 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
428 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
429 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
430 
431 /*
432  *** unit-stride mask load and store, EEW = 1
433  */
434 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
435                     CPURISCVState *env, uint32_t desc)
436 {
437     /* evl = ceil(vl/8) */
438     uint8_t evl = (env->vl + 7) >> 3;
439     vext_ldst_us(vd, base, env, desc, lde_b,
440                  0, evl, GETPC());
441 }
442 
443 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
444                     CPURISCVState *env, uint32_t desc)
445 {
446     /* evl = ceil(vl/8) */
447     uint8_t evl = (env->vl + 7) >> 3;
448     vext_ldst_us(vd, base, env, desc, ste_b,
449                  0, evl, GETPC());
450 }
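/*
 * For example, with vl = 17 the mask occupies ceil(17 / 8) = 3 bytes, so
 * vlm.v/vsm.v transfer exactly 3 bytes regardless of SEW or LMUL.
 */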
451 
452 /*
453  *** index: access vector elements from indexed memory
454  */
455 typedef target_ulong vext_get_index_addr(target_ulong base,
456         uint32_t idx, void *vs2);
457 
458 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
459 static target_ulong NAME(target_ulong base,            \
460                          uint32_t idx, void *vs2)      \
461 {                                                      \
462     return (base + *((ETYPE *)vs2 + H(idx)));          \
463 }
464 
465 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
466 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
467 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
468 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
469 
470 static inline void
471 vext_ldst_index(void *vd, void *v0, target_ulong base,
472                 void *vs2, CPURISCVState *env, uint32_t desc,
473                 vext_get_index_addr get_index_addr,
474                 vext_ldst_elem_fn *ldst_elem,
475                 uint32_t log2_esz, uintptr_t ra)
476 {
477     uint32_t i, k;
478     uint32_t nf = vext_nf(desc);
479     uint32_t vm = vext_vm(desc);
480     uint32_t max_elems = vext_max_elems(desc, log2_esz);
481     uint32_t esz = 1 << log2_esz;
482     uint32_t vma = vext_vma(desc);
483 
484     /* load bytes from guest memory */
485     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
486         k = 0;
487         while (k < nf) {
488             if (!vm && !vext_elem_mask(v0, i)) {
489                 /* set masked-off elements to 1s */
490                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
491                                   (i + k * max_elems + 1) * esz);
492                 k++;
493                 continue;
494             }
495             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
496             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
497             k++;
498         }
499     }
500     env->vstart = 0;
501 
502     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
503 }
504 
505 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
506 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
507                   void *vs2, CPURISCVState *env, uint32_t desc)            \
508 {                                                                          \
509     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
510                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
511 }
512 
513 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
514 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
515 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
516 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
517 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
518 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
519 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
520 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
521 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
522 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
523 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
524 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
525 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
526 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
527 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
528 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
529 
530 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
531 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
532                   void *vs2, CPURISCVState *env, uint32_t desc)  \
533 {                                                                \
534     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
535                     STORE_FN, ctzl(sizeof(ETYPE)),               \
536                     GETPC());                                    \
537 }
538 
539 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
540 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
541 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
542 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
543 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
544 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
545 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
546 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
547 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
548 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
549 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
550 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
551 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
552 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
553 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
554 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
555 
556 /*
557  *** unit-stride fault-only-first load instructions
558  */
559 static inline void
560 vext_ldff(void *vd, void *v0, target_ulong base,
561           CPURISCVState *env, uint32_t desc,
562           vext_ldst_elem_fn *ldst_elem,
563           uint32_t log2_esz, uintptr_t ra)
564 {
565     void *host;
566     uint32_t i, k, vl = 0;
567     uint32_t nf = vext_nf(desc);
568     uint32_t vm = vext_vm(desc);
569     uint32_t max_elems = vext_max_elems(desc, log2_esz);
570     uint32_t esz = 1 << log2_esz;
571     uint32_t vma = vext_vma(desc);
572     target_ulong addr, offset, remain;
573 
574     /* probe every access */
575     for (i = env->vstart; i < env->vl; i++) {
576         if (!vm && !vext_elem_mask(v0, i)) {
577             continue;
578         }
579         addr = adjust_addr(env, base + i * (nf << log2_esz));
580         if (i == 0) {
581             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
582         } else {
583             /* if it triggers an exception, no need to check watchpoint */
584             remain = nf << log2_esz;
585             while (remain > 0) {
586                 offset = -(addr | TARGET_PAGE_MASK);
587                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
588                                          cpu_mmu_index(env, false));
589                 if (host) {
590 #ifdef CONFIG_USER_ONLY
591                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
592                         vl = i;
593                         goto ProbeSuccess;
594                     }
595 #else
596                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
597 #endif
598                 } else {
599                     vl = i;
600                     goto ProbeSuccess;
601                 }
602                 if (remain <= offset) {
603                     break;
604                 }
605                 remain -= offset;
606                 addr = adjust_addr(env, addr + offset);
607             }
608         }
609     }
610 ProbeSuccess:
611     /* load bytes from guest memory */
612     if (vl != 0) {
613         env->vl = vl;
614     }
615     for (i = env->vstart; i < env->vl; i++) {
616         k = 0;
617         while (k < nf) {
618             if (!vm && !vext_elem_mask(v0, i)) {
619                 /* set masked-off elements to 1s */
620                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
621                                   (i + k * max_elems + 1) * esz);
622                 k++;
623                 continue;
624             }
625             target_ulong addr = base + ((i * nf + k) << log2_esz);
626             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
627             k++;
628         }
629     }
630     env->vstart = 0;
631 
632     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
633 }
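/*
 * Fault-only-first semantics in the helper above: a fault on element 0
 * traps as usual, while a fault on any later element only trims vl down to
 * the index of the first element that cannot be accessed; e.g. if the page
 * containing element 5 is unmapped, vl becomes 5 and elements 0..4 are
 * still loaded.
 */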
634 
635 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
636 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
637                   CPURISCVState *env, uint32_t desc)      \
638 {                                                         \
639     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
640               ctzl(sizeof(ETYPE)), GETPC());              \
641 }
642 
643 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
644 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
645 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
646 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
647 
648 #define DO_SWAP(N, M) (M)
649 #define DO_AND(N, M)  (N & M)
650 #define DO_XOR(N, M)  (N ^ M)
651 #define DO_OR(N, M)   (N | M)
652 #define DO_ADD(N, M)  (N + M)
653 
654 /* Signed min/max */
655 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
656 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
657 
658 /* Unsigned min/max */
659 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
660 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
661 
662 /*
663  *** load and store whole register instructions
664  */
665 static void
666 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
667                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
668 {
669     uint32_t i, k, off, pos;
670     uint32_t nf = vext_nf(desc);
671     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
672     uint32_t max_elems = vlenb >> log2_esz;
673 
674     k = env->vstart / max_elems;
675     off = env->vstart % max_elems;
676 
677     if (off) {
678         /* load/store rest of elements of current segment pointed by vstart */
679         for (pos = off; pos < max_elems; pos++, env->vstart++) {
680             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
681             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
682         }
683         k++;
684     }
685 
686     /* load/store elements for rest of segments */
687     for (; k < nf; k++) {
688         for (i = 0; i < max_elems; i++, env->vstart++) {
689             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
690             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
691         }
692     }
693 
694     env->vstart = 0;
695 }
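/*
 * For example, vl4re32.v on a VLEN = 128 implementation has nf = 4 and
 * max_elems = 16 >> 2 = 4, so it loads 4 registers * 4 elements = 16
 * contiguous 32-bit values starting at base, independent of vl and vtype.
 */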
696 
697 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
698 void HELPER(NAME)(void *vd, target_ulong base,       \
699                   CPURISCVState *env, uint32_t desc) \
700 {                                                    \
701     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
702                     ctzl(sizeof(ETYPE)), GETPC());   \
703 }
704 
705 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
706 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
707 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
708 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
709 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
710 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
711 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
712 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
713 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
714 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
715 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
716 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
717 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
718 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
719 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
720 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
721 
722 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
723 void HELPER(NAME)(void *vd, target_ulong base,       \
724                   CPURISCVState *env, uint32_t desc) \
725 {                                                    \
726     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
727                     ctzl(sizeof(ETYPE)), GETPC());   \
728 }
729 
730 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
731 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
732 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
733 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
734 
735 /*
736  *** Vector Integer Arithmetic Instructions
737  */
738 
739 /* expand macro args before macro */
740 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
741 
742 /* (TD, T1, T2, TX1, TX2) */
743 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
744 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
745 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
746 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
747 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
748 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
749 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
750 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
751 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
752 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
753 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
754 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
755 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
756 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
757 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
758 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
759 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
760 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
761 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
762 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
763 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
764 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
765 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
766 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
767 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
768 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
769 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
770 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
771 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
772 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
773 
774 /* operation of two vector elements */
775 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
776 
777 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
778 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
779 {                                                               \
780     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
781     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
782     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
783 }
784 #define DO_SUB(N, M) (N - M)
785 #define DO_RSUB(N, M) (M - N)
786 
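/*
 * As an illustration, RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 * roughly expands to:
 *
 *   static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *   {
 *       int8_t s1 = *((int8_t *)vs1 + H1(i));
 *       int8_t s2 = *((int8_t *)vs2 + H1(i));
 *       *((int8_t *)vd + H1(i)) = s2 + s1;
 *   }
 */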
787 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
788 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
789 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
790 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
791 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
792 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
793 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
794 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
795 
796 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
797                        CPURISCVState *env, uint32_t desc,
798                        opivv2_fn *fn, uint32_t esz)
799 {
800     uint32_t vm = vext_vm(desc);
801     uint32_t vl = env->vl;
802     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
803     uint32_t vta = vext_vta(desc);
804     uint32_t vma = vext_vma(desc);
805     uint32_t i;
806 
807     for (i = env->vstart; i < vl; i++) {
808         if (!vm && !vext_elem_mask(v0, i)) {
809             /* set masked-off elements to 1s */
810             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
811             continue;
812         }
813         fn(vd, vs1, vs2, i);
814     }
815     env->vstart = 0;
816     /* set tail elements to 1s */
817     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
818 }
819 
820 /* generate the helpers for OPIVV */
821 #define GEN_VEXT_VV(NAME, ESZ)                            \
822 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
823                   void *vs2, CPURISCVState *env,          \
824                   uint32_t desc)                          \
825 {                                                         \
826     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
827                do_##NAME, ESZ);                           \
828 }
829 
830 GEN_VEXT_VV(vadd_vv_b, 1)
831 GEN_VEXT_VV(vadd_vv_h, 2)
832 GEN_VEXT_VV(vadd_vv_w, 4)
833 GEN_VEXT_VV(vadd_vv_d, 8)
834 GEN_VEXT_VV(vsub_vv_b, 1)
835 GEN_VEXT_VV(vsub_vv_h, 2)
836 GEN_VEXT_VV(vsub_vv_w, 4)
837 GEN_VEXT_VV(vsub_vv_d, 8)
838 
839 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
840 
841 /*
842  * (T1)s1 gives the real operand type.
843  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
844  */
845 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
846 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
847 {                                                                   \
848     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
849     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
850 }
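/*
 * For example, for vwadd_vx_b (WOP_SSS_B) T1 is int8_t and TX1 is int16_t,
 * so (TX1)(T1)s1 first truncates the scalar to 8 bits and then sign-extends
 * it to the 16-bit operand width used by the widening addition.
 */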
851 
852 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
853 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
854 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
855 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
856 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
857 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
858 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
859 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
860 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
861 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
862 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
863 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
864 
865 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
866                        CPURISCVState *env, uint32_t desc,
867                        opivx2_fn fn, uint32_t esz)
868 {
869     uint32_t vm = vext_vm(desc);
870     uint32_t vl = env->vl;
871     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
872     uint32_t vta = vext_vta(desc);
873     uint32_t vma = vext_vma(desc);
874     uint32_t i;
875 
876     for (i = env->vstart; i < vl; i++) {
877         if (!vm && !vext_elem_mask(v0, i)) {
878             /* set masked-off elements to 1s */
879             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
880             continue;
881         }
882         fn(vd, s1, vs2, i);
883     }
884     env->vstart = 0;
885     /* set tail elements to 1s */
886     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
887 }
888 
889 /* generate the helpers for OPIVX */
890 #define GEN_VEXT_VX(NAME, ESZ)                            \
891 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
892                   void *vs2, CPURISCVState *env,          \
893                   uint32_t desc)                          \
894 {                                                         \
895     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
896                do_##NAME, ESZ);                           \
897 }
898 
899 GEN_VEXT_VX(vadd_vx_b, 1)
900 GEN_VEXT_VX(vadd_vx_h, 2)
901 GEN_VEXT_VX(vadd_vx_w, 4)
902 GEN_VEXT_VX(vadd_vx_d, 8)
903 GEN_VEXT_VX(vsub_vx_b, 1)
904 GEN_VEXT_VX(vsub_vx_h, 2)
905 GEN_VEXT_VX(vsub_vx_w, 4)
906 GEN_VEXT_VX(vsub_vx_d, 8)
907 GEN_VEXT_VX(vrsub_vx_b, 1)
908 GEN_VEXT_VX(vrsub_vx_h, 2)
909 GEN_VEXT_VX(vrsub_vx_w, 4)
910 GEN_VEXT_VX(vrsub_vx_d, 8)
911 
912 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
913 {
914     intptr_t oprsz = simd_oprsz(desc);
915     intptr_t i;
916 
917     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
918         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
919     }
920 }
921 
922 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
923 {
924     intptr_t oprsz = simd_oprsz(desc);
925     intptr_t i;
926 
927     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
928         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
929     }
930 }
931 
932 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
933 {
934     intptr_t oprsz = simd_oprsz(desc);
935     intptr_t i;
936 
937     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
938         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
939     }
940 }
941 
942 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
943 {
944     intptr_t oprsz = simd_oprsz(desc);
945     intptr_t i;
946 
947     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
948         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
949     }
950 }
951 
952 /* Vector Widening Integer Add/Subtract */
953 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
954 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
955 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
956 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
957 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
958 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
959 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
960 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
961 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
962 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
963 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
964 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
965 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
966 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
967 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
968 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
969 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
970 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
971 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
972 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
973 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
974 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
975 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
976 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
977 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
978 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
979 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
980 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
981 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
982 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
983 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
984 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
985 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
986 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
987 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
988 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
989 GEN_VEXT_VV(vwaddu_vv_b, 2)
990 GEN_VEXT_VV(vwaddu_vv_h, 4)
991 GEN_VEXT_VV(vwaddu_vv_w, 8)
992 GEN_VEXT_VV(vwsubu_vv_b, 2)
993 GEN_VEXT_VV(vwsubu_vv_h, 4)
994 GEN_VEXT_VV(vwsubu_vv_w, 8)
995 GEN_VEXT_VV(vwadd_vv_b, 2)
996 GEN_VEXT_VV(vwadd_vv_h, 4)
997 GEN_VEXT_VV(vwadd_vv_w, 8)
998 GEN_VEXT_VV(vwsub_vv_b, 2)
999 GEN_VEXT_VV(vwsub_vv_h, 4)
1000 GEN_VEXT_VV(vwsub_vv_w, 8)
1001 GEN_VEXT_VV(vwaddu_wv_b, 2)
1002 GEN_VEXT_VV(vwaddu_wv_h, 4)
1003 GEN_VEXT_VV(vwaddu_wv_w, 8)
1004 GEN_VEXT_VV(vwsubu_wv_b, 2)
1005 GEN_VEXT_VV(vwsubu_wv_h, 4)
1006 GEN_VEXT_VV(vwsubu_wv_w, 8)
1007 GEN_VEXT_VV(vwadd_wv_b, 2)
1008 GEN_VEXT_VV(vwadd_wv_h, 4)
1009 GEN_VEXT_VV(vwadd_wv_w, 8)
1010 GEN_VEXT_VV(vwsub_wv_b, 2)
1011 GEN_VEXT_VV(vwsub_wv_h, 4)
1012 GEN_VEXT_VV(vwsub_wv_w, 8)
1013 
1014 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1015 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1016 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1017 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1018 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1019 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1020 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1021 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1022 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1023 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1024 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1025 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1026 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1027 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1028 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1029 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1030 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1031 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1032 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1033 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1034 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1035 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1036 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1037 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1038 GEN_VEXT_VX(vwaddu_vx_b, 2)
1039 GEN_VEXT_VX(vwaddu_vx_h, 4)
1040 GEN_VEXT_VX(vwaddu_vx_w, 8)
1041 GEN_VEXT_VX(vwsubu_vx_b, 2)
1042 GEN_VEXT_VX(vwsubu_vx_h, 4)
1043 GEN_VEXT_VX(vwsubu_vx_w, 8)
1044 GEN_VEXT_VX(vwadd_vx_b, 2)
1045 GEN_VEXT_VX(vwadd_vx_h, 4)
1046 GEN_VEXT_VX(vwadd_vx_w, 8)
1047 GEN_VEXT_VX(vwsub_vx_b, 2)
1048 GEN_VEXT_VX(vwsub_vx_h, 4)
1049 GEN_VEXT_VX(vwsub_vx_w, 8)
1050 GEN_VEXT_VX(vwaddu_wx_b, 2)
1051 GEN_VEXT_VX(vwaddu_wx_h, 4)
1052 GEN_VEXT_VX(vwaddu_wx_w, 8)
1053 GEN_VEXT_VX(vwsubu_wx_b, 2)
1054 GEN_VEXT_VX(vwsubu_wx_h, 4)
1055 GEN_VEXT_VX(vwsubu_wx_w, 8)
1056 GEN_VEXT_VX(vwadd_wx_b, 2)
1057 GEN_VEXT_VX(vwadd_wx_h, 4)
1058 GEN_VEXT_VX(vwadd_wx_w, 8)
1059 GEN_VEXT_VX(vwsub_wx_b, 2)
1060 GEN_VEXT_VX(vwsub_wx_h, 4)
1061 GEN_VEXT_VX(vwsub_wx_w, 8)
1062 
1063 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1064 #define DO_VADC(N, M, C) (N + M + C)
1065 #define DO_VSBC(N, M, C) (N - M - C)
1066 
1067 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1068 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1069                   CPURISCVState *env, uint32_t desc)          \
1070 {                                                             \
1071     uint32_t vl = env->vl;                                    \
1072     uint32_t esz = sizeof(ETYPE);                             \
1073     uint32_t total_elems =                                    \
1074         vext_get_total_elems(env, desc, esz);                 \
1075     uint32_t vta = vext_vta(desc);                            \
1076     uint32_t i;                                               \
1077                                                               \
1078     for (i = env->vstart; i < vl; i++) {                      \
1079         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1080         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1081         ETYPE carry = vext_elem_mask(v0, i);                  \
1082                                                               \
1083         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1084     }                                                         \
1085     env->vstart = 0;                                          \
1086     /* set tail elements to 1s */                             \
1087     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1088 }
1089 
1090 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1091 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1092 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1093 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1094 
1095 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1096 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1097 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1098 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1099 
1100 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1101 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1102                   CPURISCVState *env, uint32_t desc)                     \
1103 {                                                                        \
1104     uint32_t vl = env->vl;                                               \
1105     uint32_t esz = sizeof(ETYPE);                                        \
1106     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1107     uint32_t vta = vext_vta(desc);                                       \
1108     uint32_t i;                                                          \
1109                                                                          \
1110     for (i = env->vstart; i < vl; i++) {                                 \
1111         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1112         ETYPE carry = vext_elem_mask(v0, i);                             \
1113                                                                          \
1114         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1115     }                                                                    \
1116     env->vstart = 0;                                                     \
1117     /* set tail elements to 1s */                                        \
1118     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1119 }
1120 
1121 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1122 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1123 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1124 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1125 
1126 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1127 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1128 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1129 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1130 
1131 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1132                           (__typeof(N))(N + M) < N)
1133 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
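/*
 * These compute the carry/borrow out of N + M + C and N - M - C without a
 * wider type: e.g. for uint8_t N = 200, M = 100, C = 0, (uint8_t)(N + M)
 * wraps to 44 < 200, so DO_MADC reports a carry of 1.
 */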
1134 
1135 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1136 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1137                   CPURISCVState *env, uint32_t desc)          \
1138 {                                                             \
1139     uint32_t vl = env->vl;                                    \
1140     uint32_t vm = vext_vm(desc);                              \
1141     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1142     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1143     uint32_t i;                                               \
1144                                                               \
1145     for (i = env->vstart; i < vl; i++) {                      \
1146         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1147         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1148         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1149         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1150     }                                                         \
1151     env->vstart = 0;                                          \
1152     /* mask destination register is always tail-agnostic */   \
1153     /* set tail elements to 1s */                             \
1154     if (vta_all_1s) {                                         \
1155         for (; i < total_elems; i++) {                        \
1156             vext_set_elem_mask(vd, i, 1);                     \
1157         }                                                     \
1158     }                                                         \
1159 }
1160 
1161 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1162 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1163 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1164 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1165 
1166 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1167 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1168 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1169 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1170 
1171 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1172 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1173                   void *vs2, CPURISCVState *env, uint32_t desc) \
1174 {                                                               \
1175     uint32_t vl = env->vl;                                      \
1176     uint32_t vm = vext_vm(desc);                                \
1177     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
1178     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1179     uint32_t i;                                                 \
1180                                                                 \
1181     for (i = env->vstart; i < vl; i++) {                        \
1182         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1183         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1184         vext_set_elem_mask(vd, i,                               \
1185                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1186     }                                                           \
1187     env->vstart = 0;                                            \
1188     /* mask destination register is always tail-agnostic */     \
1189     /* set tail elements to 1s */                               \
1190     if (vta_all_1s) {                                           \
1191         for (; i < total_elems; i++) {                          \
1192             vext_set_elem_mask(vd, i, 1);                       \
1193         }                                                       \
1194     }                                                           \
1195 }
1196 
1197 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1198 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1199 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1200 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1201 
1202 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1203 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1204 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1205 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1206 
1207 /* Vector Bitwise Logical Instructions */
1208 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1209 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1210 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1211 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1212 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1213 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1214 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1215 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1216 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1217 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1218 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1219 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1220 GEN_VEXT_VV(vand_vv_b, 1)
1221 GEN_VEXT_VV(vand_vv_h, 2)
1222 GEN_VEXT_VV(vand_vv_w, 4)
1223 GEN_VEXT_VV(vand_vv_d, 8)
1224 GEN_VEXT_VV(vor_vv_b, 1)
1225 GEN_VEXT_VV(vor_vv_h, 2)
1226 GEN_VEXT_VV(vor_vv_w, 4)
1227 GEN_VEXT_VV(vor_vv_d, 8)
1228 GEN_VEXT_VV(vxor_vv_b, 1)
1229 GEN_VEXT_VV(vxor_vv_h, 2)
1230 GEN_VEXT_VV(vxor_vv_w, 4)
1231 GEN_VEXT_VV(vxor_vv_d, 8)
1232 
1233 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1234 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1235 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1236 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1237 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1238 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1239 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1240 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1241 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1242 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1243 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1244 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1245 GEN_VEXT_VX(vand_vx_b, 1)
1246 GEN_VEXT_VX(vand_vx_h, 2)
1247 GEN_VEXT_VX(vand_vx_w, 4)
1248 GEN_VEXT_VX(vand_vx_d, 8)
1249 GEN_VEXT_VX(vor_vx_b, 1)
1250 GEN_VEXT_VX(vor_vx_h, 2)
1251 GEN_VEXT_VX(vor_vx_w, 4)
1252 GEN_VEXT_VX(vor_vx_d, 8)
1253 GEN_VEXT_VX(vxor_vx_b, 1)
1254 GEN_VEXT_VX(vxor_vx_h, 2)
1255 GEN_VEXT_VX(vxor_vx_w, 4)
1256 GEN_VEXT_VX(vxor_vx_d, 8)
1257 
1258 /* Vector Single-Width Bit Shift Instructions */
1259 #define DO_SLL(N, M)  (N << (M))
1260 #define DO_SRL(N, M)  (N >> (M))
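/*
 * MASK below restricts the shift amount to the low log2(SEW) bits of the
 * shift operand, e.g. for 8-bit elements only s1 & 0x7 is used, so the
 * shift amount is effectively taken modulo SEW.
 */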
1261 
1262 /* generate the helpers for shift instructions with two vector operands */
1263 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1264 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1265                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1266 {                                                                         \
1267     uint32_t vm = vext_vm(desc);                                          \
1268     uint32_t vl = env->vl;                                                \
1269     uint32_t esz = sizeof(TS1);                                           \
1270     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1271     uint32_t vta = vext_vta(desc);                                        \
1272     uint32_t vma = vext_vma(desc);                                        \
1273     uint32_t i;                                                           \
1274                                                                           \
1275     for (i = env->vstart; i < vl; i++) {                                  \
1276         if (!vm && !vext_elem_mask(v0, i)) {                              \
1277             /* set masked-off elements to 1s */                           \
1278             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1279             continue;                                                     \
1280         }                                                                 \
1281         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1282         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1283         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1284     }                                                                     \
1285     env->vstart = 0;                                                      \
1286     /* set tail elements to 1s */                                         \
1287     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1288 }
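
/*
 * For illustration only: with NAME = vsll_vv_b the macro above generates,
 * roughly, a per-element body of the form
 *
 *     uint8_t s1 = *((uint8_t *)vs1 + H1(i));
 *     uint8_t s2 = *((uint8_t *)vs2 + H1(i));
 *     *((uint8_t *)vd + H1(i)) = s2 << (s1 & 0x7);
 *
 * MASK is SEW - 1, so only the low log2(SEW) bits of the shift amount are
 * used, as the vector shift instructions require.
 */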
1289 
1290 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1291 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1292 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1293 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1294 
1295 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1296 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1297 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1298 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1299 
1300 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1301 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1302 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1303 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
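
/*
 * Note that vsra reuses DO_SRL with a signed TS2: shifting a signed value
 * right in C is an arithmetic shift on the compilers QEMU supports, so the
 * same macro yields a logical shift for vsrl (unsigned TS2) and an
 * arithmetic shift for vsra (signed TS2).
 */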
1304 
1305 /* generate the helpers for shift instructions with one vector and one scalar */
1306 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1307 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1308                   void *vs2, CPURISCVState *env,            \
1309                   uint32_t desc)                            \
1310 {                                                           \
1311     uint32_t vm = vext_vm(desc);                            \
1312     uint32_t vl = env->vl;                                  \
1313     uint32_t esz = sizeof(TD);                              \
1314     uint32_t total_elems =                                  \
1315         vext_get_total_elems(env, desc, esz);               \
1316     uint32_t vta = vext_vta(desc);                          \
1317     uint32_t vma = vext_vma(desc);                          \
1318     uint32_t i;                                             \
1319                                                             \
1320     for (i = env->vstart; i < vl; i++) {                    \
1321         if (!vm && !vext_elem_mask(v0, i)) {                \
1322             /* set masked-off elements to 1s */             \
1323             vext_set_elems_1s(vd, vma, i * esz,             \
1324                               (i + 1) * esz);               \
1325             continue;                                       \
1326         }                                                   \
1327         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1328         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1329     }                                                       \
1330     env->vstart = 0;                                        \
1331     /* set tail elements to 1s */                           \
1332     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1333 }
1334 
1335 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1336 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1337 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1338 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1339 
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1341 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1342 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1343 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1344 
1345 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1346 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1347 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1348 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1349 
1350 /* Vector Narrowing Integer Right Shift Instructions */
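/*
 * The narrowing shifts below reuse the macros above with a 2*SEW-wide source
 * type (TS2/TD vs. TS1 differ in width): the shift is performed on the wide
 * source element, the shift amount is masked with 2*SEW - 1, and the store
 * through the SEW-wide destination pointer truncates the result to SEW bits.
 */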
1351 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1352 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1353 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1354 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1355 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1356 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1357 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1358 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1359 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1360 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1361 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1362 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1363 
1364 /* Vector Integer Comparison Instructions */
1365 #define DO_MSEQ(N, M) (N == M)
1366 #define DO_MSNE(N, M) (N != M)
1367 #define DO_MSLT(N, M) (N < M)
1368 #define DO_MSLE(N, M) (N <= M)
1369 #define DO_MSGT(N, M) (N > M)
1370 
1371 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1372 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1373                   CPURISCVState *env, uint32_t desc)          \
1374 {                                                             \
1375     uint32_t vm = vext_vm(desc);                              \
1376     uint32_t vl = env->vl;                                    \
1377     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1378     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1379     uint32_t vma = vext_vma(desc);                            \
1380     uint32_t i;                                               \
1381                                                               \
1382     for (i = env->vstart; i < vl; i++) {                      \
1383         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1384         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1385         if (!vm && !vext_elem_mask(v0, i)) {                  \
1386             /* set masked-off elements to 1s */               \
1387             if (vma) {                                        \
1388                 vext_set_elem_mask(vd, i, 1);                 \
1389             }                                                 \
1390             continue;                                         \
1391         }                                                     \
1392         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1393     }                                                         \
1394     env->vstart = 0;                                          \
1395     /* mask destination register is always tail-agnostic */   \
1396     /* set tail elements to 1s */                             \
1397     if (vta_all_1s) {                                         \
1398         for (; i < total_elems; i++) {                        \
1399             vext_set_elem_mask(vd, i, 1);                     \
1400         }                                                     \
1401     }                                                         \
1402 }
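
/*
 * The comparisons write a mask register, one bit per element, so the tail
 * here is measured in bits: total_elems is VLEN, the number of bits in a
 * vector register.  Mask-producing instructions always treat the tail
 * agnostically; vta_all_1s indicates that agnostic tail bits should actually
 * be written as 1s.
 */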
1403 
1404 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1405 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1406 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1407 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1408 
1409 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1410 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1411 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1412 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1413 
1414 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1415 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1416 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1417 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1418 
1419 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1420 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1421 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1422 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1423 
1424 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1425 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1426 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1427 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1428 
1429 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1430 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1431 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1432 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1433 
1434 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1435 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1436                   CPURISCVState *env, uint32_t desc)                \
1437 {                                                                   \
1438     uint32_t vm = vext_vm(desc);                                    \
1439     uint32_t vl = env->vl;                                          \
1440     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1441     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1442     uint32_t vma = vext_vma(desc);                                  \
1443     uint32_t i;                                                     \
1444                                                                     \
1445     for (i = env->vstart; i < vl; i++) {                            \
1446         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1447         if (!vm && !vext_elem_mask(v0, i)) {                        \
1448             /* set masked-off elements to 1s */                     \
1449             if (vma) {                                              \
1450                 vext_set_elem_mask(vd, i, 1);                       \
1451             }                                                       \
1452             continue;                                               \
1453         }                                                           \
1454         vext_set_elem_mask(vd, i,                                   \
1455                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1456     }                                                               \
1457     env->vstart = 0;                                                \
1458     /* mask destination register is always tail-agnostic */         \
1459     /* set tail elements to 1s */                                   \
1460     if (vta_all_1s) {                                               \
1461         for (; i < total_elems; i++) {                              \
1462             vext_set_elem_mask(vd, i, 1);                           \
1463         }                                                           \
1464     }                                                               \
1465 }
1466 
1467 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1468 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1469 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1470 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1471 
1472 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1473 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1474 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1475 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1476 
1477 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1478 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1479 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1480 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1481 
1482 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1483 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1484 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1485 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1486 
1487 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1488 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1489 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1490 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1491 
1492 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1493 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1494 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1495 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1496 
1497 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1498 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1499 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1500 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1501 
1502 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1503 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1504 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1505 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1506 
1507 /* Vector Integer Min/Max Instructions */
1508 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1509 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1510 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1511 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1512 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1513 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1514 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1515 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1516 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1517 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1518 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1519 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1520 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1521 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1522 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1523 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1524 GEN_VEXT_VV(vminu_vv_b, 1)
1525 GEN_VEXT_VV(vminu_vv_h, 2)
1526 GEN_VEXT_VV(vminu_vv_w, 4)
1527 GEN_VEXT_VV(vminu_vv_d, 8)
1528 GEN_VEXT_VV(vmin_vv_b, 1)
1529 GEN_VEXT_VV(vmin_vv_h, 2)
1530 GEN_VEXT_VV(vmin_vv_w, 4)
1531 GEN_VEXT_VV(vmin_vv_d, 8)
1532 GEN_VEXT_VV(vmaxu_vv_b, 1)
1533 GEN_VEXT_VV(vmaxu_vv_h, 2)
1534 GEN_VEXT_VV(vmaxu_vv_w, 4)
1535 GEN_VEXT_VV(vmaxu_vv_d, 8)
1536 GEN_VEXT_VV(vmax_vv_b, 1)
1537 GEN_VEXT_VV(vmax_vv_h, 2)
1538 GEN_VEXT_VV(vmax_vv_w, 4)
1539 GEN_VEXT_VV(vmax_vv_d, 8)
1540 
1541 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1542 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1543 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1544 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1545 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1546 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1547 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1548 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1549 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1550 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1551 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1552 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1553 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1554 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1555 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1556 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1557 GEN_VEXT_VX(vminu_vx_b, 1)
1558 GEN_VEXT_VX(vminu_vx_h, 2)
1559 GEN_VEXT_VX(vminu_vx_w, 4)
1560 GEN_VEXT_VX(vminu_vx_d, 8)
1561 GEN_VEXT_VX(vmin_vx_b, 1)
1562 GEN_VEXT_VX(vmin_vx_h, 2)
1563 GEN_VEXT_VX(vmin_vx_w, 4)
1564 GEN_VEXT_VX(vmin_vx_d, 8)
1565 GEN_VEXT_VX(vmaxu_vx_b, 1)
1566 GEN_VEXT_VX(vmaxu_vx_h, 2)
1567 GEN_VEXT_VX(vmaxu_vx_w, 4)
1568 GEN_VEXT_VX(vmaxu_vx_d, 8)
1569 GEN_VEXT_VX(vmax_vx_b, 1)
1570 GEN_VEXT_VX(vmax_vx_h, 2)
1571 GEN_VEXT_VX(vmax_vx_w, 4)
1572 GEN_VEXT_VX(vmax_vx_d, 8)
1573 
1574 /* Vector Single-Width Integer Multiply Instructions */
1575 #define DO_MUL(N, M) (N * M)
1576 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1577 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1578 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1579 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1580 GEN_VEXT_VV(vmul_vv_b, 1)
1581 GEN_VEXT_VV(vmul_vv_h, 2)
1582 GEN_VEXT_VV(vmul_vv_w, 4)
1583 GEN_VEXT_VV(vmul_vv_d, 8)
1584 
1585 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1586 {
1587     return (int16_t)s2 * (int16_t)s1 >> 8;
1588 }
1589 
1590 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1591 {
1592     return (int32_t)s2 * (int32_t)s1 >> 16;
1593 }
1594 
1595 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1596 {
1597     return (int64_t)s2 * (int64_t)s1 >> 32;
1598 }
1599 
1600 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1601 {
1602     uint64_t hi_64, lo_64;
1603 
1604     muls64(&lo_64, &hi_64, s1, s2);
1605     return hi_64;
1606 }
1607 
1608 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1609 {
1610     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1611 }
1612 
1613 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1614 {
1615     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1616 }
1617 
1618 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1619 {
1620     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1621 }
1622 
1623 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1624 {
1625     uint64_t hi_64, lo_64;
1626 
1627     mulu64(&lo_64, &hi_64, s2, s1);
1628     return hi_64;
1629 }
1630 
1631 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1632 {
1633     return (int16_t)s2 * (uint16_t)s1 >> 8;
1634 }
1635 
1636 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1637 {
1638     return (int32_t)s2 * (uint32_t)s1 >> 16;
1639 }
1640 
1641 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1642 {
1643     return (int64_t)s2 * (uint64_t)s1 >> 32;
1644 }
1645 
1646 /*
1647  * Let  A = signed operand,
1648  *      B = unsigned operand,
1649  *      P = mulu64(A, B), the unsigned product of A's bit pattern and B,
1650  *      SP = A * B, the desired signed product.
1651  *
1652  * When A < 0, the unsigned interpretation of A's bit pattern is
1653  * 2 ** 64 + A, so
1654  *      P  = (2 ** 64 + A) * B
1655  *         = A * B + 2 ** 64 * B
1656  * and therefore
1657  *      SP = A * B = P - 2 ** 64 * B
1658  * When A >= 0,
1659  *      SP = P
1660  *
1661  * In either case the high half of SP follows from the high half of P as
1662  *      HI_P -= (A < 0 ? B : 0)
1663  */
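
/*
 * A small 8-bit analogue of the adjustment above, for illustration:
 * A = -2 (bit pattern 0xfe), B = 3.  The unsigned product is
 * 0xfe * 3 = 0x02fa, so HI_P = 0x02.  Since A < 0, HI_P -= B gives
 * 0x02 - 3 = -1 = 0xff, which is indeed the high byte of the signed
 * product -2 * 3 = -6 = 0xfffa.
 */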
1664 
1665 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1666 {
1667     uint64_t hi_64, lo_64;
1668 
1669     mulu64(&lo_64, &hi_64, s2, s1);
1670 
1671     hi_64 -= s2 < 0 ? s1 : 0;
1672     return hi_64;
1673 }
1674 
1675 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1676 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1677 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1678 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1679 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1680 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1681 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1682 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1683 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1684 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1685 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1686 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1687 GEN_VEXT_VV(vmulh_vv_b, 1)
1688 GEN_VEXT_VV(vmulh_vv_h, 2)
1689 GEN_VEXT_VV(vmulh_vv_w, 4)
1690 GEN_VEXT_VV(vmulh_vv_d, 8)
1691 GEN_VEXT_VV(vmulhu_vv_b, 1)
1692 GEN_VEXT_VV(vmulhu_vv_h, 2)
1693 GEN_VEXT_VV(vmulhu_vv_w, 4)
1694 GEN_VEXT_VV(vmulhu_vv_d, 8)
1695 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1696 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1697 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1698 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1699 
1700 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1701 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1702 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1703 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1704 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1705 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1706 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1707 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1708 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1709 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1710 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1711 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1712 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1713 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1714 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1715 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1716 GEN_VEXT_VX(vmul_vx_b, 1)
1717 GEN_VEXT_VX(vmul_vx_h, 2)
1718 GEN_VEXT_VX(vmul_vx_w, 4)
1719 GEN_VEXT_VX(vmul_vx_d, 8)
1720 GEN_VEXT_VX(vmulh_vx_b, 1)
1721 GEN_VEXT_VX(vmulh_vx_h, 2)
1722 GEN_VEXT_VX(vmulh_vx_w, 4)
1723 GEN_VEXT_VX(vmulh_vx_d, 8)
1724 GEN_VEXT_VX(vmulhu_vx_b, 1)
1725 GEN_VEXT_VX(vmulhu_vx_h, 2)
1726 GEN_VEXT_VX(vmulhu_vx_w, 4)
1727 GEN_VEXT_VX(vmulhu_vx_d, 8)
1728 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1729 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1730 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1731 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1732 
1733 /* Vector Integer Divide Instructions */
1734 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1735 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1736 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1737         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1738 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1739         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
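
/*
 * N == -N is true only for N == 0 and for the minimum value of N's type
 * (negating INT*_MIN overflows back to itself), so the extra test in DO_DIV
 * and DO_REM catches the INT*_MIN / -1 overflow case without invoking
 * undefined behaviour on the host.  The chosen results (all-ones quotient
 * and unchanged dividend on divide-by-zero, INT*_MIN quotient and zero
 * remainder on overflow) match the RISC-V division semantics.
 */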
1740 
1741 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1742 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1743 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1744 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1745 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1746 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1747 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1748 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1749 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1750 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1751 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1752 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1753 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1754 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1755 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1756 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1757 GEN_VEXT_VV(vdivu_vv_b, 1)
1758 GEN_VEXT_VV(vdivu_vv_h, 2)
1759 GEN_VEXT_VV(vdivu_vv_w, 4)
1760 GEN_VEXT_VV(vdivu_vv_d, 8)
1761 GEN_VEXT_VV(vdiv_vv_b, 1)
1762 GEN_VEXT_VV(vdiv_vv_h, 2)
1763 GEN_VEXT_VV(vdiv_vv_w, 4)
1764 GEN_VEXT_VV(vdiv_vv_d, 8)
1765 GEN_VEXT_VV(vremu_vv_b, 1)
1766 GEN_VEXT_VV(vremu_vv_h, 2)
1767 GEN_VEXT_VV(vremu_vv_w, 4)
1768 GEN_VEXT_VV(vremu_vv_d, 8)
1769 GEN_VEXT_VV(vrem_vv_b, 1)
1770 GEN_VEXT_VV(vrem_vv_h, 2)
1771 GEN_VEXT_VV(vrem_vv_w, 4)
1772 GEN_VEXT_VV(vrem_vv_d, 8)
1773 
1774 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1775 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1776 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1777 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1778 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1779 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1780 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1781 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1782 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1783 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1784 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1785 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1786 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1787 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1788 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1789 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1790 GEN_VEXT_VX(vdivu_vx_b, 1)
1791 GEN_VEXT_VX(vdivu_vx_h, 2)
1792 GEN_VEXT_VX(vdivu_vx_w, 4)
1793 GEN_VEXT_VX(vdivu_vx_d, 8)
1794 GEN_VEXT_VX(vdiv_vx_b, 1)
1795 GEN_VEXT_VX(vdiv_vx_h, 2)
1796 GEN_VEXT_VX(vdiv_vx_w, 4)
1797 GEN_VEXT_VX(vdiv_vx_d, 8)
1798 GEN_VEXT_VX(vremu_vx_b, 1)
1799 GEN_VEXT_VX(vremu_vx_h, 2)
1800 GEN_VEXT_VX(vremu_vx_w, 4)
1801 GEN_VEXT_VX(vremu_vx_d, 8)
1802 GEN_VEXT_VX(vrem_vx_b, 1)
1803 GEN_VEXT_VX(vrem_vx_h, 2)
1804 GEN_VEXT_VX(vrem_vx_w, 4)
1805 GEN_VEXT_VX(vrem_vx_d, 8)
1806 
1807 /* Vector Widening Integer Multiply Instructions */
1808 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1809 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1810 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1811 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1812 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1813 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1814 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1815 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1816 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1817 GEN_VEXT_VV(vwmul_vv_b, 2)
1818 GEN_VEXT_VV(vwmul_vv_h, 4)
1819 GEN_VEXT_VV(vwmul_vv_w, 8)
1820 GEN_VEXT_VV(vwmulu_vv_b, 2)
1821 GEN_VEXT_VV(vwmulu_vv_h, 4)
1822 GEN_VEXT_VV(vwmulu_vv_w, 8)
1823 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1824 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1825 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1826 
1827 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1828 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1829 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1830 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1831 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1832 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1833 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1834 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1835 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1836 GEN_VEXT_VX(vwmul_vx_b, 2)
1837 GEN_VEXT_VX(vwmul_vx_h, 4)
1838 GEN_VEXT_VX(vwmul_vx_w, 8)
1839 GEN_VEXT_VX(vwmulu_vx_b, 2)
1840 GEN_VEXT_VX(vwmulu_vx_h, 4)
1841 GEN_VEXT_VX(vwmulu_vx_w, 8)
1842 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1843 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1844 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1845 
1846 /* Vector Single-Width Integer Multiply-Add Instructions */
1847 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1848 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1849 {                                                                  \
1850     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1851     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1852     TD d = *((TD *)vd + HD(i));                                    \
1853     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1854 }
1855 
1856 #define DO_MACC(N, M, D) (M * N + D)
1857 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1858 #define DO_MADD(N, M, D) (M * D + N)
1859 #define DO_NMSUB(N, M, D) (-(M * D) + N)
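
/*
 * In the OPIVV3 call OP(s2, s1, d), the operands map as follows (vd is both
 * an input and the destination):
 *
 *     DO_MACC:  vd = (vs1 * vs2) + vd
 *     DO_NMSAC: vd = -(vs1 * vs2) + vd
 *     DO_MADD:  vd = (vs1 * vd) + vs2
 *     DO_NMSUB: vd = -(vs1 * vd) + vs2
 */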
1860 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1861 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1862 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1863 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1864 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1865 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1866 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1867 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1868 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1869 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1870 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1871 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1872 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1873 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1874 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1875 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1876 GEN_VEXT_VV(vmacc_vv_b, 1)
1877 GEN_VEXT_VV(vmacc_vv_h, 2)
1878 GEN_VEXT_VV(vmacc_vv_w, 4)
1879 GEN_VEXT_VV(vmacc_vv_d, 8)
1880 GEN_VEXT_VV(vnmsac_vv_b, 1)
1881 GEN_VEXT_VV(vnmsac_vv_h, 2)
1882 GEN_VEXT_VV(vnmsac_vv_w, 4)
1883 GEN_VEXT_VV(vnmsac_vv_d, 8)
1884 GEN_VEXT_VV(vmadd_vv_b, 1)
1885 GEN_VEXT_VV(vmadd_vv_h, 2)
1886 GEN_VEXT_VV(vmadd_vv_w, 4)
1887 GEN_VEXT_VV(vmadd_vv_d, 8)
1888 GEN_VEXT_VV(vnmsub_vv_b, 1)
1889 GEN_VEXT_VV(vnmsub_vv_h, 2)
1890 GEN_VEXT_VV(vnmsub_vv_w, 4)
1891 GEN_VEXT_VV(vnmsub_vv_d, 8)
1892 
1893 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1894 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1895 {                                                                   \
1896     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1897     TD d = *((TD *)vd + HD(i));                                     \
1898     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1899 }
1900 
1901 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1902 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1903 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1904 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1905 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1906 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1907 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1908 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1909 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1910 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1911 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1912 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1913 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1914 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1915 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1916 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1917 GEN_VEXT_VX(vmacc_vx_b, 1)
1918 GEN_VEXT_VX(vmacc_vx_h, 2)
1919 GEN_VEXT_VX(vmacc_vx_w, 4)
1920 GEN_VEXT_VX(vmacc_vx_d, 8)
1921 GEN_VEXT_VX(vnmsac_vx_b, 1)
1922 GEN_VEXT_VX(vnmsac_vx_h, 2)
1923 GEN_VEXT_VX(vnmsac_vx_w, 4)
1924 GEN_VEXT_VX(vnmsac_vx_d, 8)
1925 GEN_VEXT_VX(vmadd_vx_b, 1)
1926 GEN_VEXT_VX(vmadd_vx_h, 2)
1927 GEN_VEXT_VX(vmadd_vx_w, 4)
1928 GEN_VEXT_VX(vmadd_vx_d, 8)
1929 GEN_VEXT_VX(vnmsub_vx_b, 1)
1930 GEN_VEXT_VX(vnmsub_vx_h, 2)
1931 GEN_VEXT_VX(vnmsub_vx_w, 4)
1932 GEN_VEXT_VX(vnmsub_vx_d, 8)
1933 
1934 /* Vector Widening Integer Multiply-Add Instructions */
1935 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1936 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1937 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1938 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1939 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1940 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1941 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1942 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1943 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1944 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1945 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1946 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1947 GEN_VEXT_VV(vwmacc_vv_b, 2)
1948 GEN_VEXT_VV(vwmacc_vv_h, 4)
1949 GEN_VEXT_VV(vwmacc_vv_w, 8)
1950 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1951 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1952 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1953 
1954 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1955 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1956 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1957 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1958 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1959 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1960 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1961 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1962 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1963 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1964 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1965 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1966 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1967 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1968 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1969 GEN_VEXT_VX(vwmacc_vx_b, 2)
1970 GEN_VEXT_VX(vwmacc_vx_h, 4)
1971 GEN_VEXT_VX(vwmacc_vx_w, 8)
1972 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1973 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1974 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1975 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1976 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1977 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1978 
1979 /* Vector Integer Merge and Move Instructions */
1980 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1981 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1982                   uint32_t desc)                                     \
1983 {                                                                    \
1984     uint32_t vl = env->vl;                                           \
1985     uint32_t esz = sizeof(ETYPE);                                    \
1986     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1987     uint32_t vta = vext_vta(desc);                                   \
1988     uint32_t i;                                                      \
1989                                                                      \
1990     for (i = env->vstart; i < vl; i++) {                             \
1991         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1992         *((ETYPE *)vd + H(i)) = s1;                                  \
1993     }                                                                \
1994     env->vstart = 0;                                                 \
1995     /* set tail elements to 1s */                                    \
1996     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1997 }
1998 
1999 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2000 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2001 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2002 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2003 
2004 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2005 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2006                   uint32_t desc)                                     \
2007 {                                                                    \
2008     uint32_t vl = env->vl;                                           \
2009     uint32_t esz = sizeof(ETYPE);                                    \
2010     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2011     uint32_t vta = vext_vta(desc);                                   \
2012     uint32_t i;                                                      \
2013                                                                      \
2014     for (i = env->vstart; i < vl; i++) {                             \
2015         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2016     }                                                                \
2017     env->vstart = 0;                                                 \
2018     /* set tail elements to 1s */                                    \
2019     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2020 }
2021 
2022 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2023 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2024 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2025 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2026 
2027 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2028 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2029                   CPURISCVState *env, uint32_t desc)                 \
2030 {                                                                    \
2031     uint32_t vl = env->vl;                                           \
2032     uint32_t esz = sizeof(ETYPE);                                    \
2033     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2034     uint32_t vta = vext_vta(desc);                                   \
2035     uint32_t i;                                                      \
2036                                                                      \
2037     for (i = env->vstart; i < vl; i++) {                             \
2038         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2039         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2040     }                                                                \
2041     env->vstart = 0;                                                 \
2042     /* set tail elements to 1s */                                    \
2043     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2044 }
2045 
2046 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2047 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2048 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2049 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2050 
2051 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2052 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2053                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2054 {                                                                    \
2055     uint32_t vl = env->vl;                                           \
2056     uint32_t esz = sizeof(ETYPE);                                    \
2057     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2058     uint32_t vta = vext_vta(desc);                                   \
2059     uint32_t i;                                                      \
2060                                                                      \
2061     for (i = env->vstart; i < vl; i++) {                             \
2062         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2063         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2064                    (ETYPE)(target_long)s1);                          \
2065         *((ETYPE *)vd + H(i)) = d;                                   \
2066     }                                                                \
2067     env->vstart = 0;                                                 \
2068     /* set tail elements to 1s */                                    \
2069     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2070 }
2071 
2072 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2073 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2074 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2075 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2076 
2077 /*
2078  *** Vector Fixed-Point Arithmetic Instructions
2079  */
2080 
2081 /* Vector Single-Width Saturating Add and Subtract */
2082 
2083 /*
2084  * As fixed-point instructions have a rounding mode and may saturate,
2085  * define common macros for fixed point here.
2086  */
2087 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2088                           CPURISCVState *env, int vxrm);
2089 
2090 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2091 static inline void                                                  \
2092 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2093           CPURISCVState *env, int vxrm)                             \
2094 {                                                                   \
2095     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2096     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2097     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2098 }
2099 
2100 static inline void
2101 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2102              CPURISCVState *env,
2103              uint32_t vl, uint32_t vm, int vxrm,
2104              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2105 {
2106     for (uint32_t i = env->vstart; i < vl; i++) {
2107         if (!vm && !vext_elem_mask(v0, i)) {
2108             /* set masked-off elements to 1s */
2109             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2110             continue;
2111         }
2112         fn(vd, vs1, vs2, i, env, vxrm);
2113     }
2114     env->vstart = 0;
2115 }
2116 
2117 static inline void
2118 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2119              CPURISCVState *env,
2120              uint32_t desc,
2121              opivv2_rm_fn *fn, uint32_t esz)
2122 {
2123     uint32_t vm = vext_vm(desc);
2124     uint32_t vl = env->vl;
2125     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2126     uint32_t vta = vext_vta(desc);
2127     uint32_t vma = vext_vma(desc);
2128 
2129     switch (env->vxrm) {
2130     case 0: /* rnu */
2131         vext_vv_rm_1(vd, v0, vs1, vs2,
2132                      env, vl, vm, 0, fn, vma, esz);
2133         break;
2134     case 1: /* rne */
2135         vext_vv_rm_1(vd, v0, vs1, vs2,
2136                      env, vl, vm, 1, fn, vma, esz);
2137         break;
2138     case 2: /* rdn */
2139         vext_vv_rm_1(vd, v0, vs1, vs2,
2140                      env, vl, vm, 2, fn, vma, esz);
2141         break;
2142     default: /* rod */
2143         vext_vv_rm_1(vd, v0, vs1, vs2,
2144                      env, vl, vm, 3, fn, vma, esz);
2145         break;
2146     }
2147     /* set tail elements to 1s */
2148     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2149 }
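
/*
 * The switch above passes the rounding mode to vext_vv_rm_1() as a literal
 * constant rather than forwarding env->vxrm directly, presumably so that the
 * compiler can specialize the inlined per-element helper (and get_round())
 * for each rounding mode.
 */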
2150 
2151 /* generate helpers for fixed point instructions with OPIVV format */
2152 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2153 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2154                   CPURISCVState *env, uint32_t desc)            \
2155 {                                                               \
2156     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2157                  do_##NAME, ESZ);                               \
2158 }
2159 
2160 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2161 {
2162     uint8_t res = a + b;
2163     if (res < a) {
2164         res = UINT8_MAX;
2165         env->vxsat = 0x1;
2166     }
2167     return res;
2168 }
2169 
2170 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2171                                uint16_t b)
2172 {
2173     uint16_t res = a + b;
2174     if (res < a) {
2175         res = UINT16_MAX;
2176         env->vxsat = 0x1;
2177     }
2178     return res;
2179 }
2180 
2181 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2182                                uint32_t b)
2183 {
2184     uint32_t res = a + b;
2185     if (res < a) {
2186         res = UINT32_MAX;
2187         env->vxsat = 0x1;
2188     }
2189     return res;
2190 }
2191 
2192 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2193                                uint64_t b)
2194 {
2195     uint64_t res = a + b;
2196     if (res < a) {
2197         res = UINT64_MAX;
2198         env->vxsat = 0x1;
2199     }
2200     return res;
2201 }
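
/*
 * The unsigned saturating adds rely on wrap-around: the addition is performed
 * modulo 2**N, so res < a exactly when a carry out occurred.  For example,
 * with uint8_t, 200 + 100 wraps to 44; 44 < 200, so the result saturates to
 * UINT8_MAX and vxsat is set.
 */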
2202 
2203 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2204 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2205 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2206 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2207 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2208 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2209 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2210 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2211 
2212 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2213                           CPURISCVState *env, int vxrm);
2214 
2215 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2216 static inline void                                                  \
2217 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2218           CPURISCVState *env, int vxrm)                             \
2219 {                                                                   \
2220     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2221     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2222 }
2223 
2224 static inline void
2225 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2226              CPURISCVState *env,
2227              uint32_t vl, uint32_t vm, int vxrm,
2228              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2229 {
2230     for (uint32_t i = env->vstart; i < vl; i++) {
2231         if (!vm && !vext_elem_mask(v0, i)) {
2232             /* set masked-off elements to 1s */
2233             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2234             continue;
2235         }
2236         fn(vd, s1, vs2, i, env, vxrm);
2237     }
2238     env->vstart = 0;
2239 }
2240 
2241 static inline void
2242 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2243              CPURISCVState *env,
2244              uint32_t desc,
2245              opivx2_rm_fn *fn, uint32_t esz)
2246 {
2247     uint32_t vm = vext_vm(desc);
2248     uint32_t vl = env->vl;
2249     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2250     uint32_t vta = vext_vta(desc);
2251     uint32_t vma = vext_vma(desc);
2252 
2253     switch (env->vxrm) {
2254     case 0: /* rnu */
2255         vext_vx_rm_1(vd, v0, s1, vs2,
2256                      env, vl, vm, 0, fn, vma, esz);
2257         break;
2258     case 1: /* rne */
2259         vext_vx_rm_1(vd, v0, s1, vs2,
2260                      env, vl, vm, 1, fn, vma, esz);
2261         break;
2262     case 2: /* rdn */
2263         vext_vx_rm_1(vd, v0, s1, vs2,
2264                      env, vl, vm, 2, fn, vma, esz);
2265         break;
2266     default: /* rod */
2267         vext_vx_rm_1(vd, v0, s1, vs2,
2268                      env, vl, vm, 3, fn, vma, esz);
2269         break;
2270     }
2271     /* set tail elements to 1s */
2272     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2273 }
2274 
2275 /* generate helpers for fixed point instructions with OPIVX format */
2276 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2277 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2278                   void *vs2, CPURISCVState *env,          \
2279                   uint32_t desc)                          \
2280 {                                                         \
2281     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2282                  do_##NAME, ESZ);                         \
2283 }
2284 
2285 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2286 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2287 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2288 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2289 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2290 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2291 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2292 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2293 
2294 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2295 {
2296     int8_t res = a + b;
2297     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2298         res = a > 0 ? INT8_MAX : INT8_MIN;
2299         env->vxsat = 0x1;
2300     }
2301     return res;
2302 }
2303 
2304 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2305 {
2306     int16_t res = a + b;
2307     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2308         res = a > 0 ? INT16_MAX : INT16_MIN;
2309         env->vxsat = 0x1;
2310     }
2311     return res;
2312 }
2313 
2314 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2315 {
2316     int32_t res = a + b;
2317     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2318         res = a > 0 ? INT32_MAX : INT32_MIN;
2319         env->vxsat = 0x1;
2320     }
2321     return res;
2322 }
2323 
2324 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2325 {
2326     int64_t res = a + b;
2327     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2328         res = a > 0 ? INT64_MAX : INT64_MIN;
2329         env->vxsat = 0x1;
2330     }
2331     return res;
2332 }
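
/*
 * The signed saturating adds use the classic sign-bit test:
 * (res ^ a) & (res ^ b) has its sign bit set exactly when a and b have the
 * same sign but res has the opposite sign, i.e. when the addition overflowed.
 * For example, with int8_t, 100 + 50 wraps to -106; both (res ^ a) and
 * (res ^ b) are negative, so the result saturates to INT8_MAX.
 */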
2333 
2334 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2335 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2336 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2337 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2338 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2339 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2340 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2341 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2342 
2343 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2344 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2345 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2346 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2347 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2348 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2349 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2350 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2351 
2352 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2353 {
2354     uint8_t res = a - b;
2355     if (res > a) {
2356         res = 0;
2357         env->vxsat = 0x1;
2358     }
2359     return res;
2360 }
2361 
2362 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2363                                uint16_t b)
2364 {
2365     uint16_t res = a - b;
2366     if (res > a) {
2367         res = 0;
2368         env->vxsat = 0x1;
2369     }
2370     return res;
2371 }
2372 
2373 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2374                                uint32_t b)
2375 {
2376     uint32_t res = a - b;
2377     if (res > a) {
2378         res = 0;
2379         env->vxsat = 0x1;
2380     }
2381     return res;
2382 }
2383 
2384 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2385                                uint64_t b)
2386 {
2387     uint64_t res = a - b;
2388     if (res > a) {
2389         res = 0;
2390         env->vxsat = 0x1;
2391     }
2392     return res;
2393 }
2394 
2395 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2396 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2397 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2398 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2399 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2400 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2401 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2402 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2403 
2404 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2405 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2406 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2407 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2408 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2409 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2410 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2411 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2412 
2413 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2414 {
2415     int8_t res = a - b;
2416     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2417         res = a >= 0 ? INT8_MAX : INT8_MIN;
2418         env->vxsat = 0x1;
2419     }
2420     return res;
2421 }
2422 
2423 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2424 {
2425     int16_t res = a - b;
2426     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2427         res = a >= 0 ? INT16_MAX : INT16_MIN;
2428         env->vxsat = 0x1;
2429     }
2430     return res;
2431 }
2432 
2433 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2434 {
2435     int32_t res = a - b;
2436     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2437         res = a >= 0 ? INT32_MAX : INT32_MIN;
2438         env->vxsat = 0x1;
2439     }
2440     return res;
2441 }
2442 
2443 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2444 {
2445     int64_t res = a - b;
2446     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2447         res = a >= 0 ? INT64_MAX : INT64_MIN;
2448         env->vxsat = 0x1;
2449     }
2450     return res;
2451 }
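
/*
 * For subtraction the overflow test is (res ^ a) & (a ^ b): overflow can only
 * happen when a and b have different signs, and it did happen when the result
 * no longer has the sign of a.  The saturation value follows the sign of a
 * (a >= 0 saturates to the maximum, a < 0 to the minimum).  For example, with
 * int8_t, -100 - 50 wraps to 106 and saturates to INT8_MIN.
 */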
2452 
2453 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2454 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2455 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2456 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2457 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2458 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2459 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2460 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2461 
2462 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2463 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2464 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2465 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2466 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2467 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2468 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2469 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2470 
2471 /* Vector Single-Width Averaging Add and Subtract */
2472 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2473 {
2474     uint8_t d = extract64(v, shift, 1);
2475     uint8_t d1;
2476     uint64_t D1, D2;
2477 
2478     if (shift == 0 || shift > 64) {
2479         return 0;
2480     }
2481 
2482     d1 = extract64(v, shift - 1, 1);
2483     D1 = extract64(v, 0, shift);
2484     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2485         return d1;
2486     } else if (vxrm == 1) { /* round-to-nearest-even */
2487         if (shift > 1) {
2488             D2 = extract64(v, 0, shift - 1);
2489             return d1 & ((D2 != 0) | d);
2490         } else {
2491             return d1 & d;
2492         }
2493     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2494         return !d & (D1 != 0);
2495     }
2496     return 0; /* round-down (truncate) */
2497 }
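
/*
 * Worked example, for illustration: v = 6 (0b110), shift = 2, i.e. rounding
 * 6 / 4 = 1.5.  Here d = bit 2 = 1, d1 = bit 1 = 1, D1 = v[1:0] = 2 and
 * D2 = v[0] = 0, so get_round() returns:
 *
 *     rnu: d1 = 1                     -> (6 >> 2) + 1 = 2
 *     rne: d1 & ((D2 != 0) | d) = 1   -> 2  (ties round to even)
 *     rdn: 0                          -> 1  (truncate)
 *     rod: !d & (D1 != 0) = 0         -> 1  (bit 0 of the result is
 *                                            already 1, i.e. odd)
 */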
2498 
2499 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2500 {
2501     int64_t res = (int64_t)a + b;
2502     uint8_t round = get_round(vxrm, res, 1);
2503 
2504     return (res >> 1) + round;
2505 }
2506 
2507 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2508 {
2509     int64_t res = a + b;
2510     uint8_t round = get_round(vxrm, res, 1);
2511     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2512 
2513     /* With signed overflow, bit 64 is inverse of bit 63. */
2514     return ((res >> 1) ^ over) + round;
2515 }
2516 
2517 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2518 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2519 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2520 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2521 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2522 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2523 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2524 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2525 
2526 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2527 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2528 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2529 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2530 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2531 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2532 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2533 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2534 
2535 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2536                                uint32_t a, uint32_t b)
2537 {
2538     uint64_t res = (uint64_t)a + b;
2539     uint8_t round = get_round(vxrm, res, 1);
2540 
2541     return (res >> 1) + round;
2542 }
2543 
2544 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2545                                uint64_t a, uint64_t b)
2546 {
2547     uint64_t res = a + b;
2548     uint8_t round = get_round(vxrm, res, 1);
2549     uint64_t over = (uint64_t)(res < a) << 63;
2550 
2551     return ((res >> 1) | over) + round;
2552 }
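
/*
 * For the unsigned variant, bit 64 of the full 65-bit sum is the carry out of
 * the 64-bit addition; it is recovered as (res < a) and merged back in as
 * bit 63 after the right shift, so ((res >> 1) | over) is the full sum
 * shifted right by one.
 */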
2553 
2554 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2555 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2556 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2557 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2558 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2559 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2560 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2561 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2562 
2563 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2564 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2565 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2566 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2567 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2568 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2569 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2570 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2571 
2572 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2573 {
2574     int64_t res = (int64_t)a - b;
2575     uint8_t round = get_round(vxrm, res, 1);
2576 
2577     return (res >> 1) + round;
2578 }
2579 
2580 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2581 {
2582     int64_t res = (int64_t)a - b;
2583     uint8_t round = get_round(vxrm, res, 1);
2584     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2585 
2586     /* With signed overflow, bit 64 is inverse of bit 63. */
2587     return ((res >> 1) ^ over) + round;
2588 }
2589 
2590 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2591 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2592 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2593 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2594 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2595 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2596 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2597 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2598 
2599 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2600 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2601 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2602 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2603 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2604 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2605 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2606 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2607 
2608 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2609                                uint32_t a, uint32_t b)
2610 {
2611     int64_t res = (int64_t)a - b;
2612     uint8_t round = get_round(vxrm, res, 1);
2613 
2614     return (res >> 1) + round;
2615 }
2616 
2617 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2618                                uint64_t a, uint64_t b)
2619 {
2620     uint64_t res = (uint64_t)a - b;
2621     uint8_t round = get_round(vxrm, res, 1);
2622     uint64_t over = (uint64_t)(res > a) << 63;
2623 
2624     return ((res >> 1) | over) + round;
2625 }
2626 
2627 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2628 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2629 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2630 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2631 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2632 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2633 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2634 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2635 
2636 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2637 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2638 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2639 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2640 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2641 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2642 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2643 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2644 
2645 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
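/*
 * vsmul computes (vs2[i] * vs1[i]) >> (SEW - 1), rounded according to
 * vxrm.  The product is formed at twice the element width; results that
 * do not fit the signed destination range (notably
 * (-2^(SEW-1)) * (-2^(SEW-1))) are saturated and vxsat is set.
 */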
2646 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2647 {
2648     uint8_t round;
2649     int16_t res;
2650 
2651     res = (int16_t)a * (int16_t)b;
2652     round = get_round(vxrm, res, 7);
2653     res = (res >> 7) + round;
2654 
2655     if (res > INT8_MAX) {
2656         env->vxsat = 0x1;
2657         return INT8_MAX;
2658     } else if (res < INT8_MIN) {
2659         env->vxsat = 0x1;
2660         return INT8_MIN;
2661     } else {
2662         return res;
2663     }
2664 }
2665 
2666 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2667 {
2668     uint8_t round;
2669     int32_t res;
2670 
2671     res = (int32_t)a * (int32_t)b;
2672     round = get_round(vxrm, res, 15);
2673     res = (res >> 15) + round;
2674 
2675     if (res > INT16_MAX) {
2676         env->vxsat = 0x1;
2677         return INT16_MAX;
2678     } else if (res < INT16_MIN) {
2679         env->vxsat = 0x1;
2680         return INT16_MIN;
2681     } else {
2682         return res;
2683     }
2684 }
2685 
2686 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2687 {
2688     uint8_t round;
2689     int64_t res;
2690 
2691     res = (int64_t)a * (int64_t)b;
2692     round = get_round(vxrm, res, 31);
2693     res = (res >> 31) + round;
2694 
2695     if (res > INT32_MAX) {
2696         env->vxsat = 0x1;
2697         return INT32_MAX;
2698     } else if (res < INT32_MIN) {
2699         env->vxsat = 0x1;
2700         return INT32_MIN;
2701     } else {
2702         return res;
2703     }
2704 }
2705 
2706 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2707 {
2708     uint8_t round;
2709     uint64_t hi_64, lo_64;
2710     int64_t res;
2711 
2712     if (a == INT64_MIN && b == INT64_MIN) {
2713         env->vxsat = 1;
2714         return INT64_MAX;
2715     }
2716 
2717     muls64(&lo_64, &hi_64, a, b);
2718     round = get_round(vxrm, lo_64, 63);
2719     /*
2720      * Cannot overflow, as there are always
2721      * 2 sign bits after multiply.
2722      */
2723     res = (hi_64 << 1) | (lo_64 >> 63);
2724     if (round) {
2725         if (res == INT64_MAX) {
2726             env->vxsat = 1;
2727         } else {
2728             res += 1;
2729         }
2730     }
2731     return res;
2732 }
2733 
2734 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2735 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2736 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2737 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2738 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2739 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2740 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2741 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2742 
2743 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2744 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2745 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2746 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2747 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2748 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2749 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2750 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2751 
2752 /* Vector Single-Width Scaling Shift Instructions */
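/*
 * Only the low log2(SEW) bits of the shift operand are used, and the bits
 * shifted out are folded back in through get_round(), so the result is a
 * rounded rather than truncated shift.
 */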
2753 static inline uint8_t
2754 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2755 {
2756     uint8_t round, shift = b & 0x7;
2757     uint8_t res;
2758 
2759     round = get_round(vxrm, a, shift);
2760     res = (a >> shift) + round;
2761     return res;
2762 }
2763 static inline uint16_t
2764 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2765 {
2766     uint8_t round, shift = b & 0xf;
2767 
2768     round = get_round(vxrm, a, shift);
2769     return (a >> shift) + round;
2770 }
2771 static inline uint32_t
2772 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2773 {
2774     uint8_t round, shift = b & 0x1f;
2775 
2776     round = get_round(vxrm, a, shift);
2777     return (a >> shift) + round;
2778 }
2779 static inline uint64_t
2780 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2781 {
2782     uint8_t round, shift = b & 0x3f;
2783 
2784     round = get_round(vxrm, a, shift);
2785     return (a >> shift) + round;
2786 }
2787 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2788 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2789 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2790 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2791 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2792 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2793 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2794 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2795 
2796 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2797 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2798 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2799 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2800 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2801 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2802 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2803 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2804 
2805 static inline int8_t
2806 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2807 {
2808     uint8_t round, shift = b & 0x7;
2809 
2810     round = get_round(vxrm, a, shift);
2811     return (a >> shift) + round;
2812 }
2813 static inline int16_t
2814 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2815 {
2816     uint8_t round, shift = b & 0xf;
2817 
2818     round = get_round(vxrm, a, shift);
2819     return (a >> shift) + round;
2820 }
2821 static inline int32_t
2822 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2823 {
2824     uint8_t round, shift = b & 0x1f;
2825 
2826     round = get_round(vxrm, a, shift);
2827     return (a >> shift) + round;
2828 }
2829 static inline int64_t
2830 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2831 {
2832     uint8_t round, shift = b & 0x3f;
2833 
2834     round = get_round(vxrm, a, shift);
2835     return (a >> shift) + round;
2836 }
2837 
2838 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2839 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2840 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2841 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2842 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2843 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2844 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2845 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2846 
2847 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2848 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2849 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2850 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2851 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2852 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2853 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2854 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2855 
2856 /* Vector Narrowing Fixed-Point Clip Instructions */
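/*
 * The narrowing clips shift a 2*SEW-wide source element right by the low
 * log2(2*SEW) bits of the shift operand, round the discarded bits per
 * vxrm, then saturate into the SEW-wide destination, setting vxsat
 * whenever clipping occurs.
 */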
2857 static inline int8_t
2858 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2859 {
2860     uint8_t round, shift = b & 0xf;
2861     int16_t res;
2862 
2863     round = get_round(vxrm, a, shift);
2864     res = (a >> shift) + round;
2865     if (res > INT8_MAX) {
2866         env->vxsat = 0x1;
2867         return INT8_MAX;
2868     } else if (res < INT8_MIN) {
2869         env->vxsat = 0x1;
2870         return INT8_MIN;
2871     } else {
2872         return res;
2873     }
2874 }
2875 
2876 static inline int16_t
2877 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2878 {
2879     uint8_t round, shift = b & 0x1f;
2880     int32_t res;
2881 
2882     round = get_round(vxrm, a, shift);
2883     res = (a >> shift) + round;
2884     if (res > INT16_MAX) {
2885         env->vxsat = 0x1;
2886         return INT16_MAX;
2887     } else if (res < INT16_MIN) {
2888         env->vxsat = 0x1;
2889         return INT16_MIN;
2890     } else {
2891         return res;
2892     }
2893 }
2894 
2895 static inline int32_t
2896 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2897 {
2898     uint8_t round, shift = b & 0x3f;
2899     int64_t res;
2900 
2901     round = get_round(vxrm, a, shift);
2902     res = (a >> shift) + round;
2903     if (res > INT32_MAX) {
2904         env->vxsat = 0x1;
2905         return INT32_MAX;
2906     } else if (res < INT32_MIN) {
2907         env->vxsat = 0x1;
2908         return INT32_MIN;
2909     } else {
2910         return res;
2911     }
2912 }
2913 
2914 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2915 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2916 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2917 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2918 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2919 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2920 
2921 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2922 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2923 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2924 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2925 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2926 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2927 
2928 static inline uint8_t
2929 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2930 {
2931     uint8_t round, shift = b & 0xf;
2932     uint16_t res;
2933 
2934     round = get_round(vxrm, a, shift);
2935     res = (a >> shift) + round;
2936     if (res > UINT8_MAX) {
2937         env->vxsat = 0x1;
2938         return UINT8_MAX;
2939     } else {
2940         return res;
2941     }
2942 }
2943 
2944 static inline uint16_t
2945 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2946 {
2947     uint8_t round, shift = b & 0x1f;
2948     uint32_t res;
2949 
2950     round = get_round(vxrm, a, shift);
2951     res = (a >> shift) + round;
2952     if (res > UINT16_MAX) {
2953         env->vxsat = 0x1;
2954         return UINT16_MAX;
2955     } else {
2956         return res;
2957     }
2958 }
2959 
2960 static inline uint32_t
2961 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2962 {
2963     uint8_t round, shift = b & 0x3f;
2964     uint64_t res;
2965 
2966     round = get_round(vxrm, a, shift);
2967     res = (a >> shift) + round;
2968     if (res > UINT32_MAX) {
2969         env->vxsat = 0x1;
2970         return UINT32_MAX;
2971     } else {
2972         return res;
2973     }
2974 }
2975 
2976 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2977 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2978 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2979 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2980 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2981 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2982 
2983 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2984 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2985 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2986 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2987 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2988 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2989 
2990 /*
2991  *** Vector Floating-Point Arithmetic Instructions
2992  */
2993 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2994 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2995 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2996                       CPURISCVState *env)                      \
2997 {                                                              \
2998     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2999     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3000     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3001 }
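
/*
 * Note the argument order: OP(s2, s1, ...) so that vd[i] = vs2[i] OP
 * vs1[i], matching the vop.vv vd, vs2, vs1 operand order.  For reference,
 * RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add) below
 * expands to roughly:
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 */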
3002 
3003 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3004 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3005                   void *vs2, CPURISCVState *env,          \
3006                   uint32_t desc)                          \
3007 {                                                         \
3008     uint32_t vm = vext_vm(desc);                          \
3009     uint32_t vl = env->vl;                                \
3010     uint32_t total_elems =                                \
3011         vext_get_total_elems(env, desc, ESZ);             \
3012     uint32_t vta = vext_vta(desc);                        \
3013     uint32_t vma = vext_vma(desc);                        \
3014     uint32_t i;                                           \
3015                                                           \
3016     for (i = env->vstart; i < vl; i++) {                  \
3017         if (!vm && !vext_elem_mask(v0, i)) {              \
3018             /* set masked-off elements to 1s */           \
3019             vext_set_elems_1s(vd, vma, i * ESZ,           \
3020                               (i + 1) * ESZ);             \
3021             continue;                                     \
3022         }                                                 \
3023         do_##NAME(vd, vs1, vs2, i, env);                  \
3024     }                                                     \
3025     env->vstart = 0;                                      \
3026     /* set tail elements to 1s */                         \
3027     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3028                       total_elems * ESZ);                 \
3029 }
3030 
3031 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3032 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3033 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3034 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3035 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3036 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3037 
3038 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3039 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3040                       CPURISCVState *env)                      \
3041 {                                                              \
3042     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3043     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3044 }
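
/*
 * In the vector-scalar forms the scalar arrives as a 64-bit value; the
 * (TX1)(T1) cast truncates it to the source element type before it is
 * passed to the op.
 */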
3045 
3046 #define GEN_VEXT_VF(NAME, ESZ)                            \
3047 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3048                   void *vs2, CPURISCVState *env,          \
3049                   uint32_t desc)                          \
3050 {                                                         \
3051     uint32_t vm = vext_vm(desc);                          \
3052     uint32_t vl = env->vl;                                \
3053     uint32_t total_elems =                                \
3054         vext_get_total_elems(env, desc, ESZ);             \
3055     uint32_t vta = vext_vta(desc);                        \
3056     uint32_t vma = vext_vma(desc);                        \
3057     uint32_t i;                                           \
3058                                                           \
3059     for (i = env->vstart; i < vl; i++) {                  \
3060         if (!vm && !vext_elem_mask(v0, i)) {              \
3061             /* set masked-off elements to 1s */           \
3062             vext_set_elems_1s(vd, vma, i * ESZ,           \
3063                               (i + 1) * ESZ);             \
3064             continue;                                     \
3065         }                                                 \
3066         do_##NAME(vd, s1, vs2, i, env);                   \
3067     }                                                     \
3068     env->vstart = 0;                                      \
3069     /* set tail elements to 1s */                         \
3070     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3071                       total_elems * ESZ);                 \
3072 }
3073 
3074 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3075 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3076 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3077 GEN_VEXT_VF(vfadd_vf_h, 2)
3078 GEN_VEXT_VF(vfadd_vf_w, 4)
3079 GEN_VEXT_VF(vfadd_vf_d, 8)
3080 
3081 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3082 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3083 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3084 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3085 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3086 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3087 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3088 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3089 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3090 GEN_VEXT_VF(vfsub_vf_h, 2)
3091 GEN_VEXT_VF(vfsub_vf_w, 4)
3092 GEN_VEXT_VF(vfsub_vf_d, 8)
3093 
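/*
 * Reverse subtract: the operands are swapped so that vfrsub.vf computes
 * f[rs1] - vs2[i] rather than vs2[i] - f[rs1].
 */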
3094 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3095 {
3096     return float16_sub(b, a, s);
3097 }
3098 
3099 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3100 {
3101     return float32_sub(b, a, s);
3102 }
3103 
3104 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3105 {
3106     return float64_sub(b, a, s);
3107 }
3108 
3109 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3110 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3111 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3112 GEN_VEXT_VF(vfrsub_vf_h, 2)
3113 GEN_VEXT_VF(vfrsub_vf_w, 4)
3114 GEN_VEXT_VF(vfrsub_vf_d, 8)
3115 
3116 /* Vector Widening Floating-Point Add/Subtract Instructions */
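/*
 * The widening forms convert each SEW-wide operand to 2*SEW before
 * operating, so the arithmetic itself is carried out at the wider
 * precision; the .wv/.wf variants below take a first operand that is
 * already 2*SEW wide.
 */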
3117 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3118 {
3119     return float32_add(float16_to_float32(a, true, s),
3120                        float16_to_float32(b, true, s), s);
3121 }
3122 
3123 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3124 {
3125     return float64_add(float32_to_float64(a, s),
3126                        float32_to_float64(b, s), s);
3128 }
3129 
3130 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3131 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3132 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3133 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3134 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3135 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3136 GEN_VEXT_VF(vfwadd_vf_h, 4)
3137 GEN_VEXT_VF(vfwadd_vf_w, 8)
3138 
3139 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3140 {
3141     return float32_sub(float16_to_float32(a, true, s),
3142                        float16_to_float32(b, true, s), s);
3143 }
3144 
3145 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3146 {
3147     return float64_sub(float32_to_float64(a, s),
3148                        float32_to_float64(b, s), s);
3150 }
3151 
3152 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3153 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3154 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3155 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3156 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3157 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3158 GEN_VEXT_VF(vfwsub_vf_h, 4)
3159 GEN_VEXT_VF(vfwsub_vf_w, 8)
3160 
3161 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3162 {
3163     return float32_add(a, float16_to_float32(b, true, s), s);
3164 }
3165 
3166 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3167 {
3168     return float64_add(a, float32_to_float64(b, s), s);
3169 }
3170 
3171 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3172 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3173 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3174 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3175 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3176 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3177 GEN_VEXT_VF(vfwadd_wf_h, 4)
3178 GEN_VEXT_VF(vfwadd_wf_w, 8)
3179 
3180 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3181 {
3182     return float32_sub(a, float16_to_float32(b, true, s), s);
3183 }
3184 
3185 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3186 {
3187     return float64_sub(a, float32_to_float64(b, s), s);
3188 }
3189 
3190 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3191 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3192 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3193 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3194 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3195 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3196 GEN_VEXT_VF(vfwsub_wf_h, 4)
3197 GEN_VEXT_VF(vfwsub_wf_w, 8)
3198 
3199 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3200 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3201 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3202 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3203 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3204 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3205 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3206 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3207 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3208 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3209 GEN_VEXT_VF(vfmul_vf_h, 2)
3210 GEN_VEXT_VF(vfmul_vf_w, 4)
3211 GEN_VEXT_VF(vfmul_vf_d, 8)
3212 
3213 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3214 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3215 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3216 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3217 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3218 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3219 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3220 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3221 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3222 GEN_VEXT_VF(vfdiv_vf_h, 2)
3223 GEN_VEXT_VF(vfdiv_vf_w, 4)
3224 GEN_VEXT_VF(vfdiv_vf_d, 8)
3225 
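/*
 * Reverse divide: as with vfrsub, the operands are swapped so that
 * vfrdiv.vf computes f[rs1] / vs2[i].
 */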
3226 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3227 {
3228     return float16_div(b, a, s);
3229 }
3230 
3231 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3232 {
3233     return float32_div(b, a, s);
3234 }
3235 
3236 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3237 {
3238     return float64_div(b, a, s);
3239 }
3240 
3241 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3242 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3243 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3244 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3245 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3246 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3247 
3248 /* Vector Widening Floating-Point Multiply */
3249 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3250 {
3251     return float32_mul(float16_to_float32(a, true, s),
3252                        float16_to_float32(b, true, s), s);
3253 }
3254 
3255 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3256 {
3257     return float64_mul(float32_to_float64(a, s),
3258                        float32_to_float64(b, s), s);
3260 }
3261 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3262 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3263 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3264 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3265 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3266 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3267 GEN_VEXT_VF(vfwmul_vf_h, 4)
3268 GEN_VEXT_VF(vfwmul_vf_w, 8)
3269 
3270 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
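/*
 * Operand roles: in the *macc/*msac forms the two sources are multiplied
 * and vd is the addend (vd = +/-(vs1 * vs2) +/- vd); in the *madd/*msub
 * forms vd is multiplied by vs1 (or f[rs1]) and vs2 is the addend.  The
 * float_muladd_negate_* flags select which of the product and the addend
 * are negated.
 */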
3271 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3272 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3273                       CPURISCVState *env)                          \
3274 {                                                                  \
3275     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3276     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3277     TD d = *((TD *)vd + HD(i));                                    \
3278     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3279 }
3280 
3281 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3282 {
3283     return float16_muladd(a, b, d, 0, s);
3284 }
3285 
3286 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3287 {
3288     return float32_muladd(a, b, d, 0, s);
3289 }
3290 
3291 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3292 {
3293     return float64_muladd(a, b, d, 0, s);
3294 }
3295 
3296 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3297 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3298 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3299 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3300 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3301 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3302 
3303 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3304 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3305                       CPURISCVState *env)                         \
3306 {                                                                 \
3307     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3308     TD d = *((TD *)vd + HD(i));                                   \
3309     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3310 }
3311 
3312 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3313 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3314 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3315 GEN_VEXT_VF(vfmacc_vf_h, 2)
3316 GEN_VEXT_VF(vfmacc_vf_w, 4)
3317 GEN_VEXT_VF(vfmacc_vf_d, 8)
3318 
3319 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3320 {
3321     return float16_muladd(a, b, d, float_muladd_negate_c |
3322                                    float_muladd_negate_product, s);
3323 }
3324 
3325 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3326 {
3327     return float32_muladd(a, b, d, float_muladd_negate_c |
3328                                    float_muladd_negate_product, s);
3329 }
3330 
3331 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3332 {
3333     return float64_muladd(a, b, d, float_muladd_negate_c |
3334                                    float_muladd_negate_product, s);
3335 }
3336 
3337 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3338 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3339 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3340 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3341 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3342 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3343 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3344 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3345 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3346 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3347 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3348 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3349 
3350 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3351 {
3352     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3353 }
3354 
3355 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3356 {
3357     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3358 }
3359 
3360 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3361 {
3362     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3363 }
3364 
3365 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3366 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3367 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3368 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3369 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3370 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3371 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3372 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3373 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3374 GEN_VEXT_VF(vfmsac_vf_h, 2)
3375 GEN_VEXT_VF(vfmsac_vf_w, 4)
3376 GEN_VEXT_VF(vfmsac_vf_d, 8)
3377 
3378 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3379 {
3380     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3381 }
3382 
3383 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3384 {
3385     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3386 }
3387 
3388 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3389 {
3390     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3391 }
3392 
3393 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3394 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3395 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3396 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3397 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3398 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3399 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3400 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3401 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3402 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3403 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3404 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3405 
3406 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3407 {
3408     return float16_muladd(d, b, a, 0, s);
3409 }
3410 
3411 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3412 {
3413     return float32_muladd(d, b, a, 0, s);
3414 }
3415 
3416 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3417 {
3418     return float64_muladd(d, b, a, 0, s);
3419 }
3420 
3421 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3422 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3423 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3424 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3425 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3426 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3427 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3428 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3429 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3430 GEN_VEXT_VF(vfmadd_vf_h, 2)
3431 GEN_VEXT_VF(vfmadd_vf_w, 4)
3432 GEN_VEXT_VF(vfmadd_vf_d, 8)
3433 
3434 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3435 {
3436     return float16_muladd(d, b, a, float_muladd_negate_c |
3437                                    float_muladd_negate_product, s);
3438 }
3439 
3440 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3441 {
3442     return float32_muladd(d, b, a, float_muladd_negate_c |
3443                                    float_muladd_negate_product, s);
3444 }
3445 
3446 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3447 {
3448     return float64_muladd(d, b, a, float_muladd_negate_c |
3449                                    float_muladd_negate_product, s);
3450 }
3451 
3452 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3453 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3454 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3455 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3456 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3457 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3458 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3459 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3460 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3461 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3462 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3463 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3464 
3465 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3466 {
3467     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3468 }
3469 
3470 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3471 {
3472     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3473 }
3474 
3475 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3476 {
3477     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3478 }
3479 
3480 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3481 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3482 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3483 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3484 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3485 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3486 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3487 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3488 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3489 GEN_VEXT_VF(vfmsub_vf_h, 2)
3490 GEN_VEXT_VF(vfmsub_vf_w, 4)
3491 GEN_VEXT_VF(vfmsub_vf_d, 8)
3492 
3493 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3494 {
3495     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3496 }
3497 
3498 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3499 {
3500     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3501 }
3502 
3503 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3504 {
3505     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3506 }
3507 
3508 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3509 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3510 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3511 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3512 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3513 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3514 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3515 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3516 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3517 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3518 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3519 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3520 
3521 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3522 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3523 {
3524     return float32_muladd(float16_to_float32(a, true, s),
3525                           float16_to_float32(b, true, s), d, 0, s);
3526 }
3527 
3528 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3529 {
3530     return float64_muladd(float32_to_float64(a, s),
3531                           float32_to_float64(b, s), d, 0, s);
3532 }
3533 
3534 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3535 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3536 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3537 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3538 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3539 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3540 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3541 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3542 
3543 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3544 {
3545     return float32_muladd(float16_to_float32(a, true, s),
3546                           float16_to_float32(b, true, s), d,
3547                           float_muladd_negate_c | float_muladd_negate_product,
3548                           s);
3549 }
3550 
3551 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3552 {
3553     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3554                           d, float_muladd_negate_c |
3555                              float_muladd_negate_product, s);
3556 }
3557 
3558 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3559 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3560 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3561 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3562 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3563 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3564 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3565 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3566 
3567 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3568 {
3569     return float32_muladd(float16_to_float32(a, true, s),
3570                           float16_to_float32(b, true, s), d,
3571                           float_muladd_negate_c, s);
3572 }
3573 
3574 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3575 {
3576     return float64_muladd(float32_to_float64(a, s),
3577                           float32_to_float64(b, s), d,
3578                           float_muladd_negate_c, s);
3579 }
3580 
3581 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3582 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3583 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3584 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3585 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3586 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3587 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3588 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3589 
3590 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3591 {
3592     return float32_muladd(float16_to_float32(a, true, s),
3593                           float16_to_float32(b, true, s), d,
3594                           float_muladd_negate_product, s);
3595 }
3596 
3597 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3598 {
3599     return float64_muladd(float32_to_float64(a, s),
3600                           float32_to_float64(b, s), d,
3601                           float_muladd_negate_product, s);
3602 }
3603 
3604 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3605 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3606 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3607 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3608 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3609 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3610 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3611 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3612 
3613 /* Vector Floating-Point Square-Root Instruction */
3614 /* (TD, T2, TX2) */
3615 #define OP_UU_H uint16_t, uint16_t, uint16_t
3616 #define OP_UU_W uint32_t, uint32_t, uint32_t
3617 #define OP_UU_D uint64_t, uint64_t, uint64_t
3618 
3619 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3620 static void do_##NAME(void *vd, void *vs2, int i,      \
3621                       CPURISCVState *env)              \
3622 {                                                      \
3623     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3624     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3625 }
3626 
3627 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3628 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3629                   CPURISCVState *env, uint32_t desc)   \
3630 {                                                      \
3631     uint32_t vm = vext_vm(desc);                       \
3632     uint32_t vl = env->vl;                             \
3633     uint32_t total_elems =                             \
3634         vext_get_total_elems(env, desc, ESZ);          \
3635     uint32_t vta = vext_vta(desc);                     \
3636     uint32_t vma = vext_vma(desc);                     \
3637     uint32_t i;                                        \
3638                                                        \
3639     if (vl == 0) {                                     \
3640         return;                                        \
3641     }                                                  \
3642     for (i = env->vstart; i < vl; i++) {               \
3643         if (!vm && !vext_elem_mask(v0, i)) {           \
3644             /* set masked-off elements to 1s */        \
3645             vext_set_elems_1s(vd, vma, i * ESZ,        \
3646                               (i + 1) * ESZ);          \
3647             continue;                                  \
3648         }                                              \
3649         do_##NAME(vd, vs2, i, env);                    \
3650     }                                                  \
3651     env->vstart = 0;                                   \
3652     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3653                       total_elems * ESZ);              \
3654 }
3655 
3656 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3657 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3658 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3659 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3660 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3661 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3662 
3663 /*
3664  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3665  *
3666  * Adapted from riscv-v-spec recip.c:
3667  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3668  */
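/*
 * The estimate is driven by a 128-entry table indexed by the low bit of
 * the (normalized) exponent concatenated with the top six bits of the
 * significand; the 7-bit table value becomes the top of the result
 * significand and the result exponent is effectively
 * (3 * bias - 1 - exp) / 2.
 */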
3669 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3670 {
3671     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3672     uint64_t exp = extract64(f, frac_size, exp_size);
3673     uint64_t frac = extract64(f, 0, frac_size);
3674 
3675     const uint8_t lookup_table[] = {
3676         52, 51, 50, 48, 47, 46, 44, 43,
3677         42, 41, 40, 39, 38, 36, 35, 34,
3678         33, 32, 31, 30, 30, 29, 28, 27,
3679         26, 25, 24, 23, 23, 22, 21, 20,
3680         19, 19, 18, 17, 16, 16, 15, 14,
3681         14, 13, 12, 12, 11, 10, 10, 9,
3682         9, 8, 7, 7, 6, 6, 5, 4,
3683         4, 3, 3, 2, 2, 1, 1, 0,
3684         127, 125, 123, 121, 119, 118, 116, 114,
3685         113, 111, 109, 108, 106, 105, 103, 102,
3686         100, 99, 97, 96, 95, 93, 92, 91,
3687         90, 88, 87, 86, 85, 84, 83, 82,
3688         80, 79, 78, 77, 76, 75, 74, 73,
3689         72, 71, 70, 70, 69, 68, 67, 66,
3690         65, 64, 63, 63, 62, 61, 60, 59,
3691         59, 58, 57, 56, 56, 55, 54, 53
3692     };
3693     const int precision = 7;
3694 
3695     if (exp == 0 && frac != 0) { /* subnormal */
3696         /* Normalize the subnormal. */
3697         while (extract64(frac, frac_size - 1, 1) == 0) {
3698             exp--;
3699             frac <<= 1;
3700         }
3701 
3702         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3703     }
3704 
3705     int idx = ((exp & 1) << (precision - 1)) |
3706               (frac >> (frac_size - precision + 1));
3707     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3708                         (frac_size - precision);
3709     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3710 
3711     uint64_t val = 0;
3712     val = deposit64(val, 0, frac_size, out_frac);
3713     val = deposit64(val, frac_size, exp_size, out_exp);
3714     val = deposit64(val, frac_size + exp_size, 1, sign);
3715     return val;
3716 }
3717 
3718 static float16 frsqrt7_h(float16 f, float_status *s)
3719 {
3720     int exp_size = 5, frac_size = 10;
3721     bool sign = float16_is_neg(f);
3722 
3723     /*
3724      * frsqrt7(sNaN) = canonical NaN
3725      * frsqrt7(-inf) = canonical NaN
3726      * frsqrt7(-normal) = canonical NaN
3727      * frsqrt7(-subnormal) = canonical NaN
3728      */
3729     if (float16_is_signaling_nan(f, s) ||
3730         (float16_is_infinity(f) && sign) ||
3731         (float16_is_normal(f) && sign) ||
3732         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3733         s->float_exception_flags |= float_flag_invalid;
3734         return float16_default_nan(s);
3735     }
3736 
3737     /* frsqrt7(qNaN) = canonical NaN */
3738     if (float16_is_quiet_nan(f, s)) {
3739         return float16_default_nan(s);
3740     }
3741 
3742     /* frsqrt7(+-0) = +-inf */
3743     if (float16_is_zero(f)) {
3744         s->float_exception_flags |= float_flag_divbyzero;
3745         return float16_set_sign(float16_infinity, sign);
3746     }
3747 
3748     /* frsqrt7(+inf) = +0 */
3749     if (float16_is_infinity(f) && !sign) {
3750         return float16_set_sign(float16_zero, sign);
3751     }
3752 
3753     /* +normal, +subnormal */
3754     uint64_t val = frsqrt7(f, exp_size, frac_size);
3755     return make_float16(val);
3756 }
3757 
3758 static float32 frsqrt7_s(float32 f, float_status *s)
3759 {
3760     int exp_size = 8, frac_size = 23;
3761     bool sign = float32_is_neg(f);
3762 
3763     /*
3764      * frsqrt7(sNaN) = canonical NaN
3765      * frsqrt7(-inf) = canonical NaN
3766      * frsqrt7(-normal) = canonical NaN
3767      * frsqrt7(-subnormal) = canonical NaN
3768      */
3769     if (float32_is_signaling_nan(f, s) ||
3770         (float32_is_infinity(f) && sign) ||
3771         (float32_is_normal(f) && sign) ||
3772         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3773         s->float_exception_flags |= float_flag_invalid;
3774         return float32_default_nan(s);
3775     }
3776 
3777     /* frsqrt7(qNaN) = canonical NaN */
3778     if (float32_is_quiet_nan(f, s)) {
3779         return float32_default_nan(s);
3780     }
3781 
3782     /* frsqrt7(+-0) = +-inf */
3783     if (float32_is_zero(f)) {
3784         s->float_exception_flags |= float_flag_divbyzero;
3785         return float32_set_sign(float32_infinity, sign);
3786     }
3787 
3788     /* frsqrt7(+inf) = +0 */
3789     if (float32_is_infinity(f) && !sign) {
3790         return float32_set_sign(float32_zero, sign);
3791     }
3792 
3793     /* +normal, +subnormal */
3794     uint64_t val = frsqrt7(f, exp_size, frac_size);
3795     return make_float32(val);
3796 }
3797 
3798 static float64 frsqrt7_d(float64 f, float_status *s)
3799 {
3800     int exp_size = 11, frac_size = 52;
3801     bool sign = float64_is_neg(f);
3802 
3803     /*
3804      * frsqrt7(sNaN) = canonical NaN
3805      * frsqrt7(-inf) = canonical NaN
3806      * frsqrt7(-normal) = canonical NaN
3807      * frsqrt7(-subnormal) = canonical NaN
3808      */
3809     if (float64_is_signaling_nan(f, s) ||
3810         (float64_is_infinity(f) && sign) ||
3811         (float64_is_normal(f) && sign) ||
3812         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3813         s->float_exception_flags |= float_flag_invalid;
3814         return float64_default_nan(s);
3815     }
3816 
3817     /* frsqrt7(qNaN) = canonical NaN */
3818     if (float64_is_quiet_nan(f, s)) {
3819         return float64_default_nan(s);
3820     }
3821 
3822     /* frsqrt7(+-0) = +-inf */
3823     if (float64_is_zero(f)) {
3824         s->float_exception_flags |= float_flag_divbyzero;
3825         return float64_set_sign(float64_infinity, sign);
3826     }
3827 
3828     /* frsqrt7(+inf) = +0 */
3829     if (float64_is_infinity(f) && !sign) {
3830         return float64_set_sign(float64_zero, sign);
3831     }
3832 
3833     /* +normal, +subnormal */
3834     uint64_t val = frsqrt7(f, exp_size, frac_size);
3835     return make_float64(val);
3836 }
3837 
3838 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3839 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3840 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3841 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3842 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3843 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3844 
3845 /*
3846  * Vector Floating-Point Reciprocal Estimate Instruction
3847  *
3848  * Adapted from riscv-v-spec recip.c:
3849  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3850  */
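/*
 * The estimate is driven by a 128-entry table indexed by the top seven
 * bits of the significand; the result exponent is effectively
 * 2 * bias - 1 - exp.  Results whose exponent falls outside the normal
 * range are denormalized, and subnormal inputs whose reciprocal would
 * overflow return either +-inf or the largest finite value of the same
 * sign, depending on the rounding mode.
 */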
3851 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3852                       float_status *s)
3853 {
3854     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3855     uint64_t exp = extract64(f, frac_size, exp_size);
3856     uint64_t frac = extract64(f, 0, frac_size);
3857 
3858     const uint8_t lookup_table[] = {
3859         127, 125, 123, 121, 119, 117, 116, 114,
3860         112, 110, 109, 107, 105, 104, 102, 100,
3861         99, 97, 96, 94, 93, 91, 90, 88,
3862         87, 85, 84, 83, 81, 80, 79, 77,
3863         76, 75, 74, 72, 71, 70, 69, 68,
3864         66, 65, 64, 63, 62, 61, 60, 59,
3865         58, 57, 56, 55, 54, 53, 52, 51,
3866         50, 49, 48, 47, 46, 45, 44, 43,
3867         42, 41, 40, 40, 39, 38, 37, 36,
3868         35, 35, 34, 33, 32, 31, 31, 30,
3869         29, 28, 28, 27, 26, 25, 25, 24,
3870         23, 23, 22, 21, 21, 20, 19, 19,
3871         18, 17, 17, 16, 15, 15, 14, 14,
3872         13, 12, 12, 11, 11, 10, 9, 9,
3873         8, 8, 7, 7, 6, 5, 5, 4,
3874         4, 3, 3, 2, 2, 1, 1, 0
3875     };
3876     const int precision = 7;
3877 
3878     if (exp == 0 && frac != 0) { /* subnormal */
3879         /* Normalize the subnormal. */
3880         while (extract64(frac, frac_size - 1, 1) == 0) {
3881             exp--;
3882             frac <<= 1;
3883         }
3884 
3885         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3886 
3887         if (exp != 0 && exp != UINT64_MAX) {
3888             /*
3889              * Overflow to inf or max value of same sign,
3890              * depending on sign and rounding mode.
3891              */
3892             s->float_exception_flags |= (float_flag_inexact |
3893                                          float_flag_overflow);
3894 
3895             if ((s->float_rounding_mode == float_round_to_zero) ||
3896                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3897                 ((s->float_rounding_mode == float_round_up) && sign)) {
3898                 /* Return the largest finite value with the same sign. */
3899                 return (sign << (exp_size + frac_size)) |
3900                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3901             } else {
3902                 /* Return +-inf. */
3903                 return (sign << (exp_size + frac_size)) |
3904                        MAKE_64BIT_MASK(frac_size, exp_size);
3905             }
3906         }
3907     }
3908 
3909     int idx = frac >> (frac_size - precision);
3910     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3911                         (frac_size - precision);
3912     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3913 
3914     if (out_exp == 0 || out_exp == UINT64_MAX) {
3915         /*
3916          * The result is subnormal, but don't raise the underflow exception,
3917          * because there's no additional loss of precision.
3918          */
3919         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3920         if (out_exp == UINT64_MAX) {
3921             out_frac >>= 1;
3922             out_exp = 0;
3923         }
3924     }
3925 
3926     uint64_t val = 0;
3927     val = deposit64(val, 0, frac_size, out_frac);
3928     val = deposit64(val, frac_size, exp_size, out_exp);
3929     val = deposit64(val, frac_size + exp_size, 1, sign);
3930     return val;
3931 }
3932 
3933 static float16 frec7_h(float16 f, float_status *s)
3934 {
3935     int exp_size = 5, frac_size = 10;
3936     bool sign = float16_is_neg(f);
3937 
3938     /* frec7(+-inf) = +-0 */
3939     if (float16_is_infinity(f)) {
3940         return float16_set_sign(float16_zero, sign);
3941     }
3942 
3943     /* frec7(+-0) = +-inf */
3944     if (float16_is_zero(f)) {
3945         s->float_exception_flags |= float_flag_divbyzero;
3946         return float16_set_sign(float16_infinity, sign);
3947     }
3948 
3949     /* frec7(sNaN) = canonical NaN */
3950     if (float16_is_signaling_nan(f, s)) {
3951         s->float_exception_flags |= float_flag_invalid;
3952         return float16_default_nan(s);
3953     }
3954 
3955     /* frec7(qNaN) = canonical NaN */
3956     if (float16_is_quiet_nan(f, s)) {
3957         return float16_default_nan(s);
3958     }
3959 
3960     /* +-normal, +-subnormal */
3961     uint64_t val = frec7(f, exp_size, frac_size, s);
3962     return make_float16(val);
3963 }
3964 
3965 static float32 frec7_s(float32 f, float_status *s)
3966 {
3967     int exp_size = 8, frac_size = 23;
3968     bool sign = float32_is_neg(f);
3969 
3970     /* frec7(+-inf) = +-0 */
3971     if (float32_is_infinity(f)) {
3972         return float32_set_sign(float32_zero, sign);
3973     }
3974 
3975     /* frec7(+-0) = +-inf */
3976     if (float32_is_zero(f)) {
3977         s->float_exception_flags |= float_flag_divbyzero;
3978         return float32_set_sign(float32_infinity, sign);
3979     }
3980 
3981     /* frec7(sNaN) = canonical NaN */
3982     if (float32_is_signaling_nan(f, s)) {
3983         s->float_exception_flags |= float_flag_invalid;
3984         return float32_default_nan(s);
3985     }
3986 
3987     /* frec7(qNaN) = canonical NaN */
3988     if (float32_is_quiet_nan(f, s)) {
3989         return float32_default_nan(s);
3990     }
3991 
3992     /* +-normal, +-subnormal */
3993     uint64_t val = frec7(f, exp_size, frac_size, s);
3994     return make_float32(val);
3995 }
3996 
3997 static float64 frec7_d(float64 f, float_status *s)
3998 {
3999     int exp_size = 11, frac_size = 52;
4000     bool sign = float64_is_neg(f);
4001 
4002     /* frec7(+-inf) = +-0 */
4003     if (float64_is_infinity(f)) {
4004         return float64_set_sign(float64_zero, sign);
4005     }
4006 
4007     /* frec7(+-0) = +-inf */
4008     if (float64_is_zero(f)) {
4009         s->float_exception_flags |= float_flag_divbyzero;
4010         return float64_set_sign(float64_infinity, sign);
4011     }
4012 
4013     /* frec7(sNaN) = canonical NaN */
4014     if (float64_is_signaling_nan(f, s)) {
4015         s->float_exception_flags |= float_flag_invalid;
4016         return float64_default_nan(s);
4017     }
4018 
4019     /* frec7(qNaN) = canonical NaN */
4020     if (float64_is_quiet_nan(f, s)) {
4021         return float64_default_nan(s);
4022     }
4023 
4024     /* +-normal, +-subnormal */
4025     uint64_t val = frec7(f, exp_size, frac_size, s);
4026     return make_float64(val);
4027 }
4028 
4029 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4030 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4031 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4032 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4033 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4034 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4035 
4036 /* Vector Floating-Point MIN/MAX Instructions */
4037 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4038 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4039 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4040 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4041 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4042 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4043 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4044 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4045 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4046 GEN_VEXT_VF(vfmin_vf_h, 2)
4047 GEN_VEXT_VF(vfmin_vf_w, 4)
4048 GEN_VEXT_VF(vfmin_vf_d, 8)
4049 
4050 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4051 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4052 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4053 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4054 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4055 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4056 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4057 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4058 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4059 GEN_VEXT_VF(vfmax_vf_h, 2)
4060 GEN_VEXT_VF(vfmax_vf_w, 4)
4061 GEN_VEXT_VF(vfmax_vf_d, 8)
4062 
4063 /* Vector Floating-Point Sign-Injection Instructions */
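/*
 * The two-operand wrappers pass (vs2 element, vs1 element / scalar) to
 * these callbacks, so 'a' supplies the exponent and mantissa while 'b'
 * supplies the sign.  deposit64(b, 0, width - 1, a) keeps b's sign bit
 * and overwrites everything below it with a.
 */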
4064 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4065 {
4066     return deposit64(b, 0, 15, a);
4067 }
4068 
4069 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4070 {
4071     return deposit64(b, 0, 31, a);
4072 }
4073 
4074 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4075 {
4076     return deposit64(b, 0, 63, a);
4077 }
4078 
4079 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4080 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4081 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4082 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4083 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4084 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4085 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4086 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4087 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4088 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4089 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4090 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4091 
4092 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4093 {
4094     return deposit64(~b, 0, 15, a);
4095 }
4096 
4097 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4098 {
4099     return deposit64(~b, 0, 31, a);
4100 }
4101 
4102 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4103 {
4104     return deposit64(~b, 0, 63, a);
4105 }
4106 
4107 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4108 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4109 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4110 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4111 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4112 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4113 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4114 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4115 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4116 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4117 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4118 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4119 
4120 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4121 {
4122     return deposit64(b ^ a, 0, 15, a);
4123 }
4124 
4125 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4126 {
4127     return deposit64(b ^ a, 0, 31, a);
4128 }
4129 
4130 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4131 {
4132     return deposit64(b ^ a, 0, 63, a);
4133 }
4134 
4135 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4136 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4137 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4138 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4139 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4140 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4141 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4142 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4143 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4144 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4145 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4146 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
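/*
 * With both sources naming the same register these give the usual
 * pseudo-instructions: vfsgnj copies the value, vfsgnjn.vv vd, vs, vs
 * negates it (vfneg.v) and vfsgnjx.vv vd, vs, vs clears the sign
 * (vfabs.v), since the two sign bits XOR to zero.
 */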
4147 
4148 /* Vector Floating-Point Compare Instructions */
4149 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4150 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4151                   CPURISCVState *env, uint32_t desc)          \
4152 {                                                             \
4153     uint32_t vm = vext_vm(desc);                              \
4154     uint32_t vl = env->vl;                                    \
4155     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
4156     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4157     uint32_t vma = vext_vma(desc);                            \
4158     uint32_t i;                                               \
4159                                                               \
4160     for (i = env->vstart; i < vl; i++) {                      \
4161         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4162         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4163         if (!vm && !vext_elem_mask(v0, i)) {                  \
4164             /* set masked-off elements to 1s */               \
4165             if (vma) {                                        \
4166                 vext_set_elem_mask(vd, i, 1);                 \
4167             }                                                 \
4168             continue;                                         \
4169         }                                                     \
4170         vext_set_elem_mask(vd, i,                             \
4171                            DO_OP(s2, s1, &env->fp_status));   \
4172     }                                                         \
4173     env->vstart = 0;                                          \
4174     /* mask destination register is always tail-agnostic */   \
4175     /* set tail elements to 1s */                             \
4176     if (vta_all_1s) {                                         \
4177         for (; i < total_elems; i++) {                        \
4178             vext_set_elem_mask(vd, i, 1);                     \
4179         }                                                     \
4180     }                                                         \
4181 }
4182 
4183 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4184 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4185 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4186 
4187 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4188 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4189                   CPURISCVState *env, uint32_t desc)                \
4190 {                                                                   \
4191     uint32_t vm = vext_vm(desc);                                    \
4192     uint32_t vl = env->vl;                                          \
4193     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4194     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4195     uint32_t vma = vext_vma(desc);                                  \
4196     uint32_t i;                                                     \
4197                                                                     \
4198     for (i = env->vstart; i < vl; i++) {                            \
4199         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4200         if (!vm && !vext_elem_mask(v0, i)) {                        \
4201             /* set masked-off elements to 1s */                     \
4202             if (vma) {                                              \
4203                 vext_set_elem_mask(vd, i, 1);                       \
4204             }                                                       \
4205             continue;                                               \
4206         }                                                           \
4207         vext_set_elem_mask(vd, i,                                   \
4208                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4209     }                                                               \
4210     env->vstart = 0;                                                \
4211     /* mask destination register is always tail-agnostic */         \
4212     /* set tail elements to 1s */                                   \
4213     if (vta_all_1s) {                                               \
4214         for (; i < total_elems; i++) {                              \
4215             vext_set_elem_mask(vd, i, 1);                           \
4216         }                                                           \
4217     }                                                               \
4218 }
4219 
4220 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4221 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4222 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4223 
4224 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4225 {
4226     FloatRelation compare = float16_compare_quiet(a, b, s);
4227     return compare != float_relation_equal;
4228 }
4229 
4230 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4231 {
4232     FloatRelation compare = float32_compare_quiet(a, b, s);
4233     return compare != float_relation_equal;
4234 }
4235 
4236 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4237 {
4238     FloatRelation compare = float64_compare_quiet(a, b, s);
4239     return compare != float_relation_equal;
4240 }
4241 
4242 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4243 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4244 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4245 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4246 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4247 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4248 
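/*
 * vmfeq/vmfne use the quiet comparison helpers: only a signaling NaN
 * raises the invalid flag.  An unordered result (either operand NaN)
 * compares not-equal, so vmfne reports 1 for NaN operands while vmfeq
 * reports 0.
 */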
4249 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4250 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4251 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4252 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4253 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4254 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4255 
4256 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4257 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4258 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4259 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4260 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4261 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4262 
4263 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4264 {
4265     FloatRelation compare = float16_compare(a, b, s);
4266     return compare == float_relation_greater;
4267 }
4268 
4269 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4270 {
4271     FloatRelation compare = float32_compare(a, b, s);
4272     return compare == float_relation_greater;
4273 }
4274 
4275 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4276 {
4277     FloatRelation compare = float64_compare(a, b, s);
4278     return compare == float_relation_greater;
4279 }
4280 
4281 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4282 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4283 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4284 
4285 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4286 {
4287     FloatRelation compare = float16_compare(a, b, s);
4288     return compare == float_relation_greater ||
4289            compare == float_relation_equal;
4290 }
4291 
4292 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4293 {
4294     FloatRelation compare = float32_compare(a, b, s);
4295     return compare == float_relation_greater ||
4296            compare == float_relation_equal;
4297 }
4298 
4299 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4300 {
4301     FloatRelation compare = float64_compare(a, b, s);
4302     return compare == float_relation_greater ||
4303            compare == float_relation_equal;
4304 }
4305 
4306 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4307 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4308 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
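/*
 * The ordered predicates (vmflt/vmfle above, vmfgt/vmfge here) use the
 * signaling comparison helpers, so any NaN operand raises the invalid
 * flag and the predicate evaluates to 0, as IEEE 754 requires for
 * signaling comparisons.
 */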
4309 
4310 /* Vector Floating-Point Classify Instruction */
4311 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4312 static void do_##NAME(void *vd, void *vs2, int i)      \
4313 {                                                      \
4314     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4315     *((TD *)vd + HD(i)) = OP(s2);                      \
4316 }
4317 
4318 #define GEN_VEXT_V(NAME, ESZ)                          \
4319 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4320                   CPURISCVState *env, uint32_t desc)   \
4321 {                                                      \
4322     uint32_t vm = vext_vm(desc);                       \
4323     uint32_t vl = env->vl;                             \
4324     uint32_t total_elems =                             \
4325         vext_get_total_elems(env, desc, ESZ);          \
4326     uint32_t vta = vext_vta(desc);                     \
4327     uint32_t vma = vext_vma(desc);                     \
4328     uint32_t i;                                        \
4329                                                        \
4330     for (i = env->vstart; i < vl; i++) {               \
4331         if (!vm && !vext_elem_mask(v0, i)) {           \
4332             /* set masked-off elements to 1s */        \
4333             vext_set_elems_1s(vd, vma, i * ESZ,        \
4334                               (i + 1) * ESZ);          \
4335             continue;                                  \
4336         }                                              \
4337         do_##NAME(vd, vs2, i);                         \
4338     }                                                  \
4339     env->vstart = 0;                                   \
4340     /* set tail elements to 1s */                      \
4341     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4342                       total_elems * ESZ);              \
4343 }
4344 
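/*
 * fclass result bit positions (exactly one bit is set):
 *   0: -inf  1: -normal  2: -subnormal  3: -0  4: +0
 *   5: +subnormal  6: +normal  7: +inf  8: sNaN  9: qNaN
 */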
4345 target_ulong fclass_h(uint64_t frs1)
4346 {
4347     float16 f = frs1;
4348     bool sign = float16_is_neg(f);
4349 
4350     if (float16_is_infinity(f)) {
4351         return sign ? 1 << 0 : 1 << 7;
4352     } else if (float16_is_zero(f)) {
4353         return sign ? 1 << 3 : 1 << 4;
4354     } else if (float16_is_zero_or_denormal(f)) {
4355         return sign ? 1 << 2 : 1 << 5;
4356     } else if (float16_is_any_nan(f)) {
4357         float_status s = { }; /* for snan_bit_is_one */
4358         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4359     } else {
4360         return sign ? 1 << 1 : 1 << 6;
4361     }
4362 }
4363 
4364 target_ulong fclass_s(uint64_t frs1)
4365 {
4366     float32 f = frs1;
4367     bool sign = float32_is_neg(f);
4368 
4369     if (float32_is_infinity(f)) {
4370         return sign ? 1 << 0 : 1 << 7;
4371     } else if (float32_is_zero(f)) {
4372         return sign ? 1 << 3 : 1 << 4;
4373     } else if (float32_is_zero_or_denormal(f)) {
4374         return sign ? 1 << 2 : 1 << 5;
4375     } else if (float32_is_any_nan(f)) {
4376         float_status s = { }; /* for snan_bit_is_one */
4377         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4378     } else {
4379         return sign ? 1 << 1 : 1 << 6;
4380     }
4381 }
4382 
4383 target_ulong fclass_d(uint64_t frs1)
4384 {
4385     float64 f = frs1;
4386     bool sign = float64_is_neg(f);
4387 
4388     if (float64_is_infinity(f)) {
4389         return sign ? 1 << 0 : 1 << 7;
4390     } else if (float64_is_zero(f)) {
4391         return sign ? 1 << 3 : 1 << 4;
4392     } else if (float64_is_zero_or_denormal(f)) {
4393         return sign ? 1 << 2 : 1 << 5;
4394     } else if (float64_is_any_nan(f)) {
4395         float_status s = { }; /* for snan_bit_is_one */
4396         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4397     } else {
4398         return sign ? 1 << 1 : 1 << 6;
4399     }
4400 }
4401 
4402 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4403 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4404 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4405 GEN_VEXT_V(vfclass_v_h, 2)
4406 GEN_VEXT_V(vfclass_v_w, 4)
4407 GEN_VEXT_V(vfclass_v_d, 8)
4408 
4409 /* Vector Floating-Point Merge Instruction */
4410 
4411 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4412 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4413                   CPURISCVState *env, uint32_t desc)          \
4414 {                                                             \
4415     uint32_t vm = vext_vm(desc);                              \
4416     uint32_t vl = env->vl;                                    \
4417     uint32_t esz = sizeof(ETYPE);                             \
4418     uint32_t total_elems =                                    \
4419         vext_get_total_elems(env, desc, esz);                 \
4420     uint32_t vta = vext_vta(desc);                            \
4421     uint32_t i;                                               \
4422                                                               \
4423     for (i = env->vstart; i < vl; i++) {                      \
4424         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4425         *((ETYPE *)vd + H(i)) =                               \
4426             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4427     }                                                         \
4428     env->vstart = 0;                                          \
4429     /* set tail elements to 1s */                             \
4430     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4431 }
4432 
4433 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4434 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4435 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
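/*
 * vfmerge.vfm vd, vs2, rs1, v0: vd[i] = v0.mask[i] ? f[rs1] : vs2[i].
 * The unmasked encoding (vm = 1) is vfmv.v.f, which writes the scalar
 * to every body element.
 */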
4436 
4437 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4438 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4439 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4440 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4441 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4442 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4443 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4444 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4445 
4446 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4447 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4448 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4449 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4450 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4451 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4452 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4453 
4454 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4455 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4456 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4457 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4458 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4459 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4460 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4461 
4462 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4463 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4464 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4465 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4466 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4467 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4468 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4469 
4470 /* Widening Floating-Point/Integer Type-Convert Instructions */
4471 /* (TD, T2, TX2) */
4472 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4473 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4474 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4475 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
4476 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4477 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4478 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4479 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4480 
4481 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4482 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4483 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4484 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4485 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4486 
4487 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4488 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4489 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4490 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4491 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4492 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4493 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4494 
4495 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4496 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4497 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4498 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4499 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4500 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4501 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4502 
4503 /*
4504  * vfwcvt.f.f.v vd, vs2, vm
4505  * Convert single-width float to double-width float.
4506  */
4507 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4508 {
4509     return float16_to_float32(a, true, s);
4510 }
4511 
4512 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4513 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4514 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4515 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4516 
4517 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4518 /* (TD, T2, TX2) */
4519 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4520 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4521 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4522 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4523 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4524 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4525 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4526 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4527 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4528 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4529 
4530 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4531 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4532 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4533 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4534 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4535 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4536 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4537 
4538 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4539 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4540 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4541 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4542 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4543 
4544 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4545 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4546 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4547 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4548 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4549 
4550 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4551 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4552 {
4553     return float32_to_float16(a, true, s);
4554 }
4555 
4556 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4557 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4558 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4559 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4560 
4561 /*
4562  *** Vector Reduction Operations
4563  */
4564 /* Vector Single-Width Integer Reduction Instructions */
4565 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4566 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4567                   void *vs2, CPURISCVState *env,          \
4568                   uint32_t desc)                          \
4569 {                                                         \
4570     uint32_t vm = vext_vm(desc);                          \
4571     uint32_t vl = env->vl;                                \
4572     uint32_t esz = sizeof(TD);                            \
4573     uint32_t vlenb = simd_maxsz(desc);                    \
4574     uint32_t vta = vext_vta(desc);                        \
4575     uint32_t i;                                           \
4576     TD s1 =  *((TD *)vs1 + HD(0));                        \
4577                                                           \
4578     for (i = env->vstart; i < vl; i++) {                  \
4579         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4580         if (!vm && !vext_elem_mask(v0, i)) {              \
4581             continue;                                     \
4582         }                                                 \
4583         s1 = OP(s1, (TD)s2);                              \
4584     }                                                     \
4585     *((TD *)vd + HD(0)) = s1;                             \
4586     env->vstart = 0;                                      \
4587     /* set tail elements to 1s */                         \
4588     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4589 }
4590 
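/*
 * Reductions accumulate into a scalar seeded from vs1[0]; masked-off
 * elements of vs2 are simply skipped and only vd[0] receives the result.
 * E.g. vredsum with vs1[0] = 10 and active vs2 elements {1, 2, 3} writes
 * vd[0] = 16.  The rest of vd is treated as tail and handled per vta.
 */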
4591 /* vd[0] = sum(vs1[0], vs2[*]) */
4592 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4593 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4594 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4595 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4596 
4597 /* vd[0] = maxu(vs1[0], vs2[*]) */
4598 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4599 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4600 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4601 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4602 
4603 /* vd[0] = max(vs1[0], vs2[*]) */
4604 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4605 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4606 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4607 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4608 
4609 /* vd[0] = minu(vs1[0], vs2[*]) */
4610 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4611 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4612 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4613 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4614 
4615 /* vd[0] = min(vs1[0], vs2[*]) */
4616 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4617 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4618 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4619 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4620 
4621 /* vd[0] = and(vs1[0], vs2[*]) */
4622 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4623 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4624 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4625 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4626 
4627 /* vd[0] = or(vs1[0], vs2[*]) */
4628 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4629 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4630 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4631 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4632 
4633 /* vd[0] = xor(vs1[0], vs2[*]) */
4634 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4635 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4636 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4637 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4638 
4639 /* Vector Widening Integer Reduction Instructions */
4640 /* Signed sum reduction into double-width accumulator */
4641 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4642 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4643 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4644 
4645 /* Unsigned sum reduction into double-width accumulator */
4646 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4647 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4648 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4649 
4650 /* Vector Single-Width Floating-Point Reduction Instructions */
4651 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4652 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4653                   void *vs2, CPURISCVState *env,           \
4654                   uint32_t desc)                           \
4655 {                                                          \
4656     uint32_t vm = vext_vm(desc);                           \
4657     uint32_t vl = env->vl;                                 \
4658     uint32_t esz = sizeof(TD);                             \
4659     uint32_t vlenb = simd_maxsz(desc);                     \
4660     uint32_t vta = vext_vta(desc);                         \
4661     uint32_t i;                                            \
4662     TD s1 =  *((TD *)vs1 + HD(0));                         \
4663                                                            \
4664     for (i = env->vstart; i < vl; i++) {                   \
4665         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4666         if (!vm && !vext_elem_mask(v0, i)) {               \
4667             continue;                                      \
4668         }                                                  \
4669         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4670     }                                                      \
4671     *((TD *)vd + HD(0)) = s1;                              \
4672     env->vstart = 0;                                       \
4673     /* set tail elements to 1s */                          \
4674     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4675 }
4676 
4677 /* Unordered sum */
4678 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4679 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4680 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
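/*
 * This implementation computes the unordered sum with the same strictly
 * sequential loop as the ordered sum; the spec allows any association
 * for vfredusum, so element order is a valid choice.
 */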
4681 
4682 /* Ordered sum */
4683 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4684 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4685 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4686 
4687 /* Maximum value */
4688 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4689 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4690 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4691 
4692 /* Minimum value */
4693 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4694 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4695 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4696 
4697 /* Vector Widening Floating-Point Add Instructions */
4698 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4699 {
4700     return float32_add(a, float16_to_float32(b, true, s), s);
4701 }
4702 
4703 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4704 {
4705     return float64_add(a, float32_to_float64(b, s), s);
4706 }
4707 
4708 /* Vector Widening Floating-Point Reduction Instructions */
4709 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4710 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4711 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4712 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4713 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4714 
4715 /*
4716  *** Vector Mask Operations
4717  */
4718 /* Vector Mask-Register Logical Instructions */
4719 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4720 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4721                   void *vs2, CPURISCVState *env,          \
4722                   uint32_t desc)                          \
4723 {                                                         \
4724     uint32_t vl = env->vl;                                \
4725     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4726     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4727     uint32_t i;                                           \
4728     int a, b;                                             \
4729                                                           \
4730     for (i = env->vstart; i < vl; i++) {                  \
4731         a = vext_elem_mask(vs1, i);                       \
4732         b = vext_elem_mask(vs2, i);                       \
4733         vext_set_elem_mask(vd, i, OP(b, a));              \
4734     }                                                     \
4735     env->vstart = 0;                                      \
4736     /* mask destination register is always            \
4737      * tail-agnostic                                   \
4738      */                                                \
4739     /* set tail elements to 1s */                         \
4740     if (vta_all_1s) {                                     \
4741         for (; i < total_elems; i++) {                    \
4742             vext_set_elem_mask(vd, i, 1);                 \
4743         }                                                 \
4744     }                                                     \
4745 }
4746 
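/*
 * The operands below are single mask bits (0 or 1) read via
 * vext_elem_mask(), so the logical negation in DO_NAND/DO_ANDNOT/etc.
 * is equivalent to a bitwise NOT of that bit.
 */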
4747 #define DO_NAND(N, M)  (!(N & M))
4748 #define DO_ANDNOT(N, M)  (N & !M)
4749 #define DO_NOR(N, M)  (!(N | M))
4750 #define DO_ORNOT(N, M)  (N | !M)
4751 #define DO_XNOR(N, M)  (!(N ^ M))
4752 
4753 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4754 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4755 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4756 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4757 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4758 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4759 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4760 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4761 
4762 /* Vector count population in mask vcpop */
4763 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4764                              uint32_t desc)
4765 {
4766     target_ulong cnt = 0;
4767     uint32_t vm = vext_vm(desc);
4768     uint32_t vl = env->vl;
4769     int i;
4770 
4771     for (i = env->vstart; i < vl; i++) {
4772         if (vm || vext_elem_mask(v0, i)) {
4773             if (vext_elem_mask(vs2, i)) {
4774                 cnt++;
4775             }
4776         }
4777     }
4778     env->vstart = 0;
4779     return cnt;
4780 }
4781 
4782 /* vfirst find-first-set mask bit */
4783 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4784                               uint32_t desc)
4785 {
4786     uint32_t vm = vext_vm(desc);
4787     uint32_t vl = env->vl;
4788     int i;
4789 
4790     for (i = env->vstart; i < vl; i++) {
4791         if (vm || vext_elem_mask(v0, i)) {
4792             if (vext_elem_mask(vs2, i)) {
4793                 return i;
4794             }
4795         }
4796     }
4797     env->vstart = 0;
4798     return -1LL;
4799 }
4800 
4801 enum set_mask_type {
4802     ONLY_FIRST = 1,
4803     INCLUDE_FIRST,
4804     BEFORE_FIRST,
4805 };
4806 
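/*
 * vmsbf/vmsif/vmsof all use vmsetm() below.  With the first active set
 * bit of vs2 at element k:
 *   BEFORE_FIRST  (vmsbf): vd[i] = 1 for i < k,  0 otherwise
 *   INCLUDE_FIRST (vmsif): vd[i] = 1 for i <= k, 0 otherwise
 *   ONLY_FIRST    (vmsof): vd[i] = 1 only for i == k
 * e.g. vs2 = 0 0 0 1 0 1 gives vmsbf = 1 1 1 0 0 0,
 * vmsif = 1 1 1 1 0 0 and vmsof = 0 0 0 1 0 0 (element 0 first).
 */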
4807 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4808                    uint32_t desc, enum set_mask_type type)
4809 {
4810     uint32_t vm = vext_vm(desc);
4811     uint32_t vl = env->vl;
4812     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4813     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4814     uint32_t vma = vext_vma(desc);
4815     int i;
4816     bool first_mask_bit = false;
4817 
4818     for (i = env->vstart; i < vl; i++) {
4819         if (!vm && !vext_elem_mask(v0, i)) {
4820             /* set masked-off elements to 1s */
4821             if (vma) {
4822                 vext_set_elem_mask(vd, i, 1);
4823             }
4824             continue;
4825         }
4826         /* write a zero to all following active elements */
4827         if (first_mask_bit) {
4828             vext_set_elem_mask(vd, i, 0);
4829             continue;
4830         }
4831         if (vext_elem_mask(vs2, i)) {
4832             first_mask_bit = true;
4833             if (type == BEFORE_FIRST) {
4834                 vext_set_elem_mask(vd, i, 0);
4835             } else {
4836                 vext_set_elem_mask(vd, i, 1);
4837             }
4838         } else {
4839             if (type == ONLY_FIRST) {
4840                 vext_set_elem_mask(vd, i, 0);
4841             } else {
4842                 vext_set_elem_mask(vd, i, 1);
4843             }
4844         }
4845     }
4846     env->vstart = 0;
4847     /* mask destination register is always tail-agnostic */
4848     /* set tail elements to 1s */
4849     if (vta_all_1s) {
4850         for (; i < total_elems; i++) {
4851             vext_set_elem_mask(vd, i, 1);
4852         }
4853     }
4854 }
4855 
4856 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4857                      uint32_t desc)
4858 {
4859     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4860 }
4861 
4862 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4863                      uint32_t desc)
4864 {
4865     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4866 }
4867 
4868 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4869                      uint32_t desc)
4870 {
4871     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4872 }
4873 
4874 /* Vector Iota Instruction */
4875 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4876 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4877                   uint32_t desc)                                          \
4878 {                                                                         \
4879     uint32_t vm = vext_vm(desc);                                          \
4880     uint32_t vl = env->vl;                                                \
4881     uint32_t esz = sizeof(ETYPE);                                         \
4882     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4883     uint32_t vta = vext_vta(desc);                                        \
4884     uint32_t vma = vext_vma(desc);                                        \
4885     uint32_t sum = 0;                                                     \
4886     int i;                                                                \
4887                                                                           \
4888     for (i = env->vstart; i < vl; i++) {                                  \
4889         if (!vm && !vext_elem_mask(v0, i)) {                              \
4890             /* set masked-off elements to 1s */                           \
4891             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4892             continue;                                                     \
4893         }                                                                 \
4894         *((ETYPE *)vd + H(i)) = sum;                                      \
4895         if (vext_elem_mask(vs2, i)) {                                     \
4896             sum++;                                                        \
4897         }                                                                 \
4898     }                                                                     \
4899     env->vstart = 0;                                                      \
4900     /* set tail elements to 1s */                                         \
4901     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4902 }
4903 
4904 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4905 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4906 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4907 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
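/*
 * viota.m writes the running count of set bits seen in earlier active
 * elements of vs2, e.g. vs2 = 1 0 1 1 gives vd = 0 1 1 2 (element 0
 * first).
 */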
4908 
4909 /* Vector Element Index Instruction */
4910 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4911 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4912 {                                                                         \
4913     uint32_t vm = vext_vm(desc);                                          \
4914     uint32_t vl = env->vl;                                                \
4915     uint32_t esz = sizeof(ETYPE);                                         \
4916     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4917     uint32_t vta = vext_vta(desc);                                        \
4918     uint32_t vma = vext_vma(desc);                                        \
4919     int i;                                                                \
4920                                                                           \
4921     for (i = env->vstart; i < vl; i++) {                                  \
4922         if (!vm && !vext_elem_mask(v0, i)) {                              \
4923             /* set masked-off elements to 1s */                           \
4924             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4925             continue;                                                     \
4926         }                                                                 \
4927         *((ETYPE *)vd + H(i)) = i;                                        \
4928     }                                                                     \
4929     env->vstart = 0;                                                      \
4930     /* set tail elements to 1s */                                         \
4931     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4932 }
4933 
4934 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4935 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4936 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4937 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4938 
4939 /*
4940  *** Vector Permutation Instructions
4941  */
4942 
4943 /* Vector Slide Instructions */
4944 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4945 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4946                   CPURISCVState *env, uint32_t desc)                      \
4947 {                                                                         \
4948     uint32_t vm = vext_vm(desc);                                          \
4949     uint32_t vl = env->vl;                                                \
4950     uint32_t esz = sizeof(ETYPE);                                         \
4951     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4952     uint32_t vta = vext_vta(desc);                                        \
4953     uint32_t vma = vext_vma(desc);                                        \
4954     target_ulong offset = s1, i_min, i;                                   \
4955                                                                           \
4956     i_min = MAX(env->vstart, offset);                                     \
4957     for (i = i_min; i < vl; i++) {                                        \
4958         if (!vm && !vext_elem_mask(v0, i)) {                              \
4959             /* set masked-off elements to 1s */                           \
4960             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4961             continue;                                                     \
4962         }                                                                 \
4963         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4964     }                                                                     \
4965     /* set tail elements to 1s */                                         \
4966     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4967 }
4968 
4969 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4970 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4971 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4972 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4973 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
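/*
 * The loop starts at MAX(vstart, OFFSET), so destination elements below
 * OFFSET are left untouched, as the spec requires for vslideup.
 */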
4974 
4975 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4976 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4977                   CPURISCVState *env, uint32_t desc)                      \
4978 {                                                                         \
4979     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4980     uint32_t vm = vext_vm(desc);                                          \
4981     uint32_t vl = env->vl;                                                \
4982     uint32_t esz = sizeof(ETYPE);                                         \
4983     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4984     uint32_t vta = vext_vta(desc);                                        \
4985     uint32_t vma = vext_vma(desc);                                        \
4986     target_ulong i_max, i;                                                \
4987                                                                           \
4988     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4989     for (i = env->vstart; i < i_max; ++i) {                               \
4990         if (!vm && !vext_elem_mask(v0, i)) {                              \
4991             /* set masked-off elements to 1s */                           \
4992             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4993             continue;                                                     \
4994         }                                                                 \
4995         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
4996     }                                                                     \
4997                                                                           \
4998     for (i = i_max; i < vl; ++i) {                                        \
4999         if (vm || vext_elem_mask(v0, i)) {                                \
5000             *((ETYPE *)vd + H(i)) = 0;                                    \
5001         }                                                                 \
5002     }                                                                     \
5003                                                                           \
5004     env->vstart = 0;                                                      \
5005     /* set tail elements to 1s */                                         \
5006     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5007 }
5008 
5009 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5010 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5011 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5012 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5013 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
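/*
 * For vslidedown the first loop copies vs2[i + OFFSET] while the source
 * index stays inside the register group; the second loop writes zero to
 * the remaining active body elements, matching the rule that source
 * elements past vlmax read as zero.
 */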
5014 
5015 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5016 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5017                                  void *vs2, CPURISCVState *env,             \
5018                                  uint32_t desc)                             \
5019 {                                                                           \
5020     typedef uint##BITWIDTH##_t ETYPE;                                       \
5021     uint32_t vm = vext_vm(desc);                                            \
5022     uint32_t vl = env->vl;                                                  \
5023     uint32_t esz = sizeof(ETYPE);                                           \
5024     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5025     uint32_t vta = vext_vta(desc);                                          \
5026     uint32_t vma = vext_vma(desc);                                          \
5027     uint32_t i;                                                             \
5028                                                                             \
5029     for (i = env->vstart; i < vl; i++) {                                    \
5030         if (!vm && !vext_elem_mask(v0, i)) {                                \
5031             /* set masked-off elements to 1s */                             \
5032             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5033             continue;                                                       \
5034         }                                                                   \
5035         if (i == 0) {                                                       \
5036             *((ETYPE *)vd + H(i)) = s1;                                     \
5037         } else {                                                            \
5038             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5039         }                                                                   \
5040     }                                                                       \
5041     env->vstart = 0;                                                        \
5042     /* set tail elements to 1s */                                           \
5043     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5044 }
5045 
5046 GEN_VEXT_VSLIDE1UP(8,  H1)
5047 GEN_VEXT_VSLIDE1UP(16, H2)
5048 GEN_VEXT_VSLIDE1UP(32, H4)
5049 GEN_VEXT_VSLIDE1UP(64, H8)
5050 
5051 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5052 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5053                   CPURISCVState *env, uint32_t desc)              \
5054 {                                                                 \
5055     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5056 }
5057 
5058 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5059 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5060 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5061 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5062 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5063 
5064 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5065 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5066                                    void *vs2, CPURISCVState *env,             \
5067                                    uint32_t desc)                             \
5068 {                                                                             \
5069     typedef uint##BITWIDTH##_t ETYPE;                                         \
5070     uint32_t vm = vext_vm(desc);                                              \
5071     uint32_t vl = env->vl;                                                    \
5072     uint32_t esz = sizeof(ETYPE);                                             \
5073     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5074     uint32_t vta = vext_vta(desc);                                            \
5075     uint32_t vma = vext_vma(desc);                                            \
5076     uint32_t i;                                                               \
5077                                                                               \
5078     for (i = env->vstart; i < vl; i++) {                                      \
5079         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5080             /* set masked-off elements to 1s */                               \
5081             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5082             continue;                                                         \
5083         }                                                                     \
5084         if (i == vl - 1) {                                                    \
5085             *((ETYPE *)vd + H(i)) = s1;                                       \
5086         } else {                                                              \
5087             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5088         }                                                                     \
5089     }                                                                         \
5090     env->vstart = 0;                                                          \
5091     /* set tail elements to 1s */                                             \
5092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5093 }
5094 
5095 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5096 GEN_VEXT_VSLIDE1DOWN(16, H2)
5097 GEN_VEXT_VSLIDE1DOWN(32, H4)
5098 GEN_VEXT_VSLIDE1DOWN(64, H8)
5099 
5100 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5101 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5102                   CPURISCVState *env, uint32_t desc)              \
5103 {                                                                 \
5104     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5105 }
5106 
5107 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5108 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5109 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5110 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5111 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5112 
5113 /* Vector Floating-Point Slide Instructions */
5114 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5115 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5116                   CPURISCVState *env, uint32_t desc)          \
5117 {                                                             \
5118     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5119 }
5120 
5121 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5122 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5123 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5124 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5125 
5126 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5127 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5128                   CPURISCVState *env, uint32_t desc)          \
5129 {                                                             \
5130     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5131 }
5132 
5133 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5134 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5135 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5136 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5137 
5138 /* Vector Register Gather Instruction */
5139 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5140 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5141                   CPURISCVState *env, uint32_t desc)                      \
5142 {                                                                         \
5143     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5144     uint32_t vm = vext_vm(desc);                                          \
5145     uint32_t vl = env->vl;                                                \
5146     uint32_t esz = sizeof(TS2);                                           \
5147     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5148     uint32_t vta = vext_vta(desc);                                        \
5149     uint32_t vma = vext_vma(desc);                                        \
5150     uint64_t index;                                                       \
5151     uint32_t i;                                                           \
5152                                                                           \
5153     for (i = env->vstart; i < vl; i++) {                                  \
5154         if (!vm && !vext_elem_mask(v0, i)) {                              \
5155             /* set masked-off elements to 1s */                           \
5156             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5157             continue;                                                     \
5158         }                                                                 \
5159         index = *((TS1 *)vs1 + HS1(i));                                   \
5160         if (index >= vlmax) {                                             \
5161             *((TS2 *)vd + HS2(i)) = 0;                                    \
5162         } else {                                                          \
5163             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5164         }                                                                 \
5165     }                                                                     \
5166     env->vstart = 0;                                                      \
5167     /* set tail elements to 1s */                                         \
5168     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5169 }
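/*
 * In this macro, TS1 is the element type of the index vector vs1, TS2 the
 * element type of vs2/vd, and HS1/HS2 the matching element accessors.
 * vlmax is computed from the data element size, so any index greater than
 * or equal to VLMAX writes a zero element.
 */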
5170 
5171 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5172 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5173 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5174 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5175 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5176 
5177 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5178 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5179 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5180 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
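/*
 * vrgatherei16 reads its indices with EEW=16 regardless of SEW, which is
 * why TS1/HS1 stay uint16_t/H2 above while TS2/HS2 follow the data width.
 */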
5181 
5182 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5183 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5184                   CPURISCVState *env, uint32_t desc)                      \
5185 {                                                                         \
5186     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5187     uint32_t vm = vext_vm(desc);                                          \
5188     uint32_t vl = env->vl;                                                \
5189     uint32_t esz = sizeof(ETYPE);                                         \
5190     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5191     uint32_t vta = vext_vta(desc);                                        \
5192     uint32_t vma = vext_vma(desc);                                        \
5193     uint64_t index = s1;                                                  \
5194     uint32_t i;                                                           \
5195                                                                           \
5196     for (i = env->vstart; i < vl; i++) {                                  \
5197         if (!vm && !vext_elem_mask(v0, i)) {                              \
5198             /* set masked-off elements to 1s */                           \
5199             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5200             continue;                                                     \
5201         }                                                                 \
5202         if (index >= vlmax) {                                             \
5203             *((ETYPE *)vd + H(i)) = 0;                                    \
5204         } else {                                                          \
5205             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5206         }                                                                 \
5207     }                                                                     \
5208     env->vstart = 0;                                                      \
5209     /* set tail elements to 1s */                                         \
5210     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5211 }
5212 
5213 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5214 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5215 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5216 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5217 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
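/*
 * Worked example (hypothetical values, all elements active, vl = 4):
 *   vrgather.vx with x[rs1] = 2 and vs2 = {10, 20, 30, 40} broadcasts
 *   vs2[2], giving vd = {30, 30, 30, 30}; an index >= VLMAX gives zeros.
 */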
5218 
5219 /* Vector Compress Instruction */
5220 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5221 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5222                   CPURISCVState *env, uint32_t desc)                      \
5223 {                                                                         \
5224     uint32_t vl = env->vl;                                                \
5225     uint32_t esz = sizeof(ETYPE);                                         \
5226     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5227     uint32_t vta = vext_vta(desc);                                        \
5228     uint32_t num = 0, i;                                                  \
5229                                                                           \
5230     for (i = env->vstart; i < vl; i++) {                                  \
5231         if (!vext_elem_mask(vs1, i)) {                                    \
5232             continue;                                                     \
5233         }                                                                 \
5234         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5235         num++;                                                            \
5236     }                                                                     \
5237     env->vstart = 0;                                                      \
5238     /* set tail elements to 1s */                                         \
5239     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5240 }
5241 
5242 /* Compress into vd the elements of vs2 whose mask bit in vs1 is set */
5243 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5244 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5245 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5246 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
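/*
 * Worked example (hypothetical values, vl = 4):
 *   mask vs1 = {1, 0, 1, 1}, vs2 = {10, 20, 30, 40}
 *   packs vd = {10, 30, 40, ...}: selected elements are written
 *   contiguously from element 0.  vcompress.vm is always unmasked, so v0 is
 *   not consulted; vs1 supplies the selection mask instead.
 */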
5247 
5248 /* Vector Whole Register Move */
5249 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5250 {
5251     /* EEW = SEW */
5252     uint32_t maxsz = simd_maxsz(desc);
5253     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5254     uint32_t startb = env->vstart * sewb;
5255     uint32_t i = startb;
5256 
5257     memcpy((uint8_t *)vd + H1(i),
5258            (uint8_t *)vs2 + H1(i),
5259            maxsz - startb);
5260 
5261     env->vstart = 0;
5262 }
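/*
 * Note: H1() is the identity on little-endian hosts, so the memcpy above is
 * a plain byte copy there.  On a big-endian host H1() swizzles bytes within
 * each 64-bit chunk, so offsetting a single straight memcpy by H1(i) would
 * not land on the intended bytes.  Below is a minimal, editor-added sketch
 * of a host-endian-safe copy; the function name is hypothetical, nothing
 * calls it, and it assumes QEMU's existing ROUND_UP and HOST_BIG_ENDIAN
 * macros.
 */
#if 0   /* illustrative sketch only */
static void vmvr_copy_sketch(void *vd, void *vs2, uint32_t startb,
                             uint32_t maxsz)
{
    uint32_t i = startb;

    if (HOST_BIG_ENDIAN && i % 8 != 0) {
        /*
         * Handle the partial leading 64-bit chunk: logical bytes
         * i..ROUND_UP(i, 8) - 1 occupy a contiguous host range starting at
         * H1(ROUND_UP(i, 8) - 1).
         */
        uint32_t j = ROUND_UP(i, 8);
        memcpy((uint8_t *)vd + H1(j - 1), (uint8_t *)vs2 + H1(j - 1), j - i);
        i = j;
    }

    /*
     * Full 64-bit chunks have the same byte layout in vd and vs2, so a
     * straight copy is correct on either host endianness.
     */
    memcpy((uint8_t *)vd + i, (uint8_t *)vs2 + i, maxsz - i);
}
#endif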
5263 
5264 /* Vector Integer Extension */
5265 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5266 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5267                   CPURISCVState *env, uint32_t desc)             \
5268 {                                                                \
5269     uint32_t vl = env->vl;                                       \
5270     uint32_t vm = vext_vm(desc);                                 \
5271     uint32_t esz = sizeof(ETYPE);                                \
5272     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5273     uint32_t vta = vext_vta(desc);                               \
5274     uint32_t vma = vext_vma(desc);                               \
5275     uint32_t i;                                                  \
5276                                                                  \
5277     for (i = env->vstart; i < vl; i++) {                         \
5278         if (!vm && !vext_elem_mask(v0, i)) {                     \
5279             /* set masked-off elements to 1s */                  \
5280             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5281             continue;                                            \
5282         }                                                        \
5283         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5284     }                                                            \
5285     env->vstart = 0;                                             \
5286     /* set tail elements to 1s */                                \
5287     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5288 }
5289 
5290 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5291 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5292 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5293 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5294 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5295 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5296 
5297 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5298 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5299 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5300 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5301 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5302 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
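/*
 * For the integer extension helpers, ETYPE is the destination element type
 * (EEW = SEW) and DTYPE the narrower source type; the vf2/vf4/vf8 suffix is
 * the SEW : source-EEW ratio.  Zero- versus sign-extension falls out of the
 * unsigned versus signed element types used in the assignment.
 */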
5303