xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 246f8796)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
54         /* only set vill bit. */
55         env->vill = 1;
56         env->vtype = 0;
57         env->vl = 0;
58         env->vstart = 0;
59         return 0;
60     }
61 
62     vlmax = vext_get_vlmax(cpu, s2);
63     if (s1 <= vlmax) {
64         vl = s1;
65     } else {
66         vl = vlmax;
67     }
68     env->vl = vl;
69     env->vtype = s2;
70     env->vstart = 0;
71     env->vill = 0;
72     return vl;
73 }
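/*
 * Illustrative example (not from the original source): assuming VLEN = 128
 * and a guest executing "vsetvli x1, x2, e32, m2" with x2 = 20, vtype
 * encodes SEW = 32 and LMUL = 2, so vlmax = VLEN / SEW * LMUL = 8.
 * Since s1 = 20 > vlmax, vl is clamped to 8, env->vl becomes 8 and 8 is
 * returned for write-back to x1.
 */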
74 
75 /*
76  * Note that vector data is stored in host-endian 64-bit chunks,
77  * so addressing units smaller than that need a host-endian fixup.
78  */
79 #if HOST_BIG_ENDIAN
80 #define H1(x)   ((x) ^ 7)
81 #define H1_2(x) ((x) ^ 6)
82 #define H1_4(x) ((x) ^ 4)
83 #define H2(x)   ((x) ^ 3)
84 #define H4(x)   ((x) ^ 1)
85 #define H8(x)   ((x))
86 #else
87 #define H1(x)   (x)
88 #define H1_2(x) (x)
89 #define H1_4(x) (x)
90 #define H2(x)   (x)
91 #define H4(x)   (x)
92 #define H8(x)   (x)
93 #endif
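/*
 * Illustrative example, assuming a big-endian host: byte element 0 of a
 * 64-bit chunk is stored at host offset H1(0) = 0 ^ 7 = 7, and 16-bit
 * element 1 is at half-word offset H2(1) = 1 ^ 3 = 2.  On a little-endian
 * host the H macros are the identity and no fixup is needed.
 */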
94 
95 static inline uint32_t vext_nf(uint32_t desc)
96 {
97     return FIELD_EX32(simd_data(desc), VDATA, NF);
98 }
99 
100 static inline uint32_t vext_vm(uint32_t desc)
101 {
102     return FIELD_EX32(simd_data(desc), VDATA, VM);
103 }
104 
105 /*
106  * Encode LMUL to lmul as follows:
107  *     LMUL    vlmul    lmul
108  *      1       000       0
109  *      2       001       1
110  *      4       010       2
111  *      8       011       3
112  *      -       100       -
113  *     1/8      101      -3
114  *     1/4      110      -2
115  *     1/2      111      -1
116  */
117 static inline int32_t vext_lmul(uint32_t desc)
118 {
119     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
120 }
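/*
 * Illustrative example: a fractional LMUL of 1/2 is encoded as vlmul
 * 0b111, and sextract32(0b111, 0, 3) sign-extends the 3-bit field to -1,
 * matching the table above.
 */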
121 
122 static inline uint32_t vext_vta(uint32_t desc)
123 {
124     return FIELD_EX32(simd_data(desc), VDATA, VTA);
125 }
126 
127 static inline uint32_t vext_vma(uint32_t desc)
128 {
129     return FIELD_EX32(simd_data(desc), VDATA, VMA);
130 }
131 
132 static inline uint32_t vext_vta_all_1s(uint32_t desc)
133 {
134     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
135 }
136 
137 /*
138  * Get the maximum number of elements that can be operated on.
139  *
140  * log2_esz: log2 of element size in bytes.
141  */
142 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
143 {
144     /*
145      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
146      * so vlen in bytes (vlenb) is encoded as maxsz.
147      */
148     uint32_t vlenb = simd_maxsz(desc);
149 
150     /* Return VLMAX */
151     int scale = vext_lmul(desc) - log2_esz;
152     return scale < 0 ? vlenb >> -scale : vlenb << scale;
153 }
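/*
 * Illustrative example, assuming VLEN = 128 (vlenb = 16): with SEW = 32
 * (log2_esz = 2) and LMUL = 2 (lmul = 1), scale = 1 - 2 = -1, so
 * VLMAX = 16 >> 1 = 8 elements.
 */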
154 
155 /*
156  * Get the total number of elements, including prestart, body and tail elements.
157  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
158  * are held in the same vector register.
159  */
160 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
161                                             uint32_t esz)
162 {
163     uint32_t vlenb = simd_maxsz(desc);
164     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
165     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
166                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
167     return (vlenb << emul) / esz;
168 }
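/*
 * Illustrative example, assuming vlenb = 16: for a SEW = 32 operation with
 * LMUL = 1/2 and esz = 4, emul = ctzl(4) - ctzl(4) + (-1) = -1 is clamped
 * to 0, so the total is (16 << 0) / 4 = 4 elements, i.e. the whole single
 * register backing the fractional group, tail included.
 */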
169 
170 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
171 {
172     return (addr & env->cur_pmmask) | env->cur_pmbase;
173 }
174 
175 /*
176  * This function checks watchpoints before the real load operation.
177  *
178  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
179  * In user mode, there is currently no watchpoint support.
180  *
181  * It will trigger an exception if there is no mapping in the TLB and the
182  * page table walk can't fill the TLB entry. The guest software can then
183  * return here after processing the exception, or never return.
184  */
185 static void probe_pages(CPURISCVState *env, target_ulong addr,
186                         target_ulong len, uintptr_t ra,
187                         MMUAccessType access_type)
188 {
189     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
190     target_ulong curlen = MIN(pagelen, len);
191 
192     probe_access(env, adjust_addr(env, addr), curlen, access_type,
193                  cpu_mmu_index(env, false), ra);
194     if (len > curlen) {
195         addr += curlen;
196         curlen = len - curlen;
197         probe_access(env, adjust_addr(env, addr), curlen, access_type,
198                      cpu_mmu_index(env, false), ra);
199     }
200 }
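/*
 * Illustrative example, assuming 4 KiB pages: for addr = 0x1ff0 and
 * len = 0x20, pagelen = -(addr | TARGET_PAGE_MASK) = 0x10, so the first
 * probe covers the 0x10 bytes up to the page boundary and the second probe
 * covers the remaining 0x10 bytes starting at 0x2000.
 */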
201 
202 /* set agnostic elements to 1s */
203 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
204                               uint32_t tot)
205 {
206     if (is_agnostic == 0) {
207         /* policy undisturbed */
208         return;
209     }
210     if (tot - cnt == 0) {
211         return;
212     }
213     memset(base + cnt, -1, tot - cnt);
214 }
215 
216 static inline void vext_set_elem_mask(void *v0, int index,
217                                       uint8_t value)
218 {
219     int idx = index / 64;
220     int pos = index % 64;
221     uint64_t old = ((uint64_t *)v0)[idx];
222     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
223 }
224 
225 /*
226  * Earlier designs (pre-0.9) had a varying number of bits
227  * per mask value (MLEN). In the 0.9 design, MLEN=1.
228  * (Section 4.5)
229  */
230 static inline int vext_elem_mask(void *v0, int index)
231 {
232     int idx = index / 64;
233     int pos = index % 64;
234     return (((uint64_t *)v0)[idx] >> pos) & 1;
235 }
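/*
 * Illustrative example: mask element 70 lives in 64-bit chunk idx = 1 at
 * bit position pos = 6, so vext_elem_mask(v0, 70) returns bit 6 of the
 * second uint64_t of v0.
 */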
236 
237 /* element operations for load and store */
238 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
239                                uint32_t idx, void *vd, uintptr_t retaddr);
240 
241 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
242 static void NAME(CPURISCVState *env, abi_ptr addr,         \
243                  uint32_t idx, void *vd, uintptr_t retaddr)\
244 {                                                          \
245     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
246     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
247 }                                                          \
248 
249 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
250 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
251 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
252 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
253 
254 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
255 static void NAME(CPURISCVState *env, abi_ptr addr,         \
256                  uint32_t idx, void *vd, uintptr_t retaddr)\
257 {                                                          \
258     ETYPE data = *((ETYPE *)vd + H(idx));                  \
259     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
260 }
261 
262 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
263 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
264 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
265 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
266 
267 static void vext_set_tail_elems_1s(CPURISCVState *env, target_ulong vl,
268                                    void *vd, uint32_t desc, uint32_t nf,
269                                    uint32_t esz, uint32_t max_elems)
270 {
271     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
272     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
273     uint32_t vta = vext_vta(desc);
274     uint32_t registers_used;
275     int k;
276 
277     for (k = 0; k < nf; ++k) {
278         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
279                           (k * max_elems + max_elems) * esz);
280     }
281 
282     if (nf * max_elems % total_elems != 0) {
283         registers_used = ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
284         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
285                           registers_used * vlenb);
286     }
287 }
288 
289 /*
290  * stride: access vector elements from strided memory
291  */
292 static void
293 vext_ldst_stride(void *vd, void *v0, target_ulong base,
294                  target_ulong stride, CPURISCVState *env,
295                  uint32_t desc, uint32_t vm,
296                  vext_ldst_elem_fn *ldst_elem,
297                  uint32_t log2_esz, uintptr_t ra)
298 {
299     uint32_t i, k;
300     uint32_t nf = vext_nf(desc);
301     uint32_t max_elems = vext_max_elems(desc, log2_esz);
302     uint32_t esz = 1 << log2_esz;
303     uint32_t vma = vext_vma(desc);
304 
305     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
306         k = 0;
307         while (k < nf) {
308             if (!vm && !vext_elem_mask(v0, i)) {
309                 /* set masked-off elements to 1s */
310                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
311                                   (i + k * max_elems + 1) * esz);
312                 k++;
313                 continue;
314             }
315             target_ulong addr = base + stride * i + (k << log2_esz);
316             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
317             k++;
318         }
319     }
320     env->vstart = 0;
321 
322     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
323 }
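/*
 * Illustrative example (not from the original source): for a segment load
 * with nf = 2, esz = 4 and stride = 64, element i of field k is read from
 * base + 64 * i + 4 * k and written to vd element i + k * max_elems, so
 * field 0 fills the first register group and field 1 the next one.
 */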
324 
325 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
326 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
327                   target_ulong stride, CPURISCVState *env,              \
328                   uint32_t desc)                                        \
329 {                                                                       \
330     uint32_t vm = vext_vm(desc);                                        \
331     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
332                      ctzl(sizeof(ETYPE)), GETPC());                     \
333 }
334 
335 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
336 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
337 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
338 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
339 
340 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
341 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
342                   target_ulong stride, CPURISCVState *env,              \
343                   uint32_t desc)                                        \
344 {                                                                       \
345     uint32_t vm = vext_vm(desc);                                        \
346     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
347                      ctzl(sizeof(ETYPE)), GETPC());                     \
348 }
349 
350 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
351 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
352 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
353 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
354 
355 /*
356  * unit-stride: access elements stored contiguously in memory
357  */
358 
359 /* unmasked unit-stride load and store operation */
360 static void
361 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
362              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
363              uintptr_t ra)
364 {
365     uint32_t i, k;
366     uint32_t nf = vext_nf(desc);
367     uint32_t max_elems = vext_max_elems(desc, log2_esz);
368     uint32_t esz = 1 << log2_esz;
369 
370     /* load/store bytes from/to guest memory */
371     for (i = env->vstart; i < evl; i++, env->vstart++) {
372         k = 0;
373         while (k < nf) {
374             target_ulong addr = base + ((i * nf + k) << log2_esz);
375             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
376             k++;
377         }
378     }
379     env->vstart = 0;
380 
381     vext_set_tail_elems_1s(env, evl, vd, desc, nf, esz, max_elems);
382 }
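/*
 * Illustrative example: for a unit-stride segment access with nf = 3 and
 * esz = 2, element i of field k is at base + (i * 3 + k) * 2, so the
 * fields of one segment are contiguous in memory while each field is
 * gathered into its own register group in vd.
 */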
383 
384 /*
385  * masked unit-stride load and store operation will be a special case of
386  * stride, stride = NF * sizeof (MTYPE)
387  */
388 
389 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
390 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
391                          CPURISCVState *env, uint32_t desc)             \
392 {                                                                       \
393     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
394     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
395                      ctzl(sizeof(ETYPE)), GETPC());                     \
396 }                                                                       \
397                                                                         \
398 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
399                   CPURISCVState *env, uint32_t desc)                    \
400 {                                                                       \
401     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
402                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
403 }
404 
405 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
406 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
407 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
408 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
409 
410 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
411 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
412                          CPURISCVState *env, uint32_t desc)              \
413 {                                                                        \
414     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
415     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
416                      ctzl(sizeof(ETYPE)), GETPC());                      \
417 }                                                                        \
418                                                                          \
419 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
420                   CPURISCVState *env, uint32_t desc)                     \
421 {                                                                        \
422     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
423                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
424 }
425 
426 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
427 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
428 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
429 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
430 
431 /*
432  * unit stride mask load and store, EEW = 1
433  */
434 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
435                     CPURISCVState *env, uint32_t desc)
436 {
437     /* evl = ceil(vl/8) */
438     uint8_t evl = (env->vl + 7) >> 3;
439     vext_ldst_us(vd, base, env, desc, lde_b,
440                  0, evl, GETPC());
441 }
442 
443 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
444                     CPURISCVState *env, uint32_t desc)
445 {
446     /* evl = ceil(vl/8) */
447     uint8_t evl = (env->vl + 7) >> 3;
448     vext_ldst_us(vd, base, env, desc, ste_b,
449                  0, evl, GETPC());
450 }
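/*
 * Illustrative example: with vl = 17, evl = (17 + 7) >> 3 = 3, so vlm.v
 * and vsm.v transfer 3 bytes of mask data regardless of SEW.
 */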
451 
452 /*
453  * index: access vector elements from indexed memory
454  */
455 typedef target_ulong vext_get_index_addr(target_ulong base,
456         uint32_t idx, void *vs2);
457 
458 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
459 static target_ulong NAME(target_ulong base,            \
460                          uint32_t idx, void *vs2)      \
461 {                                                      \
462     return (base + *((ETYPE *)vs2 + H(idx)));          \
463 }
464 
465 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
466 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
467 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
468 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
469 
470 static inline void
471 vext_ldst_index(void *vd, void *v0, target_ulong base,
472                 void *vs2, CPURISCVState *env, uint32_t desc,
473                 vext_get_index_addr get_index_addr,
474                 vext_ldst_elem_fn *ldst_elem,
475                 uint32_t log2_esz, uintptr_t ra)
476 {
477     uint32_t i, k;
478     uint32_t nf = vext_nf(desc);
479     uint32_t vm = vext_vm(desc);
480     uint32_t max_elems = vext_max_elems(desc, log2_esz);
481     uint32_t esz = 1 << log2_esz;
482     uint32_t vma = vext_vma(desc);
483 
484     /* load/store bytes from/to guest memory */
485     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
486         k = 0;
487         while (k < nf) {
488             if (!vm && !vext_elem_mask(v0, i)) {
489                 /* set masked-off elements to 1s */
490                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
491                                   (i + k * max_elems + 1) * esz);
492                 k++;
493                 continue;
494             }
495             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
496             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
497             k++;
498         }
499     }
500     env->vstart = 0;
501 
502     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
503 }
504 
505 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
506 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
507                   void *vs2, CPURISCVState *env, uint32_t desc)            \
508 {                                                                          \
509     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
510                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
511 }
512 
513 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
514 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
515 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
516 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
517 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
518 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
519 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
520 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
521 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
522 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
523 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
524 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
525 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
526 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
527 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
528 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
529 
530 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
531 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
532                   void *vs2, CPURISCVState *env, uint32_t desc)  \
533 {                                                                \
534     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
535                     STORE_FN, ctzl(sizeof(ETYPE)),               \
536                     GETPC());                                    \
537 }
538 
539 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
540 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
541 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
542 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
543 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
544 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
545 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
546 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
547 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
548 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
549 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
550 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
551 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
552 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
553 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
554 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
555 
556 /*
557  * unit-stride fault-only-first load instructions
558  */
559 static inline void
560 vext_ldff(void *vd, void *v0, target_ulong base,
561           CPURISCVState *env, uint32_t desc,
562           vext_ldst_elem_fn *ldst_elem,
563           uint32_t log2_esz, uintptr_t ra)
564 {
565     void *host;
566     uint32_t i, k, vl = 0;
567     uint32_t nf = vext_nf(desc);
568     uint32_t vm = vext_vm(desc);
569     uint32_t max_elems = vext_max_elems(desc, log2_esz);
570     uint32_t esz = 1 << log2_esz;
571     uint32_t vma = vext_vma(desc);
572     target_ulong addr, offset, remain;
573 
574     /* probe every access */
575     for (i = env->vstart; i < env->vl; i++) {
576         if (!vm && !vext_elem_mask(v0, i)) {
577             continue;
578         }
579         addr = adjust_addr(env, base + i * (nf << log2_esz));
580         if (i == 0) {
581             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
582         } else {
583             /* if it triggers an exception, no need to check watchpoint */
584             remain = nf << log2_esz;
585             while (remain > 0) {
586                 offset = -(addr | TARGET_PAGE_MASK);
587                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
588                                          cpu_mmu_index(env, false));
589                 if (host) {
590 #ifdef CONFIG_USER_ONLY
591                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
592                         vl = i;
593                         goto ProbeSuccess;
594                     }
595 #else
596                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
597 #endif
598                 } else {
599                     vl = i;
600                     goto ProbeSuccess;
601                 }
602                 if (remain <=  offset) {
603                     break;
604                 }
605                 remain -= offset;
606                 addr = adjust_addr(env, addr + offset);
607             }
608         }
609     }
610 ProbeSuccess:
611     /* load bytes from guest memory */
612     if (vl != 0) {
613         env->vl = vl;
614     }
615     for (i = env->vstart; i < env->vl; i++) {
616         k = 0;
617         while (k < nf) {
618             if (!vm && !vext_elem_mask(v0, i)) {
619                 /* set masked-off elements to 1s */
620                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
621                                   (i + k * max_elems + 1) * esz);
622                 k++;
623                 continue;
624             }
625             target_ulong addr = base + ((i * nf + k) << log2_esz);
626             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
627             k++;
628         }
629     }
630     env->vstart = 0;
631 
632     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
633 }
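/*
 * Illustrative example (not from the original source): if element 0 loads
 * successfully but the page holding element 3 cannot be accessed, the
 * probe loop above sets vl = 3, env->vl is reduced to 3 and only elements
 * 0..2 are loaded; no exception is raised for the faulting element.
 */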
634 
635 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
636 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
637                   CPURISCVState *env, uint32_t desc)      \
638 {                                                         \
639     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
640               ctzl(sizeof(ETYPE)), GETPC());              \
641 }
642 
643 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
644 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
645 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
646 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
647 
648 #define DO_SWAP(N, M) (M)
649 #define DO_AND(N, M)  (N & M)
650 #define DO_XOR(N, M)  (N ^ M)
651 #define DO_OR(N, M)   (N | M)
652 #define DO_ADD(N, M)  (N + M)
653 
654 /* Signed min/max */
655 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
656 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
657 
658 /* Unsigned min/max */
659 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
660 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
661 
662 /*
663  * load and store whole register instructions
664  */
665 static void
666 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
667                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
668 {
669     uint32_t i, k, off, pos;
670     uint32_t nf = vext_nf(desc);
671     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
672     uint32_t max_elems = vlenb >> log2_esz;
673 
674     k = env->vstart / max_elems;
675     off = env->vstart % max_elems;
676 
677     if (off) {
678         /* load/store the remaining elements of the segment pointed to by vstart */
679         for (pos = off; pos < max_elems; pos++, env->vstart++) {
680             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
681             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
682                       ra);
683         }
684         k++;
685     }
686 
687     /* load/store elements for rest of segments */
688     for (; k < nf; k++) {
689         for (i = 0; i < max_elems; i++, env->vstart++) {
690             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
691             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
692         }
693     }
694 
695     env->vstart = 0;
696 }
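/*
 * Illustrative example, assuming vlenb = 16: for vl4re32.v, nf = 4 and
 * max_elems = 16 >> 2 = 4, so four registers of four elements each are
 * transferred; if the access was interrupted with vstart = 6, it resumes
 * at element 2 of the second register (k = 1, off = 2).
 */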
697 
698 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
699 void HELPER(NAME)(void *vd, target_ulong base,       \
700                   CPURISCVState *env, uint32_t desc) \
701 {                                                    \
702     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
703                     ctzl(sizeof(ETYPE)), GETPC());   \
704 }
705 
706 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
707 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
708 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
709 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
710 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
711 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
712 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
713 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
714 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
715 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
716 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
717 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
718 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
719 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
720 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
721 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
722 
723 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
724 void HELPER(NAME)(void *vd, target_ulong base,       \
725                   CPURISCVState *env, uint32_t desc) \
726 {                                                    \
727     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
728                     ctzl(sizeof(ETYPE)), GETPC());   \
729 }
730 
731 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
732 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
733 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
734 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
735 
736 /*
737  * Vector Integer Arithmetic Instructions
738  */
739 
740 /* expand macro args before macro */
741 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
742 
743 /* (TD, T1, T2, TX1, TX2) */
744 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
745 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
746 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
747 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
748 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
749 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
750 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
751 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
752 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
753 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
754 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
755 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
756 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
757 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
758 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
759 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
760 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
761 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
762 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
763 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
764 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
765 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
766 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
767 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
768 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
769 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
770 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
771 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
772 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
773 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
774 
775 /* operation of two vector elements */
776 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
777 
778 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
779 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
780 {                                                               \
781     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
782     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
783     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
784 }
785 #define DO_SUB(N, M) (N - M)
786 #define DO_RSUB(N, M) (M - N)
787 
788 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
789 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
790 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
791 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
792 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
793 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
794 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
795 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
796 
797 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
798                        CPURISCVState *env, uint32_t desc,
799                        opivv2_fn *fn, uint32_t esz)
800 {
801     uint32_t vm = vext_vm(desc);
802     uint32_t vl = env->vl;
803     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
804     uint32_t vta = vext_vta(desc);
805     uint32_t vma = vext_vma(desc);
806     uint32_t i;
807 
808     for (i = env->vstart; i < vl; i++) {
809         if (!vm && !vext_elem_mask(v0, i)) {
810             /* set masked-off elements to 1s */
811             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
812             continue;
813         }
814         fn(vd, vs1, vs2, i);
815     }
816     env->vstart = 0;
817     /* set tail elements to 1s */
818     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
819 }
820 
821 /* generate the helpers for OPIVV */
822 #define GEN_VEXT_VV(NAME, ESZ)                            \
823 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
824                   void *vs2, CPURISCVState *env,          \
825                   uint32_t desc)                          \
826 {                                                         \
827     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
828                do_##NAME, ESZ);                           \
829 }
830 
831 GEN_VEXT_VV(vadd_vv_b, 1)
832 GEN_VEXT_VV(vadd_vv_h, 2)
833 GEN_VEXT_VV(vadd_vv_w, 4)
834 GEN_VEXT_VV(vadd_vv_d, 8)
835 GEN_VEXT_VV(vsub_vv_b, 1)
836 GEN_VEXT_VV(vsub_vv_h, 2)
837 GEN_VEXT_VV(vsub_vv_w, 4)
838 GEN_VEXT_VV(vsub_vv_d, 8)
839 
840 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
841 
842 /*
843  * (T1)s1 gives the real operand type.
844  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
845  */
846 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
847 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
848 {                                                                   \
849     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
850     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
851 }
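/*
 * Illustrative example: for vadd_vx_b, T1 = TX1 = int8_t, so a scalar
 * s1 = 0x1ff is truncated to (int8_t)-1 before being added to each vs2
 * element, i.e. only the low SEW bits of the scalar are used.
 */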
852 
853 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
854 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
855 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
856 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
857 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
858 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
859 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
860 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
861 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
862 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
863 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
864 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
865 
866 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
867                        CPURISCVState *env, uint32_t desc,
868                        opivx2_fn fn, uint32_t esz)
869 {
870     uint32_t vm = vext_vm(desc);
871     uint32_t vl = env->vl;
872     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
873     uint32_t vta = vext_vta(desc);
874     uint32_t vma = vext_vma(desc);
875     uint32_t i;
876 
877     for (i = env->vstart; i < vl; i++) {
878         if (!vm && !vext_elem_mask(v0, i)) {
879             /* set masked-off elements to 1s */
880             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
881             continue;
882         }
883         fn(vd, s1, vs2, i);
884     }
885     env->vstart = 0;
886     /* set tail elements to 1s */
887     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
888 }
889 
890 /* generate the helpers for OPIVX */
891 #define GEN_VEXT_VX(NAME, ESZ)                            \
892 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
893                   void *vs2, CPURISCVState *env,          \
894                   uint32_t desc)                          \
895 {                                                         \
896     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
897                do_##NAME, ESZ);                           \
898 }
899 
900 GEN_VEXT_VX(vadd_vx_b, 1)
901 GEN_VEXT_VX(vadd_vx_h, 2)
902 GEN_VEXT_VX(vadd_vx_w, 4)
903 GEN_VEXT_VX(vadd_vx_d, 8)
904 GEN_VEXT_VX(vsub_vx_b, 1)
905 GEN_VEXT_VX(vsub_vx_h, 2)
906 GEN_VEXT_VX(vsub_vx_w, 4)
907 GEN_VEXT_VX(vsub_vx_d, 8)
908 GEN_VEXT_VX(vrsub_vx_b, 1)
909 GEN_VEXT_VX(vrsub_vx_h, 2)
910 GEN_VEXT_VX(vrsub_vx_w, 4)
911 GEN_VEXT_VX(vrsub_vx_d, 8)
912 
913 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
914 {
915     intptr_t oprsz = simd_oprsz(desc);
916     intptr_t i;
917 
918     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
919         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
920     }
921 }
922 
923 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
924 {
925     intptr_t oprsz = simd_oprsz(desc);
926     intptr_t i;
927 
928     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
929         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
930     }
931 }
932 
933 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
934 {
935     intptr_t oprsz = simd_oprsz(desc);
936     intptr_t i;
937 
938     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
939         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
940     }
941 }
942 
943 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
944 {
945     intptr_t oprsz = simd_oprsz(desc);
946     intptr_t i;
947 
948     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
949         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
950     }
951 }
952 
953 /* Vector Widening Integer Add/Subtract */
954 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
955 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
956 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
957 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
958 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
959 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
960 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
961 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
962 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
963 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
964 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
965 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
966 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
967 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
968 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
969 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
970 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
971 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
972 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
973 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
974 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
975 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
976 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
977 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
978 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
979 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
980 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
981 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
982 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
983 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
984 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
985 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
986 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
987 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
988 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
989 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
990 GEN_VEXT_VV(vwaddu_vv_b, 2)
991 GEN_VEXT_VV(vwaddu_vv_h, 4)
992 GEN_VEXT_VV(vwaddu_vv_w, 8)
993 GEN_VEXT_VV(vwsubu_vv_b, 2)
994 GEN_VEXT_VV(vwsubu_vv_h, 4)
995 GEN_VEXT_VV(vwsubu_vv_w, 8)
996 GEN_VEXT_VV(vwadd_vv_b, 2)
997 GEN_VEXT_VV(vwadd_vv_h, 4)
998 GEN_VEXT_VV(vwadd_vv_w, 8)
999 GEN_VEXT_VV(vwsub_vv_b, 2)
1000 GEN_VEXT_VV(vwsub_vv_h, 4)
1001 GEN_VEXT_VV(vwsub_vv_w, 8)
1002 GEN_VEXT_VV(vwaddu_wv_b, 2)
1003 GEN_VEXT_VV(vwaddu_wv_h, 4)
1004 GEN_VEXT_VV(vwaddu_wv_w, 8)
1005 GEN_VEXT_VV(vwsubu_wv_b, 2)
1006 GEN_VEXT_VV(vwsubu_wv_h, 4)
1007 GEN_VEXT_VV(vwsubu_wv_w, 8)
1008 GEN_VEXT_VV(vwadd_wv_b, 2)
1009 GEN_VEXT_VV(vwadd_wv_h, 4)
1010 GEN_VEXT_VV(vwadd_wv_w, 8)
1011 GEN_VEXT_VV(vwsub_wv_b, 2)
1012 GEN_VEXT_VV(vwsub_wv_h, 4)
1013 GEN_VEXT_VV(vwsub_wv_w, 8)
1014 
1015 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1016 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1017 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1018 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1019 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1020 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1021 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1022 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1023 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1024 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1025 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1026 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1027 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1028 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1029 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1030 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1031 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1032 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1033 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1034 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1035 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1036 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1037 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1038 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1039 GEN_VEXT_VX(vwaddu_vx_b, 2)
1040 GEN_VEXT_VX(vwaddu_vx_h, 4)
1041 GEN_VEXT_VX(vwaddu_vx_w, 8)
1042 GEN_VEXT_VX(vwsubu_vx_b, 2)
1043 GEN_VEXT_VX(vwsubu_vx_h, 4)
1044 GEN_VEXT_VX(vwsubu_vx_w, 8)
1045 GEN_VEXT_VX(vwadd_vx_b, 2)
1046 GEN_VEXT_VX(vwadd_vx_h, 4)
1047 GEN_VEXT_VX(vwadd_vx_w, 8)
1048 GEN_VEXT_VX(vwsub_vx_b, 2)
1049 GEN_VEXT_VX(vwsub_vx_h, 4)
1050 GEN_VEXT_VX(vwsub_vx_w, 8)
1051 GEN_VEXT_VX(vwaddu_wx_b, 2)
1052 GEN_VEXT_VX(vwaddu_wx_h, 4)
1053 GEN_VEXT_VX(vwaddu_wx_w, 8)
1054 GEN_VEXT_VX(vwsubu_wx_b, 2)
1055 GEN_VEXT_VX(vwsubu_wx_h, 4)
1056 GEN_VEXT_VX(vwsubu_wx_w, 8)
1057 GEN_VEXT_VX(vwadd_wx_b, 2)
1058 GEN_VEXT_VX(vwadd_wx_h, 4)
1059 GEN_VEXT_VX(vwadd_wx_w, 8)
1060 GEN_VEXT_VX(vwsub_wx_b, 2)
1061 GEN_VEXT_VX(vwsub_wx_h, 4)
1062 GEN_VEXT_VX(vwsub_wx_w, 8)
1063 
1064 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1065 #define DO_VADC(N, M, C) (N + M + C)
1066 #define DO_VSBC(N, M, C) (N - M - C)
1067 
1068 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1069 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1070                   CPURISCVState *env, uint32_t desc)          \
1071 {                                                             \
1072     uint32_t vl = env->vl;                                    \
1073     uint32_t esz = sizeof(ETYPE);                             \
1074     uint32_t total_elems =                                    \
1075         vext_get_total_elems(env, desc, esz);                 \
1076     uint32_t vta = vext_vta(desc);                            \
1077     uint32_t i;                                               \
1078                                                               \
1079     for (i = env->vstart; i < vl; i++) {                      \
1080         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1081         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1082         ETYPE carry = vext_elem_mask(v0, i);                  \
1083                                                               \
1084         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1085     }                                                         \
1086     env->vstart = 0;                                          \
1087     /* set tail elements to 1s */                             \
1088     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1089 }
1090 
1091 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1092 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1093 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1094 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1095 
1096 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1097 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1098 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1099 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1100 
1101 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1102 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1103                   CPURISCVState *env, uint32_t desc)                     \
1104 {                                                                        \
1105     uint32_t vl = env->vl;                                               \
1106     uint32_t esz = sizeof(ETYPE);                                        \
1107     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1108     uint32_t vta = vext_vta(desc);                                       \
1109     uint32_t i;                                                          \
1110                                                                          \
1111     for (i = env->vstart; i < vl; i++) {                                 \
1112         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1113         ETYPE carry = vext_elem_mask(v0, i);                             \
1114                                                                          \
1115         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1116     }                                                                    \
1117     env->vstart = 0;                                                     \
1118     /* set tail elements to 1s */                                        \
1119     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1120 }
1121 
1122 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1123 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1124 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1125 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1126 
1127 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1128 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1129 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1130 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1131 
1132 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1133                           (__typeof(N))(N + M) < N)
1134 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
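/*
 * Illustrative example, 8-bit elements: DO_MADC(0xff, 0x01, 0) computes
 * (uint8_t)(0xff + 0x01) = 0, which is < 0xff, so carry-out = 1;
 * DO_MSBC(0x00, 0x01, 0) reports 0x00 < 0x01, so borrow-out = 1.
 */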
1135 
1136 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1137 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1138                   CPURISCVState *env, uint32_t desc)          \
1139 {                                                             \
1140     uint32_t vl = env->vl;                                    \
1141     uint32_t vm = vext_vm(desc);                              \
1142     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1143     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1144     uint32_t i;                                               \
1145                                                               \
1146     for (i = env->vstart; i < vl; i++) {                      \
1147         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1148         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1149         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1150         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1151     }                                                         \
1152     env->vstart = 0;                                          \
1153     /*
1154      * mask destination register is always tail-agnostic
1155      * set tail elements to 1s
1156      */                                                       \
1157     if (vta_all_1s) {                                         \
1158         for (; i < total_elems; i++) {                        \
1159             vext_set_elem_mask(vd, i, 1);                     \
1160         }                                                     \
1161     }                                                         \
1162 }
1163 
1164 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1165 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1167 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1168 
1169 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1170 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1172 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1173 
1174 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1175 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1176                   void *vs2, CPURISCVState *env, uint32_t desc) \
1177 {                                                               \
1178     uint32_t vl = env->vl;                                      \
1179     uint32_t vm = vext_vm(desc);                                \
1180     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
1181     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1182     uint32_t i;                                                 \
1183                                                                 \
1184     for (i = env->vstart; i < vl; i++) {                        \
1185         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1186         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1187         vext_set_elem_mask(vd, i,                               \
1188                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1189     }                                                           \
1190     env->vstart = 0;                                            \
1191     /*
1192      * mask destination register is always tail-agnostic
1193      * set tail elements to 1s
1194      */                                                         \
1195     if (vta_all_1s) {                                           \
1196         for (; i < total_elems; i++) {                          \
1197             vext_set_elem_mask(vd, i, 1);                       \
1198         }                                                       \
1199     }                                                           \
1200 }
1201 
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1204 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1205 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1206 
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1209 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1210 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1211 
1212 /* Vector Bitwise Logical Instructions */
1213 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1215 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1216 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1217 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1219 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1220 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1221 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1223 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1224 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1225 GEN_VEXT_VV(vand_vv_b, 1)
1226 GEN_VEXT_VV(vand_vv_h, 2)
1227 GEN_VEXT_VV(vand_vv_w, 4)
1228 GEN_VEXT_VV(vand_vv_d, 8)
1229 GEN_VEXT_VV(vor_vv_b, 1)
1230 GEN_VEXT_VV(vor_vv_h, 2)
1231 GEN_VEXT_VV(vor_vv_w, 4)
1232 GEN_VEXT_VV(vor_vv_d, 8)
1233 GEN_VEXT_VV(vxor_vv_b, 1)
1234 GEN_VEXT_VV(vxor_vv_h, 2)
1235 GEN_VEXT_VV(vxor_vv_w, 4)
1236 GEN_VEXT_VV(vxor_vv_d, 8)
1237 
1238 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1240 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1241 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1242 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1244 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1245 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1246 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1248 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1249 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1250 GEN_VEXT_VX(vand_vx_b, 1)
1251 GEN_VEXT_VX(vand_vx_h, 2)
1252 GEN_VEXT_VX(vand_vx_w, 4)
1253 GEN_VEXT_VX(vand_vx_d, 8)
1254 GEN_VEXT_VX(vor_vx_b, 1)
1255 GEN_VEXT_VX(vor_vx_h, 2)
1256 GEN_VEXT_VX(vor_vx_w, 4)
1257 GEN_VEXT_VX(vor_vx_d, 8)
1258 GEN_VEXT_VX(vxor_vx_b, 1)
1259 GEN_VEXT_VX(vxor_vx_h, 2)
1260 GEN_VEXT_VX(vxor_vx_w, 4)
1261 GEN_VEXT_VX(vxor_vx_d, 8)
1262 
1263 /* Vector Single-Width Bit Shift Instructions */
1264 #define DO_SLL(N, M)  (N << (M))
1265 #define DO_SRL(N, M)  (N >> (M))
1266 
1267 /* generate the helpers for shift instructions with two vector operands */
1268 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1269 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1270                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1271 {                                                                         \
1272     uint32_t vm = vext_vm(desc);                                          \
1273     uint32_t vl = env->vl;                                                \
1274     uint32_t esz = sizeof(TS1);                                           \
1275     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1276     uint32_t vta = vext_vta(desc);                                        \
1277     uint32_t vma = vext_vma(desc);                                        \
1278     uint32_t i;                                                           \
1279                                                                           \
1280     for (i = env->vstart; i < vl; i++) {                                  \
1281         if (!vm && !vext_elem_mask(v0, i)) {                              \
1282             /* set masked-off elements to 1s */                           \
1283             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1284             continue;                                                     \
1285         }                                                                 \
1286         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1287         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1288         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1289     }                                                                     \
1290     env->vstart = 0;                                                      \
1291     /* set tail elements to 1s */                                         \
1292     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1293 }
1294 
1295 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1296 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1297 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1298 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1299 
1300 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1301 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1302 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1303 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1304 
1305 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1306 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1307 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1308 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
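
/*
 * vsra reuses DO_SRL: the arithmetic behaviour comes from instantiating
 * the macro with a signed TS2, so the C `>>` acts on a signed value
 * (an arithmetic shift on the compilers QEMU supports), while MASK keeps
 * only the low log2(SEW) bits of the shift amount as the spec requires.
 * Illustrative per-element computation, assuming SEW == 8:
 *
 *     int8_t  s2 = (int8_t)0x80;        // -128
 *     uint8_t s1 = 0x0a;                // requested shift of 10
 *     int8_t  r  = s2 >> (s1 & 0x7);    // 10 & 7 = 2  ->  r = -32 (0xe0)
 */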
1309 
1310 /*
1311  * generate the helpers for shift instructions with one vector and one scalar
1312  */
1313 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1314 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1315                   void *vs2, CPURISCVState *env,            \
1316                   uint32_t desc)                            \
1317 {                                                           \
1318     uint32_t vm = vext_vm(desc);                            \
1319     uint32_t vl = env->vl;                                  \
1320     uint32_t esz = sizeof(TD);                              \
1321     uint32_t total_elems =                                  \
1322         vext_get_total_elems(env, desc, esz);               \
1323     uint32_t vta = vext_vta(desc);                          \
1324     uint32_t vma = vext_vma(desc);                          \
1325     uint32_t i;                                             \
1326                                                             \
1327     for (i = env->vstart; i < vl; i++) {                    \
1328         if (!vm && !vext_elem_mask(v0, i)) {                \
1329             /* set masked-off elements to 1s */             \
1330             vext_set_elems_1s(vd, vma, i * esz,             \
1331                               (i + 1) * esz);               \
1332             continue;                                       \
1333         }                                                   \
1334         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1335         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1336     }                                                       \
1337     env->vstart = 0;                                        \
1338     /* set tail elements to 1s */                           \
1339     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1340 }
1341 
1342 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1343 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1346 
1347 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1348 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1349 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1350 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1351 
1352 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1353 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1354 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1355 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1356 
1357 /* Vector Narrowing Integer Right Shift Instructions */
1358 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1359 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1360 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1361 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1362 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1363 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1364 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1367 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1368 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1369 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
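
/*
 * The narrowing shifts reuse the same macros with a destination/vs1 type
 * half the width of the vs2 source, and MASK keeps log2(2*SEW) bits of
 * the shift amount (e.g. 0xf for the .b form, 0x1f for .h).  Sketch of
 * one element, assuming SEW == 8:
 *
 *     uint16_t s2 = 0x1234;
 *     uint8_t  d  = (uint8_t)(s2 >> (10 & 0xf));    // 0x1234 >> 10 = 0x04
 */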
1370 
1371 /* Vector Integer Comparison Instructions */
1372 #define DO_MSEQ(N, M) (N == M)
1373 #define DO_MSNE(N, M) (N != M)
1374 #define DO_MSLT(N, M) (N < M)
1375 #define DO_MSLE(N, M) (N <= M)
1376 #define DO_MSGT(N, M) (N > M)
1377 
1378 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1379 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1380                   CPURISCVState *env, uint32_t desc)          \
1381 {                                                             \
1382     uint32_t vm = vext_vm(desc);                              \
1383     uint32_t vl = env->vl;                                    \
1384     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1385     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1386     uint32_t vma = vext_vma(desc);                            \
1387     uint32_t i;                                               \
1388                                                               \
1389     for (i = env->vstart; i < vl; i++) {                      \
1390         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1391         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1392         if (!vm && !vext_elem_mask(v0, i)) {                  \
1393             /* set masked-off elements to 1s */               \
1394             if (vma) {                                        \
1395                 vext_set_elem_mask(vd, i, 1);                 \
1396             }                                                 \
1397             continue;                                         \
1398         }                                                     \
1399         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1400     }                                                         \
1401     env->vstart = 0;                                          \
1402     /*                                                        \
1403      * mask destination register is always tail-agnostic      \
1404      * set tail elements to 1s                                 \
1405      */                                                       \
1406     if (vta_all_1s) {                                         \
1407         for (; i < total_elems; i++) {                        \
1408             vext_set_elem_mask(vd, i, 1);                     \
1409         }                                                     \
1410     }                                                         \
1411 }
1412 
1413 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1414 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1415 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1416 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1417 
1418 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1419 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1420 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1421 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1422 
1423 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1424 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1425 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1426 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1427 
1428 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1429 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1430 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1431 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1432 
1433 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1434 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1435 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1436 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1437 
1438 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1439 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1440 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1441 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1442 
1443 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1444 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1445                   CPURISCVState *env, uint32_t desc)                \
1446 {                                                                   \
1447     uint32_t vm = vext_vm(desc);                                    \
1448     uint32_t vl = env->vl;                                          \
1449     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1450     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1451     uint32_t vma = vext_vma(desc);                                  \
1452     uint32_t i;                                                     \
1453                                                                     \
1454     for (i = env->vstart; i < vl; i++) {                            \
1455         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1456         if (!vm && !vext_elem_mask(v0, i)) {                        \
1457             /* set masked-off elements to 1s */                     \
1458             if (vma) {                                              \
1459                 vext_set_elem_mask(vd, i, 1);                       \
1460             }                                                       \
1461             continue;                                               \
1462         }                                                           \
1463         vext_set_elem_mask(vd, i,                                   \
1464                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1465     }                                                               \
1466     env->vstart = 0;                                                \
1467     /*                                                              \
1468      * mask destination register is always tail-agnostic            \
1469      * set tail elements to 1s                                      \
1470      */                                                             \
1471     if (vta_all_1s) {                                               \
1472         for (; i < total_elems; i++) {                              \
1473             vext_set_elem_mask(vd, i, 1);                           \
1474         }                                                           \
1475     }                                                               \
1476 }
1477 
1478 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1479 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1480 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1481 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1482 
1483 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1484 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1485 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1486 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1487 
1488 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1489 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1490 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1491 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1492 
1493 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1494 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1495 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1496 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1497 
1498 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1499 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1500 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1501 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1502 
1503 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1504 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1505 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1506 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1507 
1508 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1509 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1510 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1511 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1512 
1513 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1514 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1515 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1516 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
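
/*
 * Note that vmsgtu/vmsgt exist only in .vx/.vi form in the RVV spec; a
 * register-register "greater than" is obtained by swapping the operands
 * of vmslt/vmsltu, so there are no GEN_VEXT_CMP_VV instances for them.
 */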
1517 
1518 /* Vector Integer Min/Max Instructions */
1519 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1520 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1521 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1522 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1523 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1524 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1525 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1526 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1527 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1528 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1529 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1530 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1531 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1532 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1533 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1534 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1535 GEN_VEXT_VV(vminu_vv_b, 1)
1536 GEN_VEXT_VV(vminu_vv_h, 2)
1537 GEN_VEXT_VV(vminu_vv_w, 4)
1538 GEN_VEXT_VV(vminu_vv_d, 8)
1539 GEN_VEXT_VV(vmin_vv_b, 1)
1540 GEN_VEXT_VV(vmin_vv_h, 2)
1541 GEN_VEXT_VV(vmin_vv_w, 4)
1542 GEN_VEXT_VV(vmin_vv_d, 8)
1543 GEN_VEXT_VV(vmaxu_vv_b, 1)
1544 GEN_VEXT_VV(vmaxu_vv_h, 2)
1545 GEN_VEXT_VV(vmaxu_vv_w, 4)
1546 GEN_VEXT_VV(vmaxu_vv_d, 8)
1547 GEN_VEXT_VV(vmax_vv_b, 1)
1548 GEN_VEXT_VV(vmax_vv_h, 2)
1549 GEN_VEXT_VV(vmax_vv_w, 4)
1550 GEN_VEXT_VV(vmax_vv_d, 8)
1551 
1552 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1553 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1554 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1555 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1556 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1557 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1558 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1559 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1560 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1561 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1562 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1563 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1564 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1565 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1566 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1567 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1568 GEN_VEXT_VX(vminu_vx_b, 1)
1569 GEN_VEXT_VX(vminu_vx_h, 2)
1570 GEN_VEXT_VX(vminu_vx_w, 4)
1571 GEN_VEXT_VX(vminu_vx_d, 8)
1572 GEN_VEXT_VX(vmin_vx_b, 1)
1573 GEN_VEXT_VX(vmin_vx_h, 2)
1574 GEN_VEXT_VX(vmin_vx_w, 4)
1575 GEN_VEXT_VX(vmin_vx_d, 8)
1576 GEN_VEXT_VX(vmaxu_vx_b, 1)
1577 GEN_VEXT_VX(vmaxu_vx_h, 2)
1578 GEN_VEXT_VX(vmaxu_vx_w, 4)
1579 GEN_VEXT_VX(vmaxu_vx_d, 8)
1580 GEN_VEXT_VX(vmax_vx_b, 1)
1581 GEN_VEXT_VX(vmax_vx_h, 2)
1582 GEN_VEXT_VX(vmax_vx_w, 4)
1583 GEN_VEXT_VX(vmax_vx_d, 8)
1584 
1585 /* Vector Single-Width Integer Multiply Instructions */
1586 #define DO_MUL(N, M) (N * M)
1587 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1588 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1589 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1590 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1591 GEN_VEXT_VV(vmul_vv_b, 1)
1592 GEN_VEXT_VV(vmul_vv_h, 2)
1593 GEN_VEXT_VV(vmul_vv_w, 4)
1594 GEN_VEXT_VV(vmul_vv_d, 8)
1595 
1596 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1597 {
1598     return (int16_t)s2 * (int16_t)s1 >> 8;
1599 }
1600 
1601 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1602 {
1603     return (int32_t)s2 * (int32_t)s1 >> 16;
1604 }
1605 
1606 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1607 {
1608     return (int64_t)s2 * (int64_t)s1 >> 32;
1609 }
1610 
1611 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1612 {
1613     uint64_t hi_64, lo_64;
1614 
1615     muls64(&lo_64, &hi_64, s1, s2);
1616     return hi_64;
1617 }
1618 
1619 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1620 {
1621     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1622 }
1623 
1624 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1625 {
1626     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1627 }
1628 
1629 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1630 {
1631     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1632 }
1633 
1634 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1635 {
1636     uint64_t hi_64, lo_64;
1637 
1638     mulu64(&lo_64, &hi_64, s2, s1);
1639     return hi_64;
1640 }
1641 
1642 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1643 {
1644     return (int16_t)s2 * (uint16_t)s1 >> 8;
1645 }
1646 
1647 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1648 {
1649     return (int32_t)s2 * (uint32_t)s1 >> 16;
1650 }
1651 
1652 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1653 {
1654     return (int64_t)s2 * (uint64_t)s1 >> 32;
1655 }
1656 
1657 /*
1658  * Let  A = signed operand (s2),
1659  *      B = unsigned operand (s1),
1660  *      P = mulu64(A, B), the full 128-bit unsigned product.
1661  *
1662  * When A < 0, mulu64 interprets A's bit pattern as the unsigned
1663  * value A + 2 ** 64, so
1664  *
1665  *      P  = (A + 2 ** 64) * B = A * B + 2 ** 64 * B
1666  *
1667  * and the desired signed product is
1668  *
1669  *      SP = A * B = P - 2 ** 64 * B
1670  *
1671  * i.e. the high 64 bits of P must be reduced by B when A < 0:
1672  *
1673  *      HI_P -= (A < 0 ? B : 0)
1674  */
1675 
1676 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1677 {
1678     uint64_t hi_64, lo_64;
1679 
1680     mulu64(&lo_64, &hi_64, s2, s1);
1681 
1682     hi_64 -= s2 < 0 ? s1 : 0;
1683     return hi_64;
1684 }
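
/*
 * Worked example for do_mulhsu_d: s2 = -1, s1 = 2.  mulu64 computes
 * 0xffffffffffffffff * 2, giving hi:lo = 0x1:0xfffffffffffffffe.  Since
 * s2 < 0, hi -= s1 leaves hi = 0xffffffffffffffff, so hi:lo is the
 * 128-bit value -2 and the returned high half matches -1 * 2.
 */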
1685 
1686 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1687 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1688 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1689 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1690 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1691 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1692 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1693 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1694 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1695 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1696 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1697 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1698 GEN_VEXT_VV(vmulh_vv_b, 1)
1699 GEN_VEXT_VV(vmulh_vv_h, 2)
1700 GEN_VEXT_VV(vmulh_vv_w, 4)
1701 GEN_VEXT_VV(vmulh_vv_d, 8)
1702 GEN_VEXT_VV(vmulhu_vv_b, 1)
1703 GEN_VEXT_VV(vmulhu_vv_h, 2)
1704 GEN_VEXT_VV(vmulhu_vv_w, 4)
1705 GEN_VEXT_VV(vmulhu_vv_d, 8)
1706 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1707 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1708 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1709 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1710 
1711 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1712 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1713 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1714 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1715 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1716 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1717 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1718 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1719 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1720 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1721 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1722 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1723 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1724 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1725 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1726 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1727 GEN_VEXT_VX(vmul_vx_b, 1)
1728 GEN_VEXT_VX(vmul_vx_h, 2)
1729 GEN_VEXT_VX(vmul_vx_w, 4)
1730 GEN_VEXT_VX(vmul_vx_d, 8)
1731 GEN_VEXT_VX(vmulh_vx_b, 1)
1732 GEN_VEXT_VX(vmulh_vx_h, 2)
1733 GEN_VEXT_VX(vmulh_vx_w, 4)
1734 GEN_VEXT_VX(vmulh_vx_d, 8)
1735 GEN_VEXT_VX(vmulhu_vx_b, 1)
1736 GEN_VEXT_VX(vmulhu_vx_h, 2)
1737 GEN_VEXT_VX(vmulhu_vx_w, 4)
1738 GEN_VEXT_VX(vmulhu_vx_d, 8)
1739 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1740 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1741 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1742 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1743 
1744 /* Vector Integer Divide Instructions */
1745 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1746 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1747 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1748         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1749 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1750         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
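
/*
 * The (N == -N) test identifies the most-negative value, the only
 * non-zero value equal to its own negation under two's-complement
 * wrap-around, so INT_MIN / -1 and INT_MIN % -1 yield INT_MIN and 0 as
 * the RVV spec requires, without evaluating the C division that would
 * overflow.  N == 0 also passes the test, which is harmless because
 * 0 / -1 and 0 % -1 are 0 anyway.  Division by zero is likewise
 * special-cased to all ones (quotient) or the dividend (remainder),
 * matching the spec and avoiding a host division by zero.
 */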
1751 
1752 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1753 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1754 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1755 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1756 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1757 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1758 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1759 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1760 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1761 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1762 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1763 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1764 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1765 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1766 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1767 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1768 GEN_VEXT_VV(vdivu_vv_b, 1)
1769 GEN_VEXT_VV(vdivu_vv_h, 2)
1770 GEN_VEXT_VV(vdivu_vv_w, 4)
1771 GEN_VEXT_VV(vdivu_vv_d, 8)
1772 GEN_VEXT_VV(vdiv_vv_b, 1)
1773 GEN_VEXT_VV(vdiv_vv_h, 2)
1774 GEN_VEXT_VV(vdiv_vv_w, 4)
1775 GEN_VEXT_VV(vdiv_vv_d, 8)
1776 GEN_VEXT_VV(vremu_vv_b, 1)
1777 GEN_VEXT_VV(vremu_vv_h, 2)
1778 GEN_VEXT_VV(vremu_vv_w, 4)
1779 GEN_VEXT_VV(vremu_vv_d, 8)
1780 GEN_VEXT_VV(vrem_vv_b, 1)
1781 GEN_VEXT_VV(vrem_vv_h, 2)
1782 GEN_VEXT_VV(vrem_vv_w, 4)
1783 GEN_VEXT_VV(vrem_vv_d, 8)
1784 
1785 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1786 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1787 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1788 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1789 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1790 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1791 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1792 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1793 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1794 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1795 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1796 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1797 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1798 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1799 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1800 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1801 GEN_VEXT_VX(vdivu_vx_b, 1)
1802 GEN_VEXT_VX(vdivu_vx_h, 2)
1803 GEN_VEXT_VX(vdivu_vx_w, 4)
1804 GEN_VEXT_VX(vdivu_vx_d, 8)
1805 GEN_VEXT_VX(vdiv_vx_b, 1)
1806 GEN_VEXT_VX(vdiv_vx_h, 2)
1807 GEN_VEXT_VX(vdiv_vx_w, 4)
1808 GEN_VEXT_VX(vdiv_vx_d, 8)
1809 GEN_VEXT_VX(vremu_vx_b, 1)
1810 GEN_VEXT_VX(vremu_vx_h, 2)
1811 GEN_VEXT_VX(vremu_vx_w, 4)
1812 GEN_VEXT_VX(vremu_vx_d, 8)
1813 GEN_VEXT_VX(vrem_vx_b, 1)
1814 GEN_VEXT_VX(vrem_vx_h, 2)
1815 GEN_VEXT_VX(vrem_vx_w, 4)
1816 GEN_VEXT_VX(vrem_vx_d, 8)
1817 
1818 /* Vector Widening Integer Multiply Instructions */
1819 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1820 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1821 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1822 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1823 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1824 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1825 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1826 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1827 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1828 GEN_VEXT_VV(vwmul_vv_b, 2)
1829 GEN_VEXT_VV(vwmul_vv_h, 4)
1830 GEN_VEXT_VV(vwmul_vv_w, 8)
1831 GEN_VEXT_VV(vwmulu_vv_b, 2)
1832 GEN_VEXT_VV(vwmulu_vv_h, 4)
1833 GEN_VEXT_VV(vwmulu_vv_w, 8)
1834 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1835 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1836 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1837 
1838 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1839 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1840 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1841 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1842 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1843 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1844 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1845 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1846 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1847 GEN_VEXT_VX(vwmul_vx_b, 2)
1848 GEN_VEXT_VX(vwmul_vx_h, 4)
1849 GEN_VEXT_VX(vwmul_vx_w, 8)
1850 GEN_VEXT_VX(vwmulu_vx_b, 2)
1851 GEN_VEXT_VX(vwmulu_vx_h, 4)
1852 GEN_VEXT_VX(vwmulu_vx_w, 8)
1853 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1854 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1855 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1856 
1857 /* Vector Single-Width Integer Multiply-Add Instructions */
1858 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1859 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1860 {                                                                  \
1861     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1862     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1863     TD d = *((TD *)vd + HD(i));                                    \
1864     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1865 }
1866 
1867 #define DO_MACC(N, M, D) (M * N + D)
1868 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1869 #define DO_MADD(N, M, D) (M * D + N)
1870 #define DO_NMSUB(N, M, D) (-(M * D) + N)
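
/*
 * The ternary OPs are invoked as OP(s2, s1, d), so with N = vs2,
 * M = vs1 and D = vd the four macros implement:
 *     DO_MACC:   vd = vs1 * vs2 + vd        (vmacc)
 *     DO_NMSAC:  vd = -(vs1 * vs2) + vd     (vnmsac)
 *     DO_MADD:   vd = vs1 * vd + vs2        (vmadd)
 *     DO_NMSUB:  vd = -(vs1 * vd) + vs2     (vnmsub)
 */
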
1871 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1872 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1873 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1874 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1875 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1876 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1877 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1878 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1879 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1880 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1881 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1882 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1883 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1884 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1885 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1886 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1887 GEN_VEXT_VV(vmacc_vv_b, 1)
1888 GEN_VEXT_VV(vmacc_vv_h, 2)
1889 GEN_VEXT_VV(vmacc_vv_w, 4)
1890 GEN_VEXT_VV(vmacc_vv_d, 8)
1891 GEN_VEXT_VV(vnmsac_vv_b, 1)
1892 GEN_VEXT_VV(vnmsac_vv_h, 2)
1893 GEN_VEXT_VV(vnmsac_vv_w, 4)
1894 GEN_VEXT_VV(vnmsac_vv_d, 8)
1895 GEN_VEXT_VV(vmadd_vv_b, 1)
1896 GEN_VEXT_VV(vmadd_vv_h, 2)
1897 GEN_VEXT_VV(vmadd_vv_w, 4)
1898 GEN_VEXT_VV(vmadd_vv_d, 8)
1899 GEN_VEXT_VV(vnmsub_vv_b, 1)
1900 GEN_VEXT_VV(vnmsub_vv_h, 2)
1901 GEN_VEXT_VV(vnmsub_vv_w, 4)
1902 GEN_VEXT_VV(vnmsub_vv_d, 8)
1903 
1904 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1905 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1906 {                                                                   \
1907     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1908     TD d = *((TD *)vd + HD(i));                                     \
1909     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1910 }
1911 
1912 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1913 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1914 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1915 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1916 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1917 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1918 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1919 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1920 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1921 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1922 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1923 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1924 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1925 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1926 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1927 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1928 GEN_VEXT_VX(vmacc_vx_b, 1)
1929 GEN_VEXT_VX(vmacc_vx_h, 2)
1930 GEN_VEXT_VX(vmacc_vx_w, 4)
1931 GEN_VEXT_VX(vmacc_vx_d, 8)
1932 GEN_VEXT_VX(vnmsac_vx_b, 1)
1933 GEN_VEXT_VX(vnmsac_vx_h, 2)
1934 GEN_VEXT_VX(vnmsac_vx_w, 4)
1935 GEN_VEXT_VX(vnmsac_vx_d, 8)
1936 GEN_VEXT_VX(vmadd_vx_b, 1)
1937 GEN_VEXT_VX(vmadd_vx_h, 2)
1938 GEN_VEXT_VX(vmadd_vx_w, 4)
1939 GEN_VEXT_VX(vmadd_vx_d, 8)
1940 GEN_VEXT_VX(vnmsub_vx_b, 1)
1941 GEN_VEXT_VX(vnmsub_vx_h, 2)
1942 GEN_VEXT_VX(vnmsub_vx_w, 4)
1943 GEN_VEXT_VX(vnmsub_vx_d, 8)
1944 
1945 /* Vector Widening Integer Multiply-Add Instructions */
1946 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1947 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1948 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1949 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1950 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1951 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1952 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1953 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1954 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1955 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1956 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1957 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1958 GEN_VEXT_VV(vwmacc_vv_b, 2)
1959 GEN_VEXT_VV(vwmacc_vv_h, 4)
1960 GEN_VEXT_VV(vwmacc_vv_w, 8)
1961 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1962 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1963 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1964 
1965 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1968 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1969 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1970 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1971 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1972 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1973 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1974 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1975 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1976 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1977 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1978 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1979 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1980 GEN_VEXT_VX(vwmacc_vx_b, 2)
1981 GEN_VEXT_VX(vwmacc_vx_h, 4)
1982 GEN_VEXT_VX(vwmacc_vx_w, 8)
1983 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1984 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1985 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1986 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1987 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1988 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1989 
1990 /* Vector Integer Merge and Move Instructions */
1991 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1992 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1993                   uint32_t desc)                                     \
1994 {                                                                    \
1995     uint32_t vl = env->vl;                                           \
1996     uint32_t esz = sizeof(ETYPE);                                    \
1997     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1998     uint32_t vta = vext_vta(desc);                                   \
1999     uint32_t i;                                                      \
2000                                                                      \
2001     for (i = env->vstart; i < vl; i++) {                             \
2002         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2003         *((ETYPE *)vd + H(i)) = s1;                                  \
2004     }                                                                \
2005     env->vstart = 0;                                                 \
2006     /* set tail elements to 1s */                                    \
2007     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2008 }
2009 
2010 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2011 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2012 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2013 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2014 
2015 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2016 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2017                   uint32_t desc)                                     \
2018 {                                                                    \
2019     uint32_t vl = env->vl;                                           \
2020     uint32_t esz = sizeof(ETYPE);                                    \
2021     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2022     uint32_t vta = vext_vta(desc);                                   \
2023     uint32_t i;                                                      \
2024                                                                      \
2025     for (i = env->vstart; i < vl; i++) {                             \
2026         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2027     }                                                                \
2028     env->vstart = 0;                                                 \
2029     /* set tail elements to 1s */                                    \
2030     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2031 }
2032 
2033 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2034 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2035 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2036 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2037 
2038 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2039 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2040                   CPURISCVState *env, uint32_t desc)                 \
2041 {                                                                    \
2042     uint32_t vl = env->vl;                                           \
2043     uint32_t esz = sizeof(ETYPE);                                    \
2044     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2045     uint32_t vta = vext_vta(desc);                                   \
2046     uint32_t i;                                                      \
2047                                                                      \
2048     for (i = env->vstart; i < vl; i++) {                             \
2049         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2050         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2051     }                                                                \
2052     env->vstart = 0;                                                 \
2053     /* set tail elements to 1s */                                    \
2054     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2055 }
2056 
2057 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2058 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2059 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2060 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
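
/*
 * For vmerge the mask register is a data operand rather than a write
 * enable: every body element is written, taking vs2 where v0's bit is
 * clear and vs1 (or the scalar) where it is set, which is why these
 * helpers do no vm/vma processing.
 */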
2061 
2062 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2063 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2064                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2065 {                                                                    \
2066     uint32_t vl = env->vl;                                           \
2067     uint32_t esz = sizeof(ETYPE);                                    \
2068     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2069     uint32_t vta = vext_vta(desc);                                   \
2070     uint32_t i;                                                      \
2071                                                                      \
2072     for (i = env->vstart; i < vl; i++) {                             \
2073         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2074         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2075                    (ETYPE)(target_long)s1);                          \
2076         *((ETYPE *)vd + H(i)) = d;                                   \
2077     }                                                                \
2078     env->vstart = 0;                                                 \
2079     /* set tail elements to 1s */                                    \
2080     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2081 }
2082 
2083 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2084 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2085 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2086 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2087 
2088 /*
2089  * Vector Fixed-Point Arithmetic Instructions
2090  */
2091 
2092 /* Vector Single-Width Saturating Add and Subtract */
2093 
2094 /*
2095  * Fixed-point instructions carry a rounding mode and may saturate,
2096  * so the common fixed-point macros are defined here.
2097  */
2098 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2099                           CPURISCVState *env, int vxrm);
2100 
2101 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2102 static inline void                                                  \
2103 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2104           CPURISCVState *env, int vxrm)                             \
2105 {                                                                   \
2106     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2107     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2108     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2109 }
2110 
2111 static inline void
2112 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2113              CPURISCVState *env,
2114              uint32_t vl, uint32_t vm, int vxrm,
2115              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2116 {
2117     for (uint32_t i = env->vstart; i < vl; i++) {
2118         if (!vm && !vext_elem_mask(v0, i)) {
2119             /* set masked-off elements to 1s */
2120             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2121             continue;
2122         }
2123         fn(vd, vs1, vs2, i, env, vxrm);
2124     }
2125     env->vstart = 0;
2126 }
2127 
2128 static inline void
2129 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2130              CPURISCVState *env,
2131              uint32_t desc,
2132              opivv2_rm_fn *fn, uint32_t esz)
2133 {
2134     uint32_t vm = vext_vm(desc);
2135     uint32_t vl = env->vl;
2136     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2137     uint32_t vta = vext_vta(desc);
2138     uint32_t vma = vext_vma(desc);
2139 
2140     switch (env->vxrm) {
2141     case 0: /* rnu */
2142         vext_vv_rm_1(vd, v0, vs1, vs2,
2143                      env, vl, vm, 0, fn, vma, esz);
2144         break;
2145     case 1: /* rne */
2146         vext_vv_rm_1(vd, v0, vs1, vs2,
2147                      env, vl, vm, 1, fn, vma, esz);
2148         break;
2149     case 2: /* rdn */
2150         vext_vv_rm_1(vd, v0, vs1, vs2,
2151                      env, vl, vm, 2, fn, vma, esz);
2152         break;
2153     default: /* rod */
2154         vext_vv_rm_1(vd, v0, vs1, vs2,
2155                      env, vl, vm, 3, fn, vma, esz);
2156         break;
2157     }
2158     /* set tail elements to 1s */
2159     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2160 }
2161 
2162 /* generate helpers for fixed point instructions with OPIVV format */
2163 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2164 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2165                   CPURISCVState *env, uint32_t desc)            \
2166 {                                                               \
2167     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2168                  do_##NAME, ESZ);                               \
2169 }
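
/*
 * For reference, GEN_VEXT_VV_RM(vsaddu_vv_b, 1) expands to roughly:
 *
 *     void HELPER(vsaddu_vv_b)(void *vd, void *v0, void *vs1, void *vs2,
 *                              CPURISCVState *env, uint32_t desc)
 *     {
 *         vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_vsaddu_vv_b, 1);
 *     }
 *
 * where do_vsaddu_vv_b is the per-element function produced below by
 * RVVCALL(OPIVV2_RM, vsaddu_vv_b, ...).
 */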
2170 
2171 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2172                              uint8_t b)
2173 {
2174     uint8_t res = a + b;
2175     if (res < a) {
2176         res = UINT8_MAX;
2177         env->vxsat = 0x1;
2178     }
2179     return res;
2180 }
2181 
2182 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2183                                uint16_t b)
2184 {
2185     uint16_t res = a + b;
2186     if (res < a) {
2187         res = UINT16_MAX;
2188         env->vxsat = 0x1;
2189     }
2190     return res;
2191 }
2192 
2193 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2194                                uint32_t b)
2195 {
2196     uint32_t res = a + b;
2197     if (res < a) {
2198         res = UINT32_MAX;
2199         env->vxsat = 0x1;
2200     }
2201     return res;
2202 }
2203 
2204 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2205                                uint64_t b)
2206 {
2207     uint64_t res = a + b;
2208     if (res < a) {
2209         res = UINT64_MAX;
2210         env->vxsat = 0x1;
2211     }
2212     return res;
2213 }
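
/*
 * Unsigned overflow is detected with the usual wrap-around test: the
 * true sum is out of range iff the truncated result is smaller than an
 * operand.  E.g. for saddu8, a = 200, b = 100 gives res = 44 < 200, so
 * the result saturates to UINT8_MAX and vxsat is set.
 */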
2214 
2215 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2216 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2217 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2218 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2219 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2220 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2221 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2222 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2223 
2224 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2225                           CPURISCVState *env, int vxrm);
2226 
2227 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2228 static inline void                                                  \
2229 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2230           CPURISCVState *env, int vxrm)                             \
2231 {                                                                   \
2232     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2233     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2234 }
2235 
2236 static inline void
2237 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2238              CPURISCVState *env,
2239              uint32_t vl, uint32_t vm, int vxrm,
2240              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2241 {
2242     for (uint32_t i = env->vstart; i < vl; i++) {
2243         if (!vm && !vext_elem_mask(v0, i)) {
2244             /* set masked-off elements to 1s */
2245             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2246             continue;
2247         }
2248         fn(vd, s1, vs2, i, env, vxrm);
2249     }
2250     env->vstart = 0;
2251 }
2252 
2253 static inline void
2254 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2255              CPURISCVState *env,
2256              uint32_t desc,
2257              opivx2_rm_fn *fn, uint32_t esz)
2258 {
2259     uint32_t vm = vext_vm(desc);
2260     uint32_t vl = env->vl;
2261     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2262     uint32_t vta = vext_vta(desc);
2263     uint32_t vma = vext_vma(desc);
2264 
2265     switch (env->vxrm) {
2266     case 0: /* rnu */
2267         vext_vx_rm_1(vd, v0, s1, vs2,
2268                      env, vl, vm, 0, fn, vma, esz);
2269         break;
2270     case 1: /* rne */
2271         vext_vx_rm_1(vd, v0, s1, vs2,
2272                      env, vl, vm, 1, fn, vma, esz);
2273         break;
2274     case 2: /* rdn */
2275         vext_vx_rm_1(vd, v0, s1, vs2,
2276                      env, vl, vm, 2, fn, vma, esz);
2277         break;
2278     default: /* rod */
2279         vext_vx_rm_1(vd, v0, s1, vs2,
2280                      env, vl, vm, 3, fn, vma, esz);
2281         break;
2282     }
2283     /* set tail elements to 1s */
2284     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2285 }
2286 
2287 /* generate helpers for fixed point instructions with OPIVX format */
2288 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2289 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2290                   void *vs2, CPURISCVState *env,          \
2291                   uint32_t desc)                          \
2292 {                                                         \
2293     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2294                  do_##NAME, ESZ);                         \
2295 }
2296 
2297 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2298 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2299 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2300 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2301 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2302 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2303 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2304 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2305 
2306 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2307 {
2308     int8_t res = a + b;
2309     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2310         res = a > 0 ? INT8_MAX : INT8_MIN;
2311         env->vxsat = 0x1;
2312     }
2313     return res;
2314 }
2315 
2316 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2317                              int16_t b)
2318 {
2319     int16_t res = a + b;
2320     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2321         res = a > 0 ? INT16_MAX : INT16_MIN;
2322         env->vxsat = 0x1;
2323     }
2324     return res;
2325 }
2326 
2327 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2328                              int32_t b)
2329 {
2330     int32_t res = a + b;
2331     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2332         res = a > 0 ? INT32_MAX : INT32_MIN;
2333         env->vxsat = 0x1;
2334     }
2335     return res;
2336 }
2337 
2338 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2339                              int64_t b)
2340 {
2341     int64_t res = a + b;
2342     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2343         res = a > 0 ? INT64_MAX : INT64_MIN;
2344         env->vxsat = 0x1;
2345     }
2346     return res;
2347 }
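
/*
 * (res ^ a) & (res ^ b) masked with the sign bit is non-zero exactly
 * when a and b have the same sign but res has the opposite one, i.e.
 * when the addition overflowed.  E.g. sadd8 with a = 100, b = 100:
 * res = -56 disagrees in sign with both operands, so the result
 * saturates to INT8_MAX.
 */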
2348 
2349 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2350 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2351 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2352 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2353 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2354 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2355 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2356 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2357 
2358 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2359 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2360 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2361 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2362 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2363 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2364 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2365 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2366 
2367 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2368                              uint8_t b)
2369 {
2370     uint8_t res = a - b;
2371     if (res > a) {
2372         res = 0;
2373         env->vxsat = 0x1;
2374     }
2375     return res;
2376 }
2377 
2378 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2379                                uint16_t b)
2380 {
2381     uint16_t res = a - b;
2382     if (res > a) {
2383         res = 0;
2384         env->vxsat = 0x1;
2385     }
2386     return res;
2387 }
2388 
2389 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2390                                uint32_t b)
2391 {
2392     uint32_t res = a - b;
2393     if (res > a) {
2394         res = 0;
2395         env->vxsat = 0x1;
2396     }
2397     return res;
2398 }
2399 
2400 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2401                                uint64_t b)
2402 {
2403     uint64_t res = a - b;
2404     if (res > a) {
2405         res = 0;
2406         env->vxsat = 0x1;
2407     }
2408     return res;
2409 }
2410 
2411 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2412 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2413 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2414 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2415 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2416 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2417 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2418 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2419 
2420 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2421 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2422 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2423 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2424 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2425 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2426 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2427 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2428 
2429 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2430 {
2431     int8_t res = a - b;
2432     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2433         res = a >= 0 ? INT8_MAX : INT8_MIN;
2434         env->vxsat = 0x1;
2435     }
2436     return res;
2437 }
2438 
2439 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2440                              int16_t b)
2441 {
2442     int16_t res = a - b;
2443     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2444         res = a >= 0 ? INT16_MAX : INT16_MIN;
2445         env->vxsat = 0x1;
2446     }
2447     return res;
2448 }
2449 
2450 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2451                              int32_t b)
2452 {
2453     int32_t res = a - b;
2454     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2455         res = a >= 0 ? INT32_MAX : INT32_MIN;
2456         env->vxsat = 0x1;
2457     }
2458     return res;
2459 }
2460 
2461 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2462                              int64_t b)
2463 {
2464     int64_t res = a - b;
2465     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2466         res = a >= 0 ? INT64_MAX : INT64_MIN;
2467         env->vxsat = 0x1;
2468     }
2469     return res;
2470 }
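
/*
 * For subtraction the test is (res ^ a) & (a ^ b) against the sign bit:
 * overflow is only possible when the operands have opposite signs, and
 * it occurred when the result's sign differs from a's.  E.g. ssub8 with
 * a = -100, b = 100: res = 56, so the result saturates to INT8_MIN.
 */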
2471 
2472 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2473 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2474 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2475 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2476 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2477 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2478 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2479 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2480 
2481 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2482 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2483 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2484 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2485 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2486 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2487 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2488 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2489 
2490 /* Vector Single-Width Averaging Add and Subtract */
2491 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2492 {
2493     uint8_t d = extract64(v, shift, 1);
2494     uint8_t d1;
2495     uint64_t D1, D2;
2496 
2497     if (shift == 0 || shift > 64) {
2498         return 0;
2499     }
2500 
2501     d1 = extract64(v, shift - 1, 1);
2502     D1 = extract64(v, 0, shift);
2503     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2504         return d1;
2505     } else if (vxrm == 1) { /* round-to-nearest-even */
2506         if (shift > 1) {
2507             D2 = extract64(v, 0, shift - 1);
2508             return d1 & ((D2 != 0) | d);
2509         } else {
2510             return d1 & d;
2511         }
2512     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2513         return !d & (D1 != 0);
2514     }
2515     return 0; /* round-down (truncate) */
2516 }
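
/*
 * Worked example: v = 0b1011 (11), shift = 2, so v >> shift = 2 and the
 * discarded bits are 0b11.  Then d = 0, d1 = 1, D1 = 3, D2 = 1 and:
 *     rnu (0): round = d1                    = 1  ->  2 + 1 = 3
 *     rne (1): round = d1 & ((D2 != 0) | d)  = 1  ->  3
 *     rdn (2): round = 0                          ->  2
 *     rod (3): round = !d & (D1 != 0)        = 1  ->  3
 * which matches the expected roundings of 11 / 4 = 2.75.
 */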
2517 
2518 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2519                              int32_t b)
2520 {
2521     int64_t res = (int64_t)a + b;
2522     uint8_t round = get_round(vxrm, res, 1);
2523 
2524     return (res >> 1) + round;
2525 }
2526 
2527 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2528                              int64_t b)
2529 {
2530     int64_t res = a + b;
2531     uint8_t round = get_round(vxrm, res, 1);
2532     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2533 
2534     /* With signed overflow, bit 64 is inverse of bit 63. */
2535     return ((res >> 1) ^ over) + round;
2536 }
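
/*
 * aadd64 cannot widen to a larger type the way aadd32 does, so the
 * 65-bit sum is reconstructed from the truncated one: `over` is
 * INT64_MIN exactly when the addition overflowed, and XOR-ing it into
 * res >> 1 flips bit 63 back to the true bit 64 of the sum before the
 * rounding increment is applied.
 */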
2537 
2538 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2539 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2540 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2541 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2542 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2543 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2544 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2545 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2546 
2547 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2548 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2549 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2550 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2551 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2552 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2553 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2554 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2555 
2556 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2557                                uint32_t a, uint32_t b)
2558 {
2559     uint64_t res = (uint64_t)a + b;
2560     uint8_t round = get_round(vxrm, res, 1);
2561 
2562     return (res >> 1) + round;
2563 }
2564 
2565 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2566                                uint64_t a, uint64_t b)
2567 {
2568     uint64_t res = a + b;
2569     uint8_t round = get_round(vxrm, res, 1);
2570     uint64_t over = (uint64_t)(res < a) << 63;
2571 
2572     return ((res >> 1) | over) + round;
2573 }
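
/*
 * The unsigned variant recovers the carry out of the 64-bit addition
 * (res < a) and ORs it back in as bit 63 of the halved result, since
 * for unsigned inputs the true bit 64 is just that carry.
 */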
2574 
2575 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2576 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2577 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2578 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2579 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2580 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2581 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2582 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2583 
2584 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2585 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2586 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2587 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2588 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2589 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2590 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2591 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2592 
2593 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2594                              int32_t b)
2595 {
2596     int64_t res = (int64_t)a - b;
2597     uint8_t round = get_round(vxrm, res, 1);
2598 
2599     return (res >> 1) + round;
2600 }
2601 
2602 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2603                              int64_t b)
2604 {
2605     int64_t res = (int64_t)a - b;
2606     uint8_t round = get_round(vxrm, res, 1);
2607     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2608 
2609     /* With signed overflow, bit 64 is inverse of bit 63. */
2610     return ((res >> 1) ^ over) + round;
2611 }
2612 
2613 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2614 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2615 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2616 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2617 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2618 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2619 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2620 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2621 
2622 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2623 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2624 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2625 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2626 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2627 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2628 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2629 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2630 
2631 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2632                                uint32_t a, uint32_t b)
2633 {
2634     int64_t res = (int64_t)a - b;
2635     uint8_t round = get_round(vxrm, res, 1);
2636 
2637     return (res >> 1) + round;
2638 }
2639 
2640 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2641                                uint64_t a, uint64_t b)
2642 {
2643     uint64_t res = (uint64_t)a - b;
2644     uint8_t round = get_round(vxrm, res, 1);
2645     uint64_t over = (uint64_t)(res > a) << 63;
2646 
2647     return ((res >> 1) | over) + round;
2648 }
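/*
 * Here 'res > a' detects that the unsigned subtraction wrapped (a < b),
 * and bit 63 of the halved difference is set so the result reads as the
 * correct negative value modulo 2^64.  E.g. a = 0, b = 1 gives
 * res = UINT64_MAX, ((res >> 1) | over) = UINT64_MAX, and rnu rounding
 * adds 1, producing 0 -- i.e. -1/2 rounded to nearest-up.
 */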
2649 
2650 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2651 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2652 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2653 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2654 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2655 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2656 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2657 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2658 
2659 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2660 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2661 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2662 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2663 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2664 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2665 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2666 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2667 
2668 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2669 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2670 {
2671     uint8_t round;
2672     int16_t res;
2673 
2674     res = (int16_t)a * (int16_t)b;
2675     round = get_round(vxrm, res, 7);
2676     res = (res >> 7) + round;
2677 
2678     if (res > INT8_MAX) {
2679         env->vxsat = 0x1;
2680         return INT8_MAX;
2681     } else if (res < INT8_MIN) {
2682         env->vxsat = 0x1;
2683         return INT8_MIN;
2684     } else {
2685         return res;
2686     }
2687 }
2688 
2689 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2690 {
2691     uint8_t round;
2692     int32_t res;
2693 
2694     res = (int32_t)a * (int32_t)b;
2695     round = get_round(vxrm, res, 15);
2696     res = (res >> 15) + round;
2697 
2698     if (res > INT16_MAX) {
2699         env->vxsat = 0x1;
2700         return INT16_MAX;
2701     } else if (res < INT16_MIN) {
2702         env->vxsat = 0x1;
2703         return INT16_MIN;
2704     } else {
2705         return res;
2706     }
2707 }
2708 
2709 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2710 {
2711     uint8_t round;
2712     int64_t res;
2713 
2714     res = (int64_t)a * (int64_t)b;
2715     round = get_round(vxrm, res, 31);
2716     res = (res >> 31) + round;
2717 
2718     if (res > INT32_MAX) {
2719         env->vxsat = 0x1;
2720         return INT32_MAX;
2721     } else if (res < INT32_MIN) {
2722         env->vxsat = 0x1;
2723         return INT32_MIN;
2724     } else {
2725         return res;
2726     }
2727 }
2728 
2729 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2730 {
2731     uint8_t round;
2732     uint64_t hi_64, lo_64;
2733     int64_t res;
2734 
2735     if (a == INT64_MIN && b == INT64_MIN) {
2736         env->vxsat = 1;
2737         return INT64_MAX;
2738     }
2739 
2740     muls64(&lo_64, &hi_64, a, b);
2741     round = get_round(vxrm, lo_64, 63);
2742     /*
2743      * Cannot overflow, as there are always
2744      * 2 sign bits after multiply.
2745      */
2746     res = (hi_64 << 1) | (lo_64 >> 63);
2747     if (round) {
2748         if (res == INT64_MAX) {
2749             env->vxsat = 1;
2750         } else {
2751             res += 1;
2752         }
2753     }
2754     return res;
2755 }
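/*
 * vsmul is a signed Q(SEW-1) fixed-point multiply: the 2*SEW-bit product
 * is shifted right by SEW - 1 with rounding, then saturated.  For SEW = 64
 * the only product that cannot be represented after the shift is
 * INT64_MIN * INT64_MIN (2^126), which is why it is saturated up front;
 * every other product keeps two identical sign bits at the top, so
 * (hi_64 << 1) | (lo_64 >> 63) is exact and only the final rounding
 * increment can still push the value past INT64_MAX.
 */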
2756 
2757 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2758 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2759 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2760 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2761 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2762 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2763 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2764 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2765 
2766 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2767 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2768 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2769 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2770 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2771 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2772 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2773 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2774 
2775 /* Vector Single-Width Scaling Shift Instructions */
2776 static inline uint8_t
2777 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2778 {
2779     uint8_t round, shift = b & 0x7;
2780     uint8_t res;
2781 
2782     round = get_round(vxrm, a, shift);
2783     res = (a >> shift) + round;
2784     return res;
2785 }
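/*
 * A scaling shift is a plain shift right by the low log2(SEW) bits of the
 * shift operand, with the discarded bits folded back in according to
 * vxrm.  E.g. vssrl8(a = 150, b = 3) under rnu: 150 >> 3 = 18, the first
 * discarded bit is 1, so the result is 19 (150 / 8 = 18.75 rounded up).
 */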
2786 static inline uint16_t
2787 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2788 {
2789     uint8_t round, shift = b & 0xf;
2790 
2791     round = get_round(vxrm, a, shift);
2792     return (a >> shift) + round;
2793 }
2794 static inline uint32_t
2795 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2796 {
2797     uint8_t round, shift = b & 0x1f;
2798 
2799     round = get_round(vxrm, a, shift);
2800     return (a >> shift) + round;
2801 }
2802 static inline uint64_t
2803 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2804 {
2805     uint8_t round, shift = b & 0x3f;
2806 
2807     round = get_round(vxrm, a, shift);
2808     return (a >> shift) + round;
2809 }
2810 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2811 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2812 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2813 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2814 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2815 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2816 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2817 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2818 
2819 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2820 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2821 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2822 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2823 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2824 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2825 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2826 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2827 
2828 static inline int8_t
2829 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2830 {
2831     uint8_t round, shift = b & 0x7;
2832 
2833     round = get_round(vxrm, a, shift);
2834     return (a >> shift) + round;
2835 }
2836 static inline int16_t
2837 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2838 {
2839     uint8_t round, shift = b & 0xf;
2840 
2841     round = get_round(vxrm, a, shift);
2842     return (a >> shift) + round;
2843 }
2844 static inline int32_t
2845 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2846 {
2847     uint8_t round, shift = b & 0x1f;
2848 
2849     round = get_round(vxrm, a, shift);
2850     return (a >> shift) + round;
2851 }
2852 static inline int64_t
2853 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2854 {
2855     uint8_t round, shift = b & 0x3f;
2856 
2857     round = get_round(vxrm, a, shift);
2858     return (a >> shift) + round;
2859 }
2860 
2861 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2862 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2863 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2864 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2865 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2866 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2867 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2868 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2869 
2870 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2871 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2872 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2873 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2874 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2875 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2876 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2877 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2878 
2879 /* Vector Narrowing Fixed-Point Clip Instructions */
2880 static inline int8_t
2881 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2882 {
2883     uint8_t round, shift = b & 0xf;
2884     int16_t res;
2885 
2886     round = get_round(vxrm, a, shift);
2887     res = (a >> shift) + round;
2888     if (res > INT8_MAX) {
2889         env->vxsat = 0x1;
2890         return INT8_MAX;
2891     } else if (res < INT8_MIN) {
2892         env->vxsat = 0x1;
2893         return INT8_MIN;
2894     } else {
2895         return res;
2896     }
2897 }
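/*
 * vnclip narrows a 2*SEW-bit source to SEW bits: a scaling shift of the
 * wide value followed by signed saturation into the narrow range.  E.g.
 * vnclip8(a = 0x1234, b = 4) under rdn shifts to 0x123 (291), which
 * exceeds INT8_MAX, so the result is clamped to 127 and vxsat is set.
 */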
2898 
2899 static inline int16_t
2900 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2901 {
2902     uint8_t round, shift = b & 0x1f;
2903     int32_t res;
2904 
2905     round = get_round(vxrm, a, shift);
2906     res = (a >> shift) + round;
2907     if (res > INT16_MAX) {
2908         env->vxsat = 0x1;
2909         return INT16_MAX;
2910     } else if (res < INT16_MIN) {
2911         env->vxsat = 0x1;
2912         return INT16_MIN;
2913     } else {
2914         return res;
2915     }
2916 }
2917 
2918 static inline int32_t
2919 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2920 {
2921     uint8_t round, shift = b & 0x3f;
2922     int64_t res;
2923 
2924     round = get_round(vxrm, a, shift);
2925     res = (a >> shift) + round;
2926     if (res > INT32_MAX) {
2927         env->vxsat = 0x1;
2928         return INT32_MAX;
2929     } else if (res < INT32_MIN) {
2930         env->vxsat = 0x1;
2931         return INT32_MIN;
2932     } else {
2933         return res;
2934     }
2935 }
2936 
2937 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2938 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2939 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2940 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2941 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2942 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2943 
2944 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2945 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2946 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2947 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2948 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2949 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2950 
2951 static inline uint8_t
2952 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2953 {
2954     uint8_t round, shift = b & 0xf;
2955     uint16_t res;
2956 
2957     round = get_round(vxrm, a, shift);
2958     res = (a >> shift) + round;
2959     if (res > UINT8_MAX) {
2960         env->vxsat = 0x1;
2961         return UINT8_MAX;
2962     } else {
2963         return res;
2964     }
2965 }
2966 
2967 static inline uint16_t
2968 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2969 {
2970     uint8_t round, shift = b & 0x1f;
2971     uint32_t res;
2972 
2973     round = get_round(vxrm, a, shift);
2974     res = (a >> shift) + round;
2975     if (res > UINT16_MAX) {
2976         env->vxsat = 0x1;
2977         return UINT16_MAX;
2978     } else {
2979         return res;
2980     }
2981 }
2982 
2983 static inline uint32_t
2984 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2985 {
2986     uint8_t round, shift = b & 0x3f;
2987     uint64_t res;
2988 
2989     round = get_round(vxrm, a, shift);
2990     res = (a >> shift) + round;
2991     if (res > UINT32_MAX) {
2992         env->vxsat = 0x1;
2993         return UINT32_MAX;
2994     } else {
2995         return res;
2996     }
2997 }
2998 
2999 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3000 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3001 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3002 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3003 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3004 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3005 
3006 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3007 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3008 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3009 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3010 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3011 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3012 
3013 /*
3014  * Vector Float Point Arithmetic Instructions
3015  * Vector Floating-Point Arithmetic Instructions
3016 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3017 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3018 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3019                       CPURISCVState *env)                      \
3020 {                                                              \
3021     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3022     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3023     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3024 }
3025 
3026 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3027 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3028                   void *vs2, CPURISCVState *env,          \
3029                   uint32_t desc)                          \
3030 {                                                         \
3031     uint32_t vm = vext_vm(desc);                          \
3032     uint32_t vl = env->vl;                                \
3033     uint32_t total_elems =                                \
3034         vext_get_total_elems(env, desc, ESZ);             \
3035     uint32_t vta = vext_vta(desc);                        \
3036     uint32_t vma = vext_vma(desc);                        \
3037     uint32_t i;                                           \
3038                                                           \
3039     for (i = env->vstart; i < vl; i++) {                  \
3040         if (!vm && !vext_elem_mask(v0, i)) {              \
3041             /* set masked-off elements to 1s */           \
3042             vext_set_elems_1s(vd, vma, i * ESZ,           \
3043                               (i + 1) * ESZ);             \
3044             continue;                                     \
3045         }                                                 \
3046         do_##NAME(vd, vs1, vs2, i, env);                  \
3047     }                                                     \
3048     env->vstart = 0;                                      \
3049     /* set tail elements to 1s */                         \
3050     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3051                       total_elems * ESZ);                 \
3052 }
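/*
 * The two macros above work as a pair: RVVCALL(OPFVV2, vfadd_vv_h, ...)
 * instantiates the per-element do_vfadd_vv_h(), which applies the
 * softfloat OP to element i of vs2 and vs1, and GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
 * then emits the HELPER() loop that walks elements vstart..vl-1, skips
 * masked-off elements (writing all-1s to them when the mask-agnostic vma
 * bit is set), and finally fills the tail with 1s when vta is set.
 */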
3053 
3054 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3055 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3056 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3057 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3058 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3059 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3060 
3061 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3062 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3063                       CPURISCVState *env)                      \
3064 {                                                              \
3065     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3066     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3067 }
3068 
3069 #define GEN_VEXT_VF(NAME, ESZ)                            \
3070 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3071                   void *vs2, CPURISCVState *env,          \
3072                   uint32_t desc)                          \
3073 {                                                         \
3074     uint32_t vm = vext_vm(desc);                          \
3075     uint32_t vl = env->vl;                                \
3076     uint32_t total_elems =                                \
3077         vext_get_total_elems(env, desc, ESZ);             \
3078     uint32_t vta = vext_vta(desc);                        \
3079     uint32_t vma = vext_vma(desc);                        \
3080     uint32_t i;                                           \
3081                                                           \
3082     for (i = env->vstart; i < vl; i++) {                  \
3083         if (!vm && !vext_elem_mask(v0, i)) {              \
3084             /* set masked-off elements to 1s */           \
3085             vext_set_elems_1s(vd, vma, i * ESZ,           \
3086                               (i + 1) * ESZ);             \
3087             continue;                                     \
3088         }                                                 \
3089         do_##NAME(vd, s1, vs2, i, env);                   \
3090     }                                                     \
3091     env->vstart = 0;                                      \
3092     /* set tail elements to 1s */                         \
3093     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3094                       total_elems * ESZ);                 \
3095 }
3096 
3097 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3098 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3099 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3100 GEN_VEXT_VF(vfadd_vf_h, 2)
3101 GEN_VEXT_VF(vfadd_vf_w, 4)
3102 GEN_VEXT_VF(vfadd_vf_d, 8)
3103 
3104 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3105 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3106 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3107 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3108 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3109 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3110 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3111 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3112 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3113 GEN_VEXT_VF(vfsub_vf_h, 2)
3114 GEN_VEXT_VF(vfsub_vf_w, 4)
3115 GEN_VEXT_VF(vfsub_vf_d, 8)
3116 
3117 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3118 {
3119     return float16_sub(b, a, s);
3120 }
3121 
3122 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3123 {
3124     return float32_sub(b, a, s);
3125 }
3126 
3127 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3128 {
3129     return float64_sub(b, a, s);
3130 }
3131 
3132 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3133 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3134 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3135 GEN_VEXT_VF(vfrsub_vf_h, 2)
3136 GEN_VEXT_VF(vfrsub_vf_w, 4)
3137 GEN_VEXT_VF(vfrsub_vf_d, 8)
3138 
3139 /* Vector Widening Floating-Point Add/Subtract Instructions */
3140 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3141 {
3142     return float32_add(float16_to_float32(a, true, s),
3143                        float16_to_float32(b, true, s), s);
3144 }
3145 
3146 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3147 {
3148     return float64_add(float32_to_float64(a, s),
3149                        float32_to_float64(b, s), s);
3151 }
3152 
3153 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3154 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3155 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3156 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3157 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3158 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3159 GEN_VEXT_VF(vfwadd_vf_h, 4)
3160 GEN_VEXT_VF(vfwadd_vf_w, 8)
3161 
3162 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3163 {
3164     return float32_sub(float16_to_float32(a, true, s),
3165                        float16_to_float32(b, true, s), s);
3166 }
3167 
3168 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3169 {
3170     return float64_sub(float32_to_float64(a, s),
3171                        float32_to_float64(b, s), s);
3173 }
3174 
3175 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3176 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3177 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3178 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3179 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3180 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3181 GEN_VEXT_VF(vfwsub_vf_h, 4)
3182 GEN_VEXT_VF(vfwsub_vf_w, 8)
3183 
3184 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3185 {
3186     return float32_add(a, float16_to_float32(b, true, s), s);
3187 }
3188 
3189 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3190 {
3191     return float64_add(a, float32_to_float64(b, s), s);
3192 }
3193 
3194 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3195 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3196 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3197 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3198 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3199 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3200 GEN_VEXT_VF(vfwadd_wf_h, 4)
3201 GEN_VEXT_VF(vfwadd_wf_w, 8)
3202 
3203 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3204 {
3205     return float32_sub(a, float16_to_float32(b, true, s), s);
3206 }
3207 
3208 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3209 {
3210     return float64_sub(a, float32_to_float64(b, s), s);
3211 }
3212 
3213 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3214 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3215 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3216 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3217 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3218 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3219 GEN_VEXT_VF(vfwsub_wf_h, 4)
3220 GEN_VEXT_VF(vfwsub_wf_w, 8)
3221 
3222 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3223 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3224 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3225 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3226 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3227 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3228 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3229 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3230 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3231 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3232 GEN_VEXT_VF(vfmul_vf_h, 2)
3233 GEN_VEXT_VF(vfmul_vf_w, 4)
3234 GEN_VEXT_VF(vfmul_vf_d, 8)
3235 
3236 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3237 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3238 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3239 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3240 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3241 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3242 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3243 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3244 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3245 GEN_VEXT_VF(vfdiv_vf_h, 2)
3246 GEN_VEXT_VF(vfdiv_vf_w, 4)
3247 GEN_VEXT_VF(vfdiv_vf_d, 8)
3248 
3249 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3250 {
3251     return float16_div(b, a, s);
3252 }
3253 
3254 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3255 {
3256     return float32_div(b, a, s);
3257 }
3258 
3259 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3260 {
3261     return float64_div(b, a, s);
3262 }
3263 
3264 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3265 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3266 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3267 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3268 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3269 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3270 
3271 /* Vector Widening Floating-Point Multiply */
3272 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3273 {
3274     return float32_mul(float16_to_float32(a, true, s),
3275                        float16_to_float32(b, true, s), s);
3276 }
3277 
3278 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3279 {
3280     return float64_mul(float32_to_float64(a, s),
3281                        float32_to_float64(b, s), s);
3283 }
3284 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3285 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3286 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3287 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3288 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3289 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3290 GEN_VEXT_VF(vfwmul_vf_h, 4)
3291 GEN_VEXT_VF(vfwmul_vf_w, 8)
3292 
3293 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3294 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3295 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3296                       CPURISCVState *env)                          \
3297 {                                                                  \
3298     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3299     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3300     TD d = *((TD *)vd + HD(i));                                    \
3301     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3302 }
3303 
3304 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3305 {
3306     return float16_muladd(a, b, d, 0, s);
3307 }
3308 
3309 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3310 {
3311     return float32_muladd(a, b, d, 0, s);
3312 }
3313 
3314 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3315 {
3316     return float64_muladd(a, b, d, 0, s);
3317 }
3318 
3319 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3320 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3321 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3322 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3323 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3324 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3325 
3326 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3327 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3328                       CPURISCVState *env)                         \
3329 {                                                                 \
3330     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3331     TD d = *((TD *)vd + HD(i));                                   \
3332     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3333 }
3334 
3335 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3336 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3337 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3338 GEN_VEXT_VF(vfmacc_vf_h, 2)
3339 GEN_VEXT_VF(vfmacc_vf_w, 4)
3340 GEN_VEXT_VF(vfmacc_vf_d, 8)
3341 
3342 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3343 {
3344     return float16_muladd(a, b, d, float_muladd_negate_c |
3345                                    float_muladd_negate_product, s);
3346 }
3347 
3348 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3349 {
3350     return float32_muladd(a, b, d, float_muladd_negate_c |
3351                                    float_muladd_negate_product, s);
3352 }
3353 
3354 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3355 {
3356     return float64_muladd(a, b, d, float_muladd_negate_c |
3357                                    float_muladd_negate_product, s);
3358 }
3359 
3360 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3361 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3362 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3363 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3364 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3365 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3366 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3367 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3368 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3369 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3370 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3371 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3372 
3373 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3374 {
3375     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3376 }
3377 
3378 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3379 {
3380     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3381 }
3382 
3383 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3384 {
3385     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3386 }
3387 
3388 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3389 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3390 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3391 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3392 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3393 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3394 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3395 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3396 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3397 GEN_VEXT_VF(vfmsac_vf_h, 2)
3398 GEN_VEXT_VF(vfmsac_vf_w, 4)
3399 GEN_VEXT_VF(vfmsac_vf_d, 8)
3400 
3401 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3402 {
3403     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3404 }
3405 
3406 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3407 {
3408     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3409 }
3410 
3411 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3412 {
3413     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3414 }
3415 
3416 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3417 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3418 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3419 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3420 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3421 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3422 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3423 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3424 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3425 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3426 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3427 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3428 
3429 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3430 {
3431     return float16_muladd(d, b, a, 0, s);
3432 }
3433 
3434 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3435 {
3436     return float32_muladd(d, b, a, 0, s);
3437 }
3438 
3439 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3440 {
3441     return float64_muladd(d, b, a, 0, s);
3442 }
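/*
 * Note the operand order: OPFVV3/OPFVF3 invoke OP(s2, s1, d), so in these
 * helpers a is vs2, b is vs1 (or the scalar) and d is the accumulator vd.
 * The *macc/*msac forms multiply a * b (i.e. vs2 * vs1) and use d (vd) as
 * the addend, while the *madd/*msub forms multiply d * b (vd * vs1) and
 * use a (vs2) as the addend, with the negate flags selecting the signs.
 */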
3443 
3444 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3445 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3446 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3447 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3448 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3449 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3450 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3451 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3452 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3453 GEN_VEXT_VF(vfmadd_vf_h, 2)
3454 GEN_VEXT_VF(vfmadd_vf_w, 4)
3455 GEN_VEXT_VF(vfmadd_vf_d, 8)
3456 
3457 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3458 {
3459     return float16_muladd(d, b, a, float_muladd_negate_c |
3460                                    float_muladd_negate_product, s);
3461 }
3462 
3463 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3464 {
3465     return float32_muladd(d, b, a, float_muladd_negate_c |
3466                                    float_muladd_negate_product, s);
3467 }
3468 
3469 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3470 {
3471     return float64_muladd(d, b, a, float_muladd_negate_c |
3472                                    float_muladd_negate_product, s);
3473 }
3474 
3475 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3476 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3477 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3478 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3479 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3480 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3481 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3482 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3483 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3484 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3485 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3486 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3487 
3488 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3489 {
3490     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3491 }
3492 
3493 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3494 {
3495     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3496 }
3497 
3498 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3499 {
3500     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3501 }
3502 
3503 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3504 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3505 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3506 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3507 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3508 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3509 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3510 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3511 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3512 GEN_VEXT_VF(vfmsub_vf_h, 2)
3513 GEN_VEXT_VF(vfmsub_vf_w, 4)
3514 GEN_VEXT_VF(vfmsub_vf_d, 8)
3515 
3516 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3517 {
3518     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3519 }
3520 
3521 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3522 {
3523     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3524 }
3525 
3526 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3527 {
3528     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3529 }
3530 
3531 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3532 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3533 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3534 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3535 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3536 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3537 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3538 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3539 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3540 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3541 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3542 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3543 
3544 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3545 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3546 {
3547     return float32_muladd(float16_to_float32(a, true, s),
3548                           float16_to_float32(b, true, s), d, 0, s);
3549 }
3550 
3551 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3552 {
3553     return float64_muladd(float32_to_float64(a, s),
3554                           float32_to_float64(b, s), d, 0, s);
3555 }
3556 
3557 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3558 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3559 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3560 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3561 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3562 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3563 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3564 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3565 
3566 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3567 {
3568     return float32_muladd(float16_to_float32(a, true, s),
3569                           float16_to_float32(b, true, s), d,
3570                           float_muladd_negate_c | float_muladd_negate_product,
3571                           s);
3572 }
3573 
3574 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3575 {
3576     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3577                           d, float_muladd_negate_c |
3578                              float_muladd_negate_product, s);
3579 }
3580 
3581 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3582 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3583 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3584 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3585 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3586 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3587 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3588 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3589 
3590 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3591 {
3592     return float32_muladd(float16_to_float32(a, true, s),
3593                           float16_to_float32(b, true, s), d,
3594                           float_muladd_negate_c, s);
3595 }
3596 
3597 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3598 {
3599     return float64_muladd(float32_to_float64(a, s),
3600                           float32_to_float64(b, s), d,
3601                           float_muladd_negate_c, s);
3602 }
3603 
3604 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3605 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3606 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3607 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3608 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3609 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3610 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3611 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3612 
3613 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3614 {
3615     return float32_muladd(float16_to_float32(a, true, s),
3616                           float16_to_float32(b, true, s), d,
3617                           float_muladd_negate_product, s);
3618 }
3619 
3620 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3621 {
3622     return float64_muladd(float32_to_float64(a, s),
3623                           float32_to_float64(b, s), d,
3624                           float_muladd_negate_product, s);
3625 }
3626 
3627 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3628 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3629 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3630 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3631 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3632 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3633 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3634 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3635 
3636 /* Vector Floating-Point Square-Root Instruction */
3637 /* (TD, T2, TX2) */
3638 #define OP_UU_H uint16_t, uint16_t, uint16_t
3639 #define OP_UU_W uint32_t, uint32_t, uint32_t
3640 #define OP_UU_D uint64_t, uint64_t, uint64_t
3641 
3642 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3643 static void do_##NAME(void *vd, void *vs2, int i,      \
3644                       CPURISCVState *env)              \
3645 {                                                      \
3646     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3647     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3648 }
3649 
3650 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3651 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3652                   CPURISCVState *env, uint32_t desc)   \
3653 {                                                      \
3654     uint32_t vm = vext_vm(desc);                       \
3655     uint32_t vl = env->vl;                             \
3656     uint32_t total_elems =                             \
3657         vext_get_total_elems(env, desc, ESZ);          \
3658     uint32_t vta = vext_vta(desc);                     \
3659     uint32_t vma = vext_vma(desc);                     \
3660     uint32_t i;                                        \
3661                                                        \
3662     if (vl == 0) {                                     \
3663         return;                                        \
3664     }                                                  \
3665     for (i = env->vstart; i < vl; i++) {               \
3666         if (!vm && !vext_elem_mask(v0, i)) {           \
3667             /* set masked-off elements to 1s */        \
3668             vext_set_elems_1s(vd, vma, i * ESZ,        \
3669                               (i + 1) * ESZ);          \
3670             continue;                                  \
3671         }                                              \
3672         do_##NAME(vd, vs2, i, env);                    \
3673     }                                                  \
3674     env->vstart = 0;                                   \
3675     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3676                       total_elems * ESZ);              \
3677 }
3678 
3679 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3680 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3681 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3682 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3683 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3684 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3685 
3686 /*
3687  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3688  *
3689  * Adapted from riscv-v-spec recip.c:
3690  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3691  */
3692 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3693 {
3694     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3695     uint64_t exp = extract64(f, frac_size, exp_size);
3696     uint64_t frac = extract64(f, 0, frac_size);
3697 
3698     const uint8_t lookup_table[] = {
3699         52, 51, 50, 48, 47, 46, 44, 43,
3700         42, 41, 40, 39, 38, 36, 35, 34,
3701         33, 32, 31, 30, 30, 29, 28, 27,
3702         26, 25, 24, 23, 23, 22, 21, 20,
3703         19, 19, 18, 17, 16, 16, 15, 14,
3704         14, 13, 12, 12, 11, 10, 10, 9,
3705         9, 8, 7, 7, 6, 6, 5, 4,
3706         4, 3, 3, 2, 2, 1, 1, 0,
3707         127, 125, 123, 121, 119, 118, 116, 114,
3708         113, 111, 109, 108, 106, 105, 103, 102,
3709         100, 99, 97, 96, 95, 93, 92, 91,
3710         90, 88, 87, 86, 85, 84, 83, 82,
3711         80, 79, 78, 77, 76, 75, 74, 73,
3712         72, 71, 70, 70, 69, 68, 67, 66,
3713         65, 64, 63, 63, 62, 61, 60, 59,
3714         59, 58, 57, 56, 56, 55, 54, 53
3715     };
3716     const int precision = 7;
3717 
3718     if (exp == 0 && frac != 0) { /* subnormal */
3719         /* Normalize the subnormal. */
3720         while (extract64(frac, frac_size - 1, 1) == 0) {
3721             exp--;
3722             frac <<= 1;
3723         }
3724 
3725         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3726     }
3727 
3728     int idx = ((exp & 1) << (precision - 1)) |
3729               (frac >> (frac_size - precision + 1));
3730     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3731                         (frac_size - precision);
3732     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3733 
3734     uint64_t val = 0;
3735     val = deposit64(val, 0, frac_size, out_frac);
3736     val = deposit64(val, frac_size, exp_size, out_exp);
3737     val = deposit64(val, frac_size + exp_size, 1, sign);
3738     return val;
3739 }
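/*
 * The 7-bit estimate is a table lookup: idx combines the low bit of the
 * (normalized) exponent with the top 6 fraction bits, and the output
 * exponent is floor((3 * bias - 1 - exp) / 2), which the unsigned ~exp
 * arithmetic above computes.  E.g. frsqrt7_s(4.0): exp = 129, frac = 0,
 * so idx = 64 and lookup_table[64] = 127, giving
 * 0.25 * (1 + 127/128) = 0.498..., within 2^-7 of 1/sqrt(4.0) = 0.5.
 */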
3740 
3741 static float16 frsqrt7_h(float16 f, float_status *s)
3742 {
3743     int exp_size = 5, frac_size = 10;
3744     bool sign = float16_is_neg(f);
3745 
3746     /*
3747      * frsqrt7(sNaN) = canonical NaN
3748      * frsqrt7(-inf) = canonical NaN
3749      * frsqrt7(-normal) = canonical NaN
3750      * frsqrt7(-subnormal) = canonical NaN
3751      */
3752     if (float16_is_signaling_nan(f, s) ||
3753         (float16_is_infinity(f) && sign) ||
3754         (float16_is_normal(f) && sign) ||
3755         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3756         s->float_exception_flags |= float_flag_invalid;
3757         return float16_default_nan(s);
3758     }
3759 
3760     /* frsqrt7(qNaN) = canonical NaN */
3761     if (float16_is_quiet_nan(f, s)) {
3762         return float16_default_nan(s);
3763     }
3764 
3765     /* frsqrt7(+-0) = +-inf */
3766     if (float16_is_zero(f)) {
3767         s->float_exception_flags |= float_flag_divbyzero;
3768         return float16_set_sign(float16_infinity, sign);
3769     }
3770 
3771     /* frsqrt7(+inf) = +0 */
3772     if (float16_is_infinity(f) && !sign) {
3773         return float16_set_sign(float16_zero, sign);
3774     }
3775 
3776     /* +normal, +subnormal */
3777     uint64_t val = frsqrt7(f, exp_size, frac_size);
3778     return make_float16(val);
3779 }
3780 
3781 static float32 frsqrt7_s(float32 f, float_status *s)
3782 {
3783     int exp_size = 8, frac_size = 23;
3784     bool sign = float32_is_neg(f);
3785 
3786     /*
3787      * frsqrt7(sNaN) = canonical NaN
3788      * frsqrt7(-inf) = canonical NaN
3789      * frsqrt7(-normal) = canonical NaN
3790      * frsqrt7(-subnormal) = canonical NaN
3791      */
3792     if (float32_is_signaling_nan(f, s) ||
3793         (float32_is_infinity(f) && sign) ||
3794         (float32_is_normal(f) && sign) ||
3795         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3796         s->float_exception_flags |= float_flag_invalid;
3797         return float32_default_nan(s);
3798     }
3799 
3800     /* frsqrt7(qNaN) = canonical NaN */
3801     if (float32_is_quiet_nan(f, s)) {
3802         return float32_default_nan(s);
3803     }
3804 
3805     /* frsqrt7(+-0) = +-inf */
3806     if (float32_is_zero(f)) {
3807         s->float_exception_flags |= float_flag_divbyzero;
3808         return float32_set_sign(float32_infinity, sign);
3809     }
3810 
3811     /* frsqrt7(+inf) = +0 */
3812     if (float32_is_infinity(f) && !sign) {
3813         return float32_set_sign(float32_zero, sign);
3814     }
3815 
3816     /* +normal, +subnormal */
3817     uint64_t val = frsqrt7(f, exp_size, frac_size);
3818     return make_float32(val);
3819 }
3820 
3821 static float64 frsqrt7_d(float64 f, float_status *s)
3822 {
3823     int exp_size = 11, frac_size = 52;
3824     bool sign = float64_is_neg(f);
3825 
3826     /*
3827      * frsqrt7(sNaN) = canonical NaN
3828      * frsqrt7(-inf) = canonical NaN
3829      * frsqrt7(-normal) = canonical NaN
3830      * frsqrt7(-subnormal) = canonical NaN
3831      */
3832     if (float64_is_signaling_nan(f, s) ||
3833         (float64_is_infinity(f) && sign) ||
3834         (float64_is_normal(f) && sign) ||
3835         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3836         s->float_exception_flags |= float_flag_invalid;
3837         return float64_default_nan(s);
3838     }
3839 
3840     /* frsqrt7(qNaN) = canonical NaN */
3841     if (float64_is_quiet_nan(f, s)) {
3842         return float64_default_nan(s);
3843     }
3844 
3845     /* frsqrt7(+-0) = +-inf */
3846     if (float64_is_zero(f)) {
3847         s->float_exception_flags |= float_flag_divbyzero;
3848         return float64_set_sign(float64_infinity, sign);
3849     }
3850 
3851     /* frsqrt7(+inf) = +0 */
3852     if (float64_is_infinity(f) && !sign) {
3853         return float64_set_sign(float64_zero, sign);
3854     }
3855 
3856     /* +normal, +subnormal */
3857     uint64_t val = frsqrt7(f, exp_size, frac_size);
3858     return make_float64(val);
3859 }
3860 
3861 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3862 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3863 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3864 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3865 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3866 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3867 
3868 /*
3869  * Vector Floating-Point Reciprocal Estimate Instruction
3870  *
3871  * Adapted from riscv-v-spec recip.c:
3872  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3873  */
3874 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3875                       float_status *s)
3876 {
3877     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3878     uint64_t exp = extract64(f, frac_size, exp_size);
3879     uint64_t frac = extract64(f, 0, frac_size);
3880 
3881     const uint8_t lookup_table[] = {
3882         127, 125, 123, 121, 119, 117, 116, 114,
3883         112, 110, 109, 107, 105, 104, 102, 100,
3884         99, 97, 96, 94, 93, 91, 90, 88,
3885         87, 85, 84, 83, 81, 80, 79, 77,
3886         76, 75, 74, 72, 71, 70, 69, 68,
3887         66, 65, 64, 63, 62, 61, 60, 59,
3888         58, 57, 56, 55, 54, 53, 52, 51,
3889         50, 49, 48, 47, 46, 45, 44, 43,
3890         42, 41, 40, 40, 39, 38, 37, 36,
3891         35, 35, 34, 33, 32, 31, 31, 30,
3892         29, 28, 28, 27, 26, 25, 25, 24,
3893         23, 23, 22, 21, 21, 20, 19, 19,
3894         18, 17, 17, 16, 15, 15, 14, 14,
3895         13, 12, 12, 11, 11, 10, 9, 9,
3896         8, 8, 7, 7, 6, 5, 5, 4,
3897         4, 3, 3, 2, 2, 1, 1, 0
3898     };
3899     const int precision = 7;
3900 
3901     if (exp == 0 && frac != 0) { /* subnormal */
3902         /* Normalize the subnormal. */
3903         while (extract64(frac, frac_size - 1, 1) == 0) {
3904             exp--;
3905             frac <<= 1;
3906         }
3907 
3908         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3909 
3910         if (exp != 0 && exp != UINT64_MAX) {
3911             /*
3912              * Overflow to inf or max value of same sign,
3913              * depending on sign and rounding mode.
3914              */
3915             s->float_exception_flags |= (float_flag_inexact |
3916                                          float_flag_overflow);
3917 
3918             if ((s->float_rounding_mode == float_round_to_zero) ||
3919                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3920                 ((s->float_rounding_mode == float_round_up) && sign)) {
3921                 /* Return the largest-magnitude finite value of the same sign. */
3922                 return (sign << (exp_size + frac_size)) |
3923                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3924             } else {
3925                 /* Return +-inf. */
3926                 return (sign << (exp_size + frac_size)) |
3927                        MAKE_64BIT_MASK(frac_size, exp_size);
3928             }
3929         }
3930     }
3931 
3932     int idx = frac >> (frac_size - precision);
3933     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3934                         (frac_size - precision);
3935     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3936 
3937     if (out_exp == 0 || out_exp == UINT64_MAX) {
3938         /*
3939          * The result is subnormal, but don't raise the underflow exception,
3940          * because there's no additional loss of precision.
3941          */
3942         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3943         if (out_exp == UINT64_MAX) {
3944             out_frac >>= 1;
3945             out_exp = 0;
3946         }
3947     }
3948 
3949     uint64_t val = 0;
3950     val = deposit64(val, 0, frac_size, out_frac);
3951     val = deposit64(val, frac_size, exp_size, out_exp);
3952     val = deposit64(val, frac_size + exp_size, 1, sign);
3953     return val;
3954 }
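/*
 * Same scheme as frsqrt7, but idx is simply the top 7 fraction bits and
 * the output exponent is 2 * bias - 1 - exp.  Subnormal inputs whose
 * reciprocal would not be finite overflow to +-inf or to the largest
 * finite value, depending on the rounding mode, and an out_exp of zero
 * (or one that wraps negative) is folded back into a subnormal result.
 * E.g. frec7_s(2.0): exp = 128, frac = 0, idx = 0, lookup_table[0] = 127,
 * giving 0.25 * (1 + 127/128) = 0.498..., within 2^-7 of 1/2.0 = 0.5.
 */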
3955 
3956 static float16 frec7_h(float16 f, float_status *s)
3957 {
3958     int exp_size = 5, frac_size = 10;
3959     bool sign = float16_is_neg(f);
3960 
3961     /* frec7(+-inf) = +-0 */
3962     if (float16_is_infinity(f)) {
3963         return float16_set_sign(float16_zero, sign);
3964     }
3965 
3966     /* frec7(+-0) = +-inf */
3967     if (float16_is_zero(f)) {
3968         s->float_exception_flags |= float_flag_divbyzero;
3969         return float16_set_sign(float16_infinity, sign);
3970     }
3971 
3972     /* frec7(sNaN) = canonical NaN */
3973     if (float16_is_signaling_nan(f, s)) {
3974         s->float_exception_flags |= float_flag_invalid;
3975         return float16_default_nan(s);
3976     }
3977 
3978     /* frec7(qNaN) = canonical NaN */
3979     if (float16_is_quiet_nan(f, s)) {
3980         return float16_default_nan(s);
3981     }
3982 
3983     /* +-normal, +-subnormal */
3984     uint64_t val = frec7(f, exp_size, frac_size, s);
3985     return make_float16(val);
3986 }
3987 
3988 static float32 frec7_s(float32 f, float_status *s)
3989 {
3990     int exp_size = 8, frac_size = 23;
3991     bool sign = float32_is_neg(f);
3992 
3993     /* frec7(+-inf) = +-0 */
3994     if (float32_is_infinity(f)) {
3995         return float32_set_sign(float32_zero, sign);
3996     }
3997 
3998     /* frec7(+-0) = +-inf */
3999     if (float32_is_zero(f)) {
4000         s->float_exception_flags |= float_flag_divbyzero;
4001         return float32_set_sign(float32_infinity, sign);
4002     }
4003 
4004     /* frec7(sNaN) = canonical NaN */
4005     if (float32_is_signaling_nan(f, s)) {
4006         s->float_exception_flags |= float_flag_invalid;
4007         return float32_default_nan(s);
4008     }
4009 
4010     /* frec7(qNaN) = canonical NaN */
4011     if (float32_is_quiet_nan(f, s)) {
4012         return float32_default_nan(s);
4013     }
4014 
4015     /* +-normal, +-subnormal */
4016     uint64_t val = frec7(f, exp_size, frac_size, s);
4017     return make_float32(val);
4018 }
4019 
4020 static float64 frec7_d(float64 f, float_status *s)
4021 {
4022     int exp_size = 11, frac_size = 52;
4023     bool sign = float64_is_neg(f);
4024 
4025     /* frec7(+-inf) = +-0 */
4026     if (float64_is_infinity(f)) {
4027         return float64_set_sign(float64_zero, sign);
4028     }
4029 
4030     /* frec7(+-0) = +-inf */
4031     if (float64_is_zero(f)) {
4032         s->float_exception_flags |= float_flag_divbyzero;
4033         return float64_set_sign(float64_infinity, sign);
4034     }
4035 
4036     /* frec7(sNaN) = canonical NaN */
4037     if (float64_is_signaling_nan(f, s)) {
4038         s->float_exception_flags |= float_flag_invalid;
4039         return float64_default_nan(s);
4040     }
4041 
4042     /* frec7(qNaN) = canonical NaN */
4043     if (float64_is_quiet_nan(f, s)) {
4044         return float64_default_nan(s);
4045     }
4046 
4047     /* +-normal, +-subnormal */
4048     uint64_t val = frec7(f, exp_size, frac_size, s);
4049     return make_float64(val);
4050 }
4051 
4052 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4053 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4054 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4055 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4056 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4057 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4058 
4059 /* Vector Floating-Point MIN/MAX Instructions */
4060 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4061 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4062 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4063 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4064 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4065 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4066 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4067 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4068 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4069 GEN_VEXT_VF(vfmin_vf_h, 2)
4070 GEN_VEXT_VF(vfmin_vf_w, 4)
4071 GEN_VEXT_VF(vfmin_vf_d, 8)
4072 
4073 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4074 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4075 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4076 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4077 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4078 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4079 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4080 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4081 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4082 GEN_VEXT_VF(vfmax_vf_h, 2)
4083 GEN_VEXT_VF(vfmax_vf_w, 4)
4084 GEN_VEXT_VF(vfmax_vf_d, 8)
4085 
4086 /* Vector Floating-Point Sign-Injection Instructions */
4087 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4088 {
4089     return deposit64(b, 0, 15, a);
4090 }
4091 
4092 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4093 {
4094     return deposit64(b, 0, 31, a);
4095 }
4096 
4097 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4098 {
4099     return deposit64(b, 0, 63, a);
4100 }
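
/*
 * Worked example (illustrative): fsgnj32 keeps the magnitude of its first
 * operand and takes the sign bit of its second, e.g.
 *
 *     fsgnj32(0x3f800000, 0xbf000000) == 0xbf800000    (1.0f, -0.5f -> -1.0f)
 *
 * fsgnjn32 below uses the complement of that sign bit, and fsgnjx32 XORs
 * the sign bits of the two operands.
 */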
4101 
4102 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4103 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4104 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4105 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4106 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4107 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4108 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4109 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4110 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4111 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4112 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4113 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4114 
4115 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4116 {
4117     return deposit64(~b, 0, 15, a);
4118 }
4119 
4120 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4121 {
4122     return deposit64(~b, 0, 31, a);
4123 }
4124 
4125 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4126 {
4127     return deposit64(~b, 0, 63, a);
4128 }
4129 
4130 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4131 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4132 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4133 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4134 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4135 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4136 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4137 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4138 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4139 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4140 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4141 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4142 
4143 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4144 {
4145     return deposit64(b ^ a, 0, 15, a);
4146 }
4147 
4148 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4149 {
4150     return deposit64(b ^ a, 0, 31, a);
4151 }
4152 
4153 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4154 {
4155     return deposit64(b ^ a, 0, 63, a);
4156 }
4157 
4158 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4159 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4160 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4161 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4162 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4163 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4164 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4165 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4166 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4167 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4168 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4169 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4170 
4171 /* Vector Floating-Point Compare Instructions */
4172 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4173 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4174                   CPURISCVState *env, uint32_t desc)          \
4175 {                                                             \
4176     uint32_t vm = vext_vm(desc);                              \
4177     uint32_t vl = env->vl;                                    \
4178     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
4179     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4180     uint32_t vma = vext_vma(desc);                            \
4181     uint32_t i;                                               \
4182                                                               \
4183     for (i = env->vstart; i < vl; i++) {                      \
4184         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4185         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4186         if (!vm && !vext_elem_mask(v0, i)) {                  \
4187             /* set masked-off elements to 1s */               \
4188             if (vma) {                                        \
4189                 vext_set_elem_mask(vd, i, 1);                 \
4190             }                                                 \
4191             continue;                                         \
4192         }                                                     \
4193         vext_set_elem_mask(vd, i,                             \
4194                            DO_OP(s2, s1, &env->fp_status));   \
4195     }                                                         \
4196     env->vstart = 0;                                          \
4197     /*
4198      * mask destination register is always tail-agnostic
4199      * set tail elements to 1s
4200      */                                                       \
4201     if (vta_all_1s) {                                         \
4202         for (; i < total_elems; i++) {                        \
4203             vext_set_elem_mask(vd, i, 1);                     \
4204         }                                                     \
4205     }                                                         \
4206 }
4207 
4208 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4209 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4210 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4211 
4212 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4213 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4214                   CPURISCVState *env, uint32_t desc)                \
4215 {                                                                   \
4216     uint32_t vm = vext_vm(desc);                                    \
4217     uint32_t vl = env->vl;                                          \
4218     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4219     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4220     uint32_t vma = vext_vma(desc);                                  \
4221     uint32_t i;                                                     \
4222                                                                     \
4223     for (i = env->vstart; i < vl; i++) {                            \
4224         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4225         if (!vm && !vext_elem_mask(v0, i)) {                        \
4226             /* set masked-off elements to 1s */                     \
4227             if (vma) {                                              \
4228                 vext_set_elem_mask(vd, i, 1);                       \
4229             }                                                       \
4230             continue;                                               \
4231         }                                                           \
4232         vext_set_elem_mask(vd, i,                                   \
4233                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4234     }                                                               \
4235     env->vstart = 0;                                                \
4236     /*
4237      * mask destination register is always tail-agnostic
4238      * set tail elements to 1s
4239      */                                                             \
4240     if (vta_all_1s) {                                               \
4241         for (; i < total_elems; i++) {                              \
4242             vext_set_elem_mask(vd, i, 1);                           \
4243         }                                                           \
4244     }                                                               \
4245 }
4246 
4247 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4248 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4249 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4250 
4251 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4252 {
4253     FloatRelation compare = float16_compare_quiet(a, b, s);
4254     return compare != float_relation_equal;
4255 }
4256 
4257 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4258 {
4259     FloatRelation compare = float32_compare_quiet(a, b, s);
4260     return compare != float_relation_equal;
4261 }
4262 
4263 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4264 {
4265     FloatRelation compare = float64_compare_quiet(a, b, s);
4266     return compare != float_relation_equal;
4267 }
4268 
4269 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4270 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4271 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4272 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4273 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4274 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4275 
4276 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4277 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4278 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4279 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4280 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4281 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4282 
4283 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4284 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4285 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4286 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4287 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4288 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4289 
4290 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4291 {
4292     FloatRelation compare = float16_compare(a, b, s);
4293     return compare == float_relation_greater;
4294 }
4295 
4296 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4297 {
4298     FloatRelation compare = float32_compare(a, b, s);
4299     return compare == float_relation_greater;
4300 }
4301 
4302 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4303 {
4304     FloatRelation compare = float64_compare(a, b, s);
4305     return compare == float_relation_greater;
4306 }
4307 
4308 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4309 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4310 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4311 
4312 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4313 {
4314     FloatRelation compare = float16_compare(a, b, s);
4315     return compare == float_relation_greater ||
4316            compare == float_relation_equal;
4317 }
4318 
4319 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4320 {
4321     FloatRelation compare = float32_compare(a, b, s);
4322     return compare == float_relation_greater ||
4323            compare == float_relation_equal;
4324 }
4325 
4326 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4327 {
4328     FloatRelation compare = float64_compare(a, b, s);
4329     return compare == float_relation_greater ||
4330            compare == float_relation_equal;
4331 }
4332 
4333 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4334 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4335 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
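
/*
 * vmfgt/vmfge exist only in .vf form, so the helpers above receive the vs2
 * element as 'a' and the scalar rs1 as 'b' and thus compute vs2[i] > rs1
 * and vs2[i] >= rs1.  Like vmflt/vmfle they use the signaling compare,
 * whereas vmfeq/vmfne use the quiet compare.
 */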
4336 
4337 /* Vector Floating-Point Classify Instruction */
4338 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4339 static void do_##NAME(void *vd, void *vs2, int i)      \
4340 {                                                      \
4341     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4342     *((TD *)vd + HD(i)) = OP(s2);                      \
4343 }
4344 
4345 #define GEN_VEXT_V(NAME, ESZ)                          \
4346 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4347                   CPURISCVState *env, uint32_t desc)   \
4348 {                                                      \
4349     uint32_t vm = vext_vm(desc);                       \
4350     uint32_t vl = env->vl;                             \
4351     uint32_t total_elems =                             \
4352         vext_get_total_elems(env, desc, ESZ);          \
4353     uint32_t vta = vext_vta(desc);                     \
4354     uint32_t vma = vext_vma(desc);                     \
4355     uint32_t i;                                        \
4356                                                        \
4357     for (i = env->vstart; i < vl; i++) {               \
4358         if (!vm && !vext_elem_mask(v0, i)) {           \
4359             /* set masked-off elements to 1s */        \
4360             vext_set_elems_1s(vd, vma, i * ESZ,        \
4361                               (i + 1) * ESZ);          \
4362             continue;                                  \
4363         }                                              \
4364         do_##NAME(vd, vs2, i);                         \
4365     }                                                  \
4366     env->vstart = 0;                                   \
4367     /* set tail elements to 1s */                      \
4368     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4369                       total_elems * ESZ);              \
4370 }
4371 
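/*
 * For reference, the classification result below is a one-hot encoding in
 * the low ten bits, as for the scalar fclass instructions:
 *
 *     bit 0: negative infinity      bit 5: positive subnormal
 *     bit 1: negative normal        bit 6: positive normal
 *     bit 2: negative subnormal     bit 7: positive infinity
 *     bit 3: negative zero          bit 8: signaling NaN
 *     bit 4: positive zero          bit 9: quiet NaN
 */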
4372 target_ulong fclass_h(uint64_t frs1)
4373 {
4374     float16 f = frs1;
4375     bool sign = float16_is_neg(f);
4376 
4377     if (float16_is_infinity(f)) {
4378         return sign ? 1 << 0 : 1 << 7;
4379     } else if (float16_is_zero(f)) {
4380         return sign ? 1 << 3 : 1 << 4;
4381     } else if (float16_is_zero_or_denormal(f)) {
4382         return sign ? 1 << 2 : 1 << 5;
4383     } else if (float16_is_any_nan(f)) {
4384         float_status s = { }; /* for snan_bit_is_one */
4385         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4386     } else {
4387         return sign ? 1 << 1 : 1 << 6;
4388     }
4389 }
4390 
4391 target_ulong fclass_s(uint64_t frs1)
4392 {
4393     float32 f = frs1;
4394     bool sign = float32_is_neg(f);
4395 
4396     if (float32_is_infinity(f)) {
4397         return sign ? 1 << 0 : 1 << 7;
4398     } else if (float32_is_zero(f)) {
4399         return sign ? 1 << 3 : 1 << 4;
4400     } else if (float32_is_zero_or_denormal(f)) {
4401         return sign ? 1 << 2 : 1 << 5;
4402     } else if (float32_is_any_nan(f)) {
4403         float_status s = { }; /* for snan_bit_is_one */
4404         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4405     } else {
4406         return sign ? 1 << 1 : 1 << 6;
4407     }
4408 }
4409 
4410 target_ulong fclass_d(uint64_t frs1)
4411 {
4412     float64 f = frs1;
4413     bool sign = float64_is_neg(f);
4414 
4415     if (float64_is_infinity(f)) {
4416         return sign ? 1 << 0 : 1 << 7;
4417     } else if (float64_is_zero(f)) {
4418         return sign ? 1 << 3 : 1 << 4;
4419     } else if (float64_is_zero_or_denormal(f)) {
4420         return sign ? 1 << 2 : 1 << 5;
4421     } else if (float64_is_any_nan(f)) {
4422         float_status s = { }; /* for snan_bit_is_one */
4423         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4424     } else {
4425         return sign ? 1 << 1 : 1 << 6;
4426     }
4427 }
4428 
4429 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4430 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4431 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4432 GEN_VEXT_V(vfclass_v_h, 2)
4433 GEN_VEXT_V(vfclass_v_w, 4)
4434 GEN_VEXT_V(vfclass_v_d, 8)
4435 
4436 /* Vector Floating-Point Merge Instruction */
4437 
4438 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4439 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4440                   CPURISCVState *env, uint32_t desc)          \
4441 {                                                             \
4442     uint32_t vm = vext_vm(desc);                              \
4443     uint32_t vl = env->vl;                                    \
4444     uint32_t esz = sizeof(ETYPE);                             \
4445     uint32_t total_elems =                                    \
4446         vext_get_total_elems(env, desc, esz);                 \
4447     uint32_t vta = vext_vta(desc);                            \
4448     uint32_t i;                                               \
4449                                                               \
4450     for (i = env->vstart; i < vl; i++) {                      \
4451         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4452         *((ETYPE *)vd + H(i)) =                               \
4453             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4454     }                                                         \
4455     env->vstart = 0;                                          \
4456     /* set tail elements to 1s */                             \
4457     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4458 }
4459 
4460 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4461 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4462 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
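
/*
 * Roughly, with vm clear (the vfmerge.vfm encoding) the helper above
 * implements
 *     vd[i] = vext_elem_mask(v0, i) ? f[rs1] : vs2[i];
 * i.e. selected elements receive the scalar while unselected elements are
 * copied through from vs2.
 */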
4463 
4464 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4465 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4466 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4467 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4468 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4469 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4470 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4471 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4472 
4473 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4474 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4475 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4476 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4477 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4478 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4479 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4480 
4481 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4482 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4483 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4484 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4485 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4486 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4487 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4488 
4489 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4490 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4491 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4492 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4493 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4494 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4495 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4496 
4497 /* Widening Floating-Point/Integer Type-Convert Instructions */
4498 /* (TD, T2, TX2) */
4499 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4500 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4501 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4502 /*
4503  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4504  */
4505 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4506 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4507 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4508 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4509 
4510 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4511 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4512 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4513 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4514 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4515 
4516 /*
4517  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4518  */
4519 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4520 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4521 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4522 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4523 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4524 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4525 
4526 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4527 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4528 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4529 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4530 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4531 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4532 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4533 
4534 /*
4535  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4536  */
4537 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4538 {
4539     return float16_to_float32(a, true, s);
4540 }
4541 
4542 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4543 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4544 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4545 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4546 
4547 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4548 /* (TD, T2, TX2) */
4549 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4550 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4551 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4552 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4553 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4554 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4555 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4556 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4557 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4558 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4559 
4560 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4561 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4562 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4563 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4564 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4565 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4566 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4567 
4568 /*
4569  * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4570  */
4571 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4572 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4573 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4574 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4575 
4576 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4577 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4578 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4579 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4580 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4581 
4582 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4583 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4584 {
4585     return float32_to_float16(a, true, s);
4586 }
4587 
4588 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4589 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4590 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4591 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4592 
4593 /*
4594  * Vector Reduction Operations
4595  */
4596 /* Vector Single-Width Integer Reduction Instructions */
4597 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4598 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4599                   void *vs2, CPURISCVState *env,          \
4600                   uint32_t desc)                          \
4601 {                                                         \
4602     uint32_t vm = vext_vm(desc);                          \
4603     uint32_t vl = env->vl;                                \
4604     uint32_t esz = sizeof(TD);                            \
4605     uint32_t vlenb = simd_maxsz(desc);                    \
4606     uint32_t vta = vext_vta(desc);                        \
4607     uint32_t i;                                           \
4608     TD s1 =  *((TD *)vs1 + HD(0));                        \
4609                                                           \
4610     for (i = env->vstart; i < vl; i++) {                  \
4611         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4612         if (!vm && !vext_elem_mask(v0, i)) {              \
4613             continue;                                     \
4614         }                                                 \
4615         s1 = OP(s1, (TD)s2);                              \
4616     }                                                     \
4617     *((TD *)vd + HD(0)) = s1;                             \
4618     env->vstart = 0;                                      \
4619     /* set tail elements to 1s */                         \
4620     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4621 }
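
/*
 * Worked example (illustrative): for vredsum.vs with vl = 4, all elements
 * active, vs1[0] = 10 and vs2 = {1, 2, 3, 4}, the loop above accumulates
 * s1 = 10 + 1 + 2 + 3 + 4, so vd[0] = 20 and the rest of vd is treated as
 * tail.
 */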
4622 
4623 /* vd[0] = sum(vs1[0], vs2[*]) */
4624 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4625 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4626 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4627 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4628 
4629 /* vd[0] = maxu(vs1[0], vs2[*]) */
4630 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4631 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4632 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4633 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4634 
4635 /* vd[0] = max(vs1[0], vs2[*]) */
4636 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4637 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4638 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4639 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4640 
4641 /* vd[0] = minu(vs1[0], vs2[*]) */
4642 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4643 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4644 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4645 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4646 
4647 /* vd[0] = min(vs1[0], vs2[*]) */
4648 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4649 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4650 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4651 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4652 
4653 /* vd[0] = and(vs1[0], vs2[*]) */
4654 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4655 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4656 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4657 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4658 
4659 /* vd[0] = or(vs1[0], vs2[*]) */
4660 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4661 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4662 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4663 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4664 
4665 /* vd[0] = xor(vs1[0], vs2[*]) */
4666 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4667 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4668 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4669 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4670 
4671 /* Vector Widening Integer Reduction Instructions */
4672 /* signed sum reduction into double-width accumulator */
4673 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4674 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4675 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4676 
4677 /* Unsigned sum reduction into double-width accumulator */
4678 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4679 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4680 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4681 
4682 /* Vector Single-Width Floating-Point Reduction Instructions */
4683 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4684 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4685                   void *vs2, CPURISCVState *env,           \
4686                   uint32_t desc)                           \
4687 {                                                          \
4688     uint32_t vm = vext_vm(desc);                           \
4689     uint32_t vl = env->vl;                                 \
4690     uint32_t esz = sizeof(TD);                             \
4691     uint32_t vlenb = simd_maxsz(desc);                     \
4692     uint32_t vta = vext_vta(desc);                         \
4693     uint32_t i;                                            \
4694     TD s1 =  *((TD *)vs1 + HD(0));                         \
4695                                                            \
4696     for (i = env->vstart; i < vl; i++) {                   \
4697         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4698         if (!vm && !vext_elem_mask(v0, i)) {               \
4699             continue;                                      \
4700         }                                                  \
4701         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4702     }                                                      \
4703     *((TD *)vd + HD(0)) = s1;                              \
4704     env->vstart = 0;                                       \
4705     /* set tail elements to 1s */                          \
4706     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4707 }
4708 
4709 /* Unordered sum */
4710 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4711 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4712 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4713 
4714 /* Ordered sum */
4715 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4716 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4717 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4718 
4719 /* Maximum value */
4720 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4721               float16_maximum_number)
4722 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4723               float32_maximum_number)
4724 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4725               float64_maximum_number)
4726 
4727 /* Minimum value */
4728 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4729               float16_minimum_number)
4730 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4731               float32_minimum_number)
4732 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4733               float64_minimum_number)
4734 
4735 /* Vector Widening Floating-Point Add Instructions */
4736 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4737 {
4738     return float32_add(a, float16_to_float32(b, true, s), s);
4739 }
4740 
4741 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4742 {
4743     return float64_add(a, float32_to_float64(b, s), s);
4744 }
4745 
4746 /* Vector Widening Floating-Point Reduction Instructions */
4747 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4748 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4749 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4750 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4751 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4752 
4753 /*
4754  * Vector Mask Operations
4755  */
4756 /* Vector Mask-Register Logical Instructions */
4757 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4758 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4759                   void *vs2, CPURISCVState *env,          \
4760                   uint32_t desc)                          \
4761 {                                                         \
4762     uint32_t vl = env->vl;                                \
4763     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4764     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4765     uint32_t i;                                           \
4766     int a, b;                                             \
4767                                                           \
4768     for (i = env->vstart; i < vl; i++) {                  \
4769         a = vext_elem_mask(vs1, i);                       \
4770         b = vext_elem_mask(vs2, i);                       \
4771         vext_set_elem_mask(vd, i, OP(b, a));              \
4772     }                                                     \
4773     env->vstart = 0;                                      \
4774     /*
4775      * mask destination register is always tail-agnostic
4776      * set tail elements to 1s
4777      */                                                   \
4778     if (vta_all_1s) {                                     \
4779         for (; i < total_elems; i++) {                    \
4780             vext_set_elem_mask(vd, i, 1);                 \
4781         }                                                 \
4782     }                                                     \
4783 }
4784 
4785 #define DO_NAND(N, M)  (!(N & M))
4786 #define DO_ANDNOT(N, M)  (N & !M)
4787 #define DO_NOR(N, M)  (!(N | M))
4788 #define DO_ORNOT(N, M)  (N | !M)
4789 #define DO_XNOR(N, M)  (!(N ^ M))
4790 
4791 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4792 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4793 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4794 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4795 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4796 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4797 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4798 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
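
/*
 * The generated helpers apply OP(vs2_bit, vs1_bit), so with the macros
 * above vmandn computes vs2 & ~vs1 and vmorn computes vs2 | ~vs1; the
 * other operations are symmetric in their operands.
 */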
4799 
4800 /* Vector count population in mask vcpop */
4801 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4802                              uint32_t desc)
4803 {
4804     target_ulong cnt = 0;
4805     uint32_t vm = vext_vm(desc);
4806     uint32_t vl = env->vl;
4807     int i;
4808 
4809     for (i = env->vstart; i < vl; i++) {
4810         if (vm || vext_elem_mask(v0, i)) {
4811             if (vext_elem_mask(vs2, i)) {
4812                 cnt++;
4813             }
4814         }
4815     }
4816     env->vstart = 0;
4817     return cnt;
4818 }
4819 
4820 /* vfirst find-first-set mask bit */
4821 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4822                               uint32_t desc)
4823 {
4824     uint32_t vm = vext_vm(desc);
4825     uint32_t vl = env->vl;
4826     int i;
4827 
4828     for (i = env->vstart; i < vl; i++) {
4829         if (vm || vext_elem_mask(v0, i)) {
4830             if (vext_elem_mask(vs2, i)) {
4831                 return i;
4832             }
4833         }
4834     }
4835     env->vstart = 0;
4836     return -1LL;
4837 }
4838 
4839 enum set_mask_type {
4840     ONLY_FIRST = 1,
4841     INCLUDE_FIRST,
4842     BEFORE_FIRST,
4843 };
4844 
4845 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4846                    uint32_t desc, enum set_mask_type type)
4847 {
4848     uint32_t vm = vext_vm(desc);
4849     uint32_t vl = env->vl;
4850     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4851     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4852     uint32_t vma = vext_vma(desc);
4853     int i;
4854     bool first_mask_bit = false;
4855 
4856     for (i = env->vstart; i < vl; i++) {
4857         if (!vm && !vext_elem_mask(v0, i)) {
4858             /* set masked-off elements to 1s */
4859             if (vma) {
4860                 vext_set_elem_mask(vd, i, 1);
4861             }
4862             continue;
4863         }
4864         /* write a zero to all following active elements */
4865         if (first_mask_bit) {
4866             vext_set_elem_mask(vd, i, 0);
4867             continue;
4868         }
4869         if (vext_elem_mask(vs2, i)) {
4870             first_mask_bit = true;
4871             if (type == BEFORE_FIRST) {
4872                 vext_set_elem_mask(vd, i, 0);
4873             } else {
4874                 vext_set_elem_mask(vd, i, 1);
4875             }
4876         } else {
4877             if (type == ONLY_FIRST) {
4878                 vext_set_elem_mask(vd, i, 0);
4879             } else {
4880                 vext_set_elem_mask(vd, i, 1);
4881             }
4882         }
4883     }
4884     env->vstart = 0;
4885     /*
4886      * mask destination register is always tail-agnostic
4887      * set tail elements to 1s
4888      */
4889     if (vta_all_1s) {
4890         for (; i < total_elems; i++) {
4891             vext_set_elem_mask(vd, i, 1);
4892         }
4893     }
4894 }
4895 
4896 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4897                      uint32_t desc)
4898 {
4899     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4900 }
4901 
4902 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4903                      uint32_t desc)
4904 {
4905     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4906 }
4907 
4908 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4909                      uint32_t desc)
4910 {
4911     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4912 }
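
/*
 * Worked example (illustrative, all elements active): with source mask
 * vs2 = 0,0,1,0,1 for elements 0..4,
 *     vmsbf.m -> 1,1,0,0,0    (set-before-first)
 *     vmsif.m -> 1,1,1,0,0    (set-including-first)
 *     vmsof.m -> 0,0,1,0,0    (set-only-first)
 */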
4913 
4914 /* Vector Iota Instruction */
4915 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4916 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4917                   uint32_t desc)                                          \
4918 {                                                                         \
4919     uint32_t vm = vext_vm(desc);                                          \
4920     uint32_t vl = env->vl;                                                \
4921     uint32_t esz = sizeof(ETYPE);                                         \
4922     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4923     uint32_t vta = vext_vta(desc);                                        \
4924     uint32_t vma = vext_vma(desc);                                        \
4925     uint32_t sum = 0;                                                     \
4926     int i;                                                                \
4927                                                                           \
4928     for (i = env->vstart; i < vl; i++) {                                  \
4929         if (!vm && !vext_elem_mask(v0, i)) {                              \
4930             /* set masked-off elements to 1s */                           \
4931             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4932             continue;                                                     \
4933         }                                                                 \
4934         *((ETYPE *)vd + H(i)) = sum;                                      \
4935         if (vext_elem_mask(vs2, i)) {                                     \
4936             sum++;                                                        \
4937         }                                                                 \
4938     }                                                                     \
4939     env->vstart = 0;                                                      \
4940     /* set tail elements to 1s */                                         \
4941     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4942 }
4943 
4944 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4945 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4946 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4947 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
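
/*
 * Worked example (illustrative, all elements active): viota.m writes the
 * number of set bits in vs2 strictly before each element, so for
 * vs2 = 1,0,1,1,0 the result is vd = 0,1,1,2,3.
 */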
4948 
4949 /* Vector Element Index Instruction */
4950 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4951 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4952 {                                                                         \
4953     uint32_t vm = vext_vm(desc);                                          \
4954     uint32_t vl = env->vl;                                                \
4955     uint32_t esz = sizeof(ETYPE);                                         \
4956     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4957     uint32_t vta = vext_vta(desc);                                        \
4958     uint32_t vma = vext_vma(desc);                                        \
4959     int i;                                                                \
4960                                                                           \
4961     for (i = env->vstart; i < vl; i++) {                                  \
4962         if (!vm && !vext_elem_mask(v0, i)) {                              \
4963             /* set masked-off elements to 1s */                           \
4964             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4965             continue;                                                     \
4966         }                                                                 \
4967         *((ETYPE *)vd + H(i)) = i;                                        \
4968     }                                                                     \
4969     env->vstart = 0;                                                      \
4970     /* set tail elements to 1s */                                         \
4971     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4972 }
4973 
4974 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4975 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4976 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4977 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4978 
4979 /*
4980  * Vector Permutation Instructions
4981  */
4982 
4983 /* Vector Slide Instructions */
4984 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4985 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4986                   CPURISCVState *env, uint32_t desc)                      \
4987 {                                                                         \
4988     uint32_t vm = vext_vm(desc);                                          \
4989     uint32_t vl = env->vl;                                                \
4990     uint32_t esz = sizeof(ETYPE);                                         \
4991     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4992     uint32_t vta = vext_vta(desc);                                        \
4993     uint32_t vma = vext_vma(desc);                                        \
4994     target_ulong offset = s1, i_min, i;                                   \
4995                                                                           \
4996     i_min = MAX(env->vstart, offset);                                     \
4997     for (i = i_min; i < vl; i++) {                                        \
4998         if (!vm && !vext_elem_mask(v0, i)) {                              \
4999             /* set masked-off elements to 1s */                           \
5000             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5001             continue;                                                     \
5002         }                                                                 \
5003         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5004     }                                                                     \
5005     /* set tail elements to 1s */                                         \
5006     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5007 }
5008 
5009 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5010 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5011 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5012 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5013 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5014 
5015 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5016 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5017                   CPURISCVState *env, uint32_t desc)                      \
5018 {                                                                         \
5019     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5020     uint32_t vm = vext_vm(desc);                                          \
5021     uint32_t vl = env->vl;                                                \
5022     uint32_t esz = sizeof(ETYPE);                                         \
5023     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5024     uint32_t vta = vext_vta(desc);                                        \
5025     uint32_t vma = vext_vma(desc);                                        \
5026     target_ulong i_max, i;                                                \
5027                                                                           \
5028     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5029     for (i = env->vstart; i < i_max; ++i) {                               \
5030         if (!vm && !vext_elem_mask(v0, i)) {                              \
5031             /* set masked-off elements to 1s */                           \
5032             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5033             continue;                                                     \
5034         }                                                                 \
5035         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5036     }                                                                     \
5037                                                                           \
5038     for (i = i_max; i < vl; ++i) {                                        \
5039         if (vm || vext_elem_mask(v0, i)) {                                \
5040             *((ETYPE *)vd + H(i)) = 0;                                    \
5041         }                                                                 \
5042     }                                                                     \
5043                                                                           \
5044     env->vstart = 0;                                                      \
5045     /* set tail elements to 1s */                                         \
5046     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5047 }
5048 
5049 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5050 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5051 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5052 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5053 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
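
/*
 * Illustrative example for i_max above (assuming vstart = 0): with
 * vlmax = 8, vl = 6 and s1 = 4, i_max = MIN(vlmax - s1, vl) = 4, so active
 * elements vd[0..3] are copied from vs2[4..7] and active elements vd[4..5],
 * which have no source element, are zeroed by the second loop.
 */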
5054 
5055 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5056 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5057                                  void *vs2, CPURISCVState *env,             \
5058                                  uint32_t desc)                             \
5059 {                                                                           \
5060     typedef uint##BITWIDTH##_t ETYPE;                                       \
5061     uint32_t vm = vext_vm(desc);                                            \
5062     uint32_t vl = env->vl;                                                  \
5063     uint32_t esz = sizeof(ETYPE);                                           \
5064     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5065     uint32_t vta = vext_vta(desc);                                          \
5066     uint32_t vma = vext_vma(desc);                                          \
5067     uint32_t i;                                                             \
5068                                                                             \
5069     for (i = env->vstart; i < vl; i++) {                                    \
5070         if (!vm && !vext_elem_mask(v0, i)) {                                \
5071             /* set masked-off elements to 1s */                             \
5072             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5073             continue;                                                       \
5074         }                                                                   \
5075         if (i == 0) {                                                       \
5076             *((ETYPE *)vd + H(i)) = s1;                                     \
5077         } else {                                                            \
5078             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5079         }                                                                   \
5080     }                                                                       \
5081     env->vstart = 0;                                                        \
5082     /* set tail elements to 1s */                                           \
5083     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5084 }
5085 
5086 GEN_VEXT_VSLIE1UP(8,  H1)
5087 GEN_VEXT_VSLIE1UP(16, H2)
5088 GEN_VEXT_VSLIE1UP(32, H4)
5089 GEN_VEXT_VSLIE1UP(64, H8)
5090 
5091 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5092 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5093                   CPURISCVState *env, uint32_t desc)              \
5094 {                                                                 \
5095     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5096 }
5097 
5098 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5099 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5100 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5101 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5102 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5103 
5104 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5105 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5106                                    void *vs2, CPURISCVState *env,             \
5107                                    uint32_t desc)                             \
5108 {                                                                             \
5109     typedef uint##BITWIDTH##_t ETYPE;                                         \
5110     uint32_t vm = vext_vm(desc);                                              \
5111     uint32_t vl = env->vl;                                                    \
5112     uint32_t esz = sizeof(ETYPE);                                             \
5113     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5114     uint32_t vta = vext_vta(desc);                                            \
5115     uint32_t vma = vext_vma(desc);                                            \
5116     uint32_t i;                                                               \
5117                                                                               \
5118     for (i = env->vstart; i < vl; i++) {                                      \
5119         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5120             /* set masked-off elements to 1s */                               \
5121             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5122             continue;                                                         \
5123         }                                                                     \
5124         if (i == vl - 1) {                                                    \
5125             *((ETYPE *)vd + H(i)) = s1;                                       \
5126         } else {                                                              \
5127             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5128         }                                                                     \
5129     }                                                                         \
5130     env->vstart = 0;                                                          \
5131     /* set tail elements to 1s */                                             \
5132     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5133 }
5134 
5135 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5136 GEN_VEXT_VSLIDE1DOWN(16, H2)
5137 GEN_VEXT_VSLIDE1DOWN(32, H4)
5138 GEN_VEXT_VSLIDE1DOWN(64, H8)
5139 
5140 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5141 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5142                   CPURISCVState *env, uint32_t desc)              \
5143 {                                                                 \
5144     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5145 }
5146 
5147 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5148 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5149 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5150 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5151 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5152 
5153 /* Vector Floating-Point Slide Instructions */
5154 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5155 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5156                   CPURISCVState *env, uint32_t desc)          \
5157 {                                                             \
5158     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5159 }
5160 
5161 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5162 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5163 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5164 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5165 
5166 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5167 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5168                   CPURISCVState *env, uint32_t desc)          \
5169 {                                                             \
5170     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5171 }
5172 
5173 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5174 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5175 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5176 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5177 
5178 /* Vector Register Gather Instructions */
5179 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5180 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5181                   CPURISCVState *env, uint32_t desc)                      \
5182 {                                                                         \
5183     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5184     uint32_t vm = vext_vm(desc);                                          \
5185     uint32_t vl = env->vl;                                                \
5186     uint32_t esz = sizeof(TS2);                                           \
5187     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5188     uint32_t vta = vext_vta(desc);                                        \
5189     uint32_t vma = vext_vma(desc);                                        \
5190     uint64_t index;                                                       \
5191     uint32_t i;                                                           \
5192                                                                           \
5193     for (i = env->vstart; i < vl; i++) {                                  \
5194         if (!vm && !vext_elem_mask(v0, i)) {                              \
5195             /* set masked-off elements to 1s */                           \
5196             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5197             continue;                                                     \
5198         }                                                                 \
5199         index = *((TS1 *)vs1 + HS1(i));                                   \
5200         if (index >= vlmax) {                                             \
5201             *((TS2 *)vd + HS2(i)) = 0;                                    \
5202         } else {                                                          \
5203             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5204         }                                                                 \
5205     }                                                                     \
5206     env->vstart = 0;                                                      \
5207     /* set tail elements to 1s */                                         \
5208     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5209 }
5210 
5211 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5212 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5213 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5214 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5215 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5216 
5217 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5218 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5219 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5220 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
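
/*
 * Illustrative sketch (not used by the helpers above): a scalar reference
 * model of the gather semantics with masking, vstart and tail handling
 * ignored; indices at or beyond vlmax read as zero, as in the macro.
 * vrgatherei16.vv differs from vrgather.vv only in that the index elements
 * are always 16 bits wide, which is why TS1 stays uint16_t while TS2
 * follows SEW.  The names below are hypothetical.
 */
static inline void vrgather_ref_u32(uint32_t *vd, const uint32_t *idx,
                                    const uint32_t *vs2,
                                    uint32_t vl, uint32_t vlmax)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = (idx[i] >= vlmax) ? 0 : vs2[idx[i]];
    }
}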
5221 
5222 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5223 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5224                   CPURISCVState *env, uint32_t desc)                      \
5225 {                                                                         \
5226     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5227     uint32_t vm = vext_vm(desc);                                          \
5228     uint32_t vl = env->vl;                                                \
5229     uint32_t esz = sizeof(ETYPE);                                         \
5230     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5231     uint32_t vta = vext_vta(desc);                                        \
5232     uint32_t vma = vext_vma(desc);                                        \
5233     uint64_t index = s1;                                                  \
5234     uint32_t i;                                                           \
5235                                                                           \
5236     for (i = env->vstart; i < vl; i++) {                                  \
5237         if (!vm && !vext_elem_mask(v0, i)) {                              \
5238             /* set masked-off elements to 1s */                           \
5239             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5240             continue;                                                     \
5241         }                                                                 \
5242         if (index >= vlmax) {                                             \
5243             *((ETYPE *)vd + H(i)) = 0;                                    \
5244         } else {                                                          \
5245             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5246         }                                                                 \
5247     }                                                                     \
5248     env->vstart = 0;                                                      \
5249     /* set tail elements to 1s */                                         \
5250     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5251 }
5252 
5253 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5254 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5255 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5256 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5257 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
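
/*
 * Illustrative example: vrgather.vx splats a single source element.  With
 * x[rs1] = 2 and vlmax = 8, every active element of vd receives vs2[2];
 * with x[rs1] = 8 (>= vlmax) every active element is written with zero.
 */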
5258 
5259 /* Vector Compress Instruction */
5260 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5261 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5262                   CPURISCVState *env, uint32_t desc)                      \
5263 {                                                                         \
5264     uint32_t vl = env->vl;                                                \
5265     uint32_t esz = sizeof(ETYPE);                                         \
5266     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5267     uint32_t vta = vext_vta(desc);                                        \
5268     uint32_t num = 0, i;                                                  \
5269                                                                           \
5270     for (i = env->vstart; i < vl; i++) {                                  \
5271         if (!vext_elem_mask(vs1, i)) {                                    \
5272             continue;                                                     \
5273         }                                                                 \
5274         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5275         num++;                                                            \
5276     }                                                                     \
5277     env->vstart = 0;                                                      \
5278     /* set tail elements to 1s */                                         \
5279     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5280 }
5281 
5282 /* Compress into vd the elements of vs2 whose mask bit in vs1 is set */
5283 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5284 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5285 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5286 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
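
/*
 * Illustrative sketch (not used by the helpers above): a scalar reference
 * model of vcompress.vm, assuming a hypothetical bit-per-element mask
 * array and ignoring tail handling.  Enabled elements of vs2 are packed
 * to the front of vd; the return value is how many were written.
 */
static inline uint32_t vcompress_ref_u32(uint32_t *vd, const uint8_t *mask,
                                         const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;

    for (uint32_t i = 0; i < vl; i++) {
        if (mask[i / 8] & (1u << (i % 8))) {   /* element i enabled in vs1 */
            vd[num++] = vs2[i];                /* pack enabled element */
        }
    }
    return num;
}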
5287 
5288 /* Vector Whole Register Move */
5289 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5290 {
5291     /* EEW = SEW */
5292     uint32_t maxsz = simd_maxsz(desc);
5293     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5294     uint32_t startb = env->vstart * sewb;
5295     uint32_t i = startb;
5296 
    /*
     * On a big-endian host each 64-bit chunk is stored byte-reversed, so
     * copy any partial leading chunk through the H1() fixup before the
     * chunk-aligned remainder is moved with a plain memcpy.
     */
    if (HOST_BIG_ENDIAN && i % 8 != 0) {
        uint32_t j = ROUND_UP(i, 8);
        memcpy((uint8_t *)vd + H1(j - 1),
               (uint8_t *)vs2 + H1(j - 1),
               j - i);
        i = j;
    }

    memcpy((uint8_t *)vd + i,
           (uint8_t *)vs2 + i,
           maxsz - i);
5300 
5301     env->vstart = 0;
5302 }
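
/*
 * Note: the whole-register move has no mask operand and no tail policy; it
 * is a raw byte copy of the source register group.  vstart is honoured at
 * SEW granularity, e.g. with SEW = 32 and vstart = 3 copying starts at
 * byte offset 3 * 4 = 12.
 */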
5303 
5304 /* Vector Integer Extension */
5305 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5306 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5307                   CPURISCVState *env, uint32_t desc)             \
5308 {                                                                \
5309     uint32_t vl = env->vl;                                       \
5310     uint32_t vm = vext_vm(desc);                                 \
5311     uint32_t esz = sizeof(ETYPE);                                \
5312     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5313     uint32_t vta = vext_vta(desc);                               \
5314     uint32_t vma = vext_vma(desc);                               \
5315     uint32_t i;                                                  \
5316                                                                  \
5317     for (i = env->vstart; i < vl; i++) {                         \
5318         if (!vm && !vext_elem_mask(v0, i)) {                     \
5319             /* set masked-off elements to 1s */                  \
5320             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5321             continue;                                            \
5322         }                                                        \
5323         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5324     }                                                            \
5325     env->vstart = 0;                                             \
5326     /* set tail elements to 1s */                                \
5327     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5328 }
5329 
5330 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5331 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5332 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5333 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5334 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5335 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5336 
5337 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5338 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5339 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5340 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5341 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5342 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
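
/*
 * Note: in GEN_VEXT_INT_EXT the extension itself is performed by the
 * DTYPE -> ETYPE assignment; C integer conversion zero-extends an unsigned
 * source and sign-extends a signed one.  For example, for vf2 at SEW = 16
 * a source byte 0x80 becomes 0x0080 under vzext.vf2 (uint8_t -> uint16_t)
 * and 0xff80 (-128) under vsext.vf2 (int8_t -> int16_t).
 */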
5343