xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 3b57254d)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
54         /* only set vill bit. */
55         env->vill = 1;
56         env->vtype = 0;
57         env->vl = 0;
58         env->vstart = 0;
59         return 0;
60     }
61 
62     vlmax = vext_get_vlmax(cpu, s2);
63     if (s1 <= vlmax) {
64         vl = s1;
65     } else {
66         vl = vlmax;
67     }
68     env->vl = vl;
69     env->vtype = s2;
70     env->vstart = 0;
71     env->vill = 0;
72     return vl;
73 }
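
/*
 * Illustrative example (hypothetical values, not taken from the code above):
 * with VLEN = 128 and a vtype requesting SEW = 32 (vsew = 010) and LMUL = 2
 * (vlmul = 001), VLMAX = VLEN / SEW * LMUL = 128 / 32 * 2 = 8.  A requested
 * AVL (s1) of 10 is clamped to vl = 8, while an AVL of 5 yields vl = 5.  Any
 * illegal request (e.g. SEW > ELEN or reserved vtype bits set) only sets
 * vill and clears vtype, vl and vstart as above.
 */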
74 
75 /*
76  * Note that vector data is stored in host-endian 64-bit chunks,
77  * so addressing units smaller than that need a host-endian fixup.
78  */
79 #if HOST_BIG_ENDIAN
80 #define H1(x)   ((x) ^ 7)
81 #define H1_2(x) ((x) ^ 6)
82 #define H1_4(x) ((x) ^ 4)
83 #define H2(x)   ((x) ^ 3)
84 #define H4(x)   ((x) ^ 1)
85 #define H8(x)   ((x))
86 #else
87 #define H1(x)   (x)
88 #define H1_2(x) (x)
89 #define H1_4(x) (x)
90 #define H2(x)   (x)
91 #define H4(x)   (x)
92 #define H8(x)   (x)
93 #endif
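
/*
 * Worked example of the fixup (hypothetical indices): on a big-endian host,
 * byte element 0 lives in host byte 7 of the first 64-bit chunk, so
 * H1(0) = 0 ^ 7 = 7; for 16-bit elements, H2(0) = 0 ^ 3 = 3, i.e. the
 * halfword occupying host bytes 6..7.  On little-endian hosts the element
 * index is already the host index, so the macros are the identity.
 */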
94 
95 static inline uint32_t vext_nf(uint32_t desc)
96 {
97     return FIELD_EX32(simd_data(desc), VDATA, NF);
98 }
99 
100 static inline uint32_t vext_vm(uint32_t desc)
101 {
102     return FIELD_EX32(simd_data(desc), VDATA, VM);
103 }
104 
105 /*
106  * Encode LMUL to lmul as follows:
107  *     LMUL    vlmul    lmul
108  *      1       000       0
109  *      2       001       1
110  *      4       010       2
111  *      8       011       3
112  *      -       100       -
113  *     1/8      101      -3
114  *     1/4      110      -2
115  *     1/2      111      -1
116  */
117 static inline int32_t vext_lmul(uint32_t desc)
118 {
119     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
120 }
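
/*
 * Example decodings (values chosen for illustration): a VDATA.LMUL field of
 * 0b011 stays 3 after sign extension, i.e. LMUL = 2^3 = 8, while 0b111
 * sign-extends to -1, i.e. LMUL = 2^-1 = 1/2.
 */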
121 
122 static inline uint32_t vext_vta(uint32_t desc)
123 {
124     return FIELD_EX32(simd_data(desc), VDATA, VTA);
125 }
126 
127 static inline uint32_t vext_vma(uint32_t desc)
128 {
129     return FIELD_EX32(simd_data(desc), VDATA, VMA);
130 }
131 
132 static inline uint32_t vext_vta_all_1s(uint32_t desc)
133 {
134     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
135 }
136 
137 /*
138  * Get the maximum number of elements that can be operated on.
139  *
140  * log2_esz: log2 of element size in bytes.
141  */
142 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
143 {
144     /*
145      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
146      * so vlen in bytes (vlenb) is encoded as maxsz.
147      */
148     uint32_t vlenb = simd_maxsz(desc);
149 
150     /* Return VLMAX */
151     int scale = vext_lmul(desc) - log2_esz;
152     return scale < 0 ? vlenb >> -scale : vlenb << scale;
153 }
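
/*
 * Worked example (hypothetical configuration): with VLEN = 128 the
 * descriptor carries vlenb = 16.  For LMUL = 2 (lmul = 1) and a 16-bit EEW
 * (log2_esz = 1), scale = 1 - 1 = 0 and VLMAX = 16 elements; for LMUL = 1/4
 * (lmul = -2) and an 8-bit EEW (log2_esz = 0), scale = -2 and
 * VLMAX = 16 >> 2 = 4 elements.
 */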
154 
155 /*
156  * Get the total number of elements, including prestart, body and tail elements.
157  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
158  * are held in the same vector register.
159  */
160 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
161                                             uint32_t esz)
162 {
163     uint32_t vlenb = simd_maxsz(desc);
164     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
165     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
166                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
167     return (vlenb << emul) / esz;
168 }
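
/*
 * Worked example (hypothetical configuration): with vlenb = 16, SEW = 16,
 * esz = 2 and vext_lmul() = -1 (LMUL = 1/2), the raw exponent is
 * ctzl(2) - ctzl(2) + (-1) = -1, which is clamped to emul = 0, so
 * total_elems = (16 << 0) / 2 = 8: one whole register, including the tail
 * elements past VLMAX (which would only be 4 here).
 */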
169 
170 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
171 {
172     return (addr & env->cur_pmmask) | env->cur_pmbase;
173 }
174 
175 /*
176  * This function checks watchpoints before the real load operation.
177  *
178  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
179  * In user mode, there is currently no watchpoint support.
180  *
181  * It will trigger an exception if there is no mapping in the TLB
182  * and the page table walk can't fill the TLB entry. Then the guest
183  * software can return here after processing the exception, or never return.
184  */
185 static void probe_pages(CPURISCVState *env, target_ulong addr,
186                         target_ulong len, uintptr_t ra,
187                         MMUAccessType access_type)
188 {
189     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
190     target_ulong curlen = MIN(pagelen, len);
191 
192     probe_access(env, adjust_addr(env, addr), curlen, access_type,
193                  cpu_mmu_index(env, false), ra);
194     if (len > curlen) {
195         addr += curlen;
196         curlen = len - curlen;
197         probe_access(env, adjust_addr(env, addr), curlen, access_type,
198                      cpu_mmu_index(env, false), ra);
199     }
200 }
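
/*
 * Example of the page split (hypothetical addresses, assuming 4 KiB target
 * pages): for addr = 0x1ff8 and len = 16, pagelen = 8, so the first probe
 * covers the 8 bytes up to the page boundary and the second probe covers
 * the remaining 8 bytes starting at 0x2000.  A fault on either page is
 * therefore reported by the probe itself, before any data is transferred.
 */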
201 
202 /* set agnostic elements to 1s */
203 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
204                               uint32_t tot)
205 {
206     if (is_agnostic == 0) {
207         /* policy undisturbed */
208         return;
209     }
210     if (tot - cnt == 0) {
211         return;
212     }
213     memset(base + cnt, -1, tot - cnt);
214 }
215 
216 static inline void vext_set_elem_mask(void *v0, int index,
217                                       uint8_t value)
218 {
219     int idx = index / 64;
220     int pos = index % 64;
221     uint64_t old = ((uint64_t *)v0)[idx];
222     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
223 }
224 
225 /*
226  * Earlier designs (pre-0.9) had a varying number of bits
227  * per mask value (MLEN). In the 0.9 design, MLEN=1.
228  * (Section 4.5)
229  */
230 static inline int vext_elem_mask(void *v0, int index)
231 {
232     int idx = index / 64;
233     int pos = index  % 64;
234     return (((uint64_t *)v0)[idx] >> pos) & 1;
235 }
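
/*
 * Example (arbitrary index): for index = 70, idx = 70 / 64 = 1 and
 * pos = 70 % 64 = 6, so the mask bit is bit 6 of the second 64-bit word of
 * v0; vext_set_elem_mask() deposits a new value into the same position.
 */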
236 
237 /* element operations for load and store */
238 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
239                                uint32_t idx, void *vd, uintptr_t retaddr);
240 
241 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
242 static void NAME(CPURISCVState *env, abi_ptr addr,         \
243                  uint32_t idx, void *vd, uintptr_t retaddr)\
244 {                                                          \
245     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
246     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
247 }                                                          \
248 
249 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
250 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
251 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
252 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
253 
254 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
255 static void NAME(CPURISCVState *env, abi_ptr addr,         \
256                  uint32_t idx, void *vd, uintptr_t retaddr)\
257 {                                                          \
258     ETYPE data = *((ETYPE *)vd + H(idx));                  \
259     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
260 }
261 
262 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
263 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
264 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
265 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
266 
267 static void vext_set_tail_elems_1s(CPURISCVState *env, target_ulong vl,
268                                    void *vd, uint32_t desc, uint32_t nf,
269                                    uint32_t esz, uint32_t max_elems)
270 {
271     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
272     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
273     uint32_t vta = vext_vta(desc);
274     uint32_t registers_used;
275     int k;
276 
277     for (k = 0; k < nf; ++k) {
278         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
279                           (k * max_elems + max_elems) * esz);
280     }
281 
282     if (nf * max_elems % total_elems != 0) {
283         registers_used = ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
284         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
285                           registers_used * vlenb);
286     }
287 }
288 
289 /*
290  * stride: access vector elements from strided memory
291  */
292 static void
293 vext_ldst_stride(void *vd, void *v0, target_ulong base,
294                  target_ulong stride, CPURISCVState *env,
295                  uint32_t desc, uint32_t vm,
296                  vext_ldst_elem_fn *ldst_elem,
297                  uint32_t log2_esz, uintptr_t ra)
298 {
299     uint32_t i, k;
300     uint32_t nf = vext_nf(desc);
301     uint32_t max_elems = vext_max_elems(desc, log2_esz);
302     uint32_t esz = 1 << log2_esz;
303     uint32_t vma = vext_vma(desc);
304 
305     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
306         k = 0;
307         while (k < nf) {
308             if (!vm && !vext_elem_mask(v0, i)) {
309                 /* set masked-off elements to 1s */
310                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
311                                   (i + k * max_elems + 1) * esz);
312                 k++;
313                 continue;
314             }
315             target_ulong addr = base + stride * i + (k << log2_esz);
316             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
317             k++;
318         }
319     }
320     env->vstart = 0;
321 
322     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
323 }
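
/*
 * Addressing example (hypothetical operands): for a segment load with
 * nf = 2, esz = 4 and stride = 32, field k = 1 of element i = 3 is accessed
 * at base + 32 * 3 + (1 << 2) = base + 100 and placed at destination
 * element index i + k * max_elems.
 */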
324 
325 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
326 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
327                   target_ulong stride, CPURISCVState *env,              \
328                   uint32_t desc)                                        \
329 {                                                                       \
330     uint32_t vm = vext_vm(desc);                                        \
331     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
332                      ctzl(sizeof(ETYPE)), GETPC());                     \
333 }
334 
335 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
336 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
337 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
338 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
339 
340 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
341 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
342                   target_ulong stride, CPURISCVState *env,              \
343                   uint32_t desc)                                        \
344 {                                                                       \
345     uint32_t vm = vext_vm(desc);                                        \
346     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
347                      ctzl(sizeof(ETYPE)), GETPC());                     \
348 }
349 
350 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
351 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
352 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
353 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
354 
355 /*
356  * unit-stride: access elements stored contiguously in memory
357  */
358 
359 /* unmasked unit-stride load and store operation */
360 static void
361 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
362              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
363              uintptr_t ra)
364 {
365     uint32_t i, k;
366     uint32_t nf = vext_nf(desc);
367     uint32_t max_elems = vext_max_elems(desc, log2_esz);
368     uint32_t esz = 1 << log2_esz;
369 
370     /* load or store elements from/to guest memory */
371     for (i = env->vstart; i < evl; i++, env->vstart++) {
372         k = 0;
373         while (k < nf) {
374             target_ulong addr = base + ((i * nf + k) << log2_esz);
375             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
376             k++;
377         }
378     }
379     env->vstart = 0;
380 
381     vext_set_tail_elems_1s(env, evl, vd, desc, nf, esz, max_elems);
382 }
383 
384 /*
385  * A masked unit-stride load or store operation is a special case of the
386  * strided operation, with stride = NF * sizeof(MTYPE).
387  */
388 
389 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
390 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
391                          CPURISCVState *env, uint32_t desc)             \
392 {                                                                       \
393     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
394     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
395                      ctzl(sizeof(ETYPE)), GETPC());                     \
396 }                                                                       \
397                                                                         \
398 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
399                   CPURISCVState *env, uint32_t desc)                    \
400 {                                                                       \
401     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
402                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
403 }
404 
405 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
406 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
407 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
408 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
409 
410 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
411 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
412                          CPURISCVState *env, uint32_t desc)              \
413 {                                                                        \
414     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
415     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
416                      ctzl(sizeof(ETYPE)), GETPC());                      \
417 }                                                                        \
418                                                                          \
419 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
420                   CPURISCVState *env, uint32_t desc)                     \
421 {                                                                        \
422     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
423                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
424 }
425 
426 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
427 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
428 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
429 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
430 
431 /*
432  * unit stride mask load and store, EEW = 1
433  */
434 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
435                     CPURISCVState *env, uint32_t desc)
436 {
437     /* evl = ceil(vl/8) */
438     uint8_t evl = (env->vl + 7) >> 3;
439     vext_ldst_us(vd, base, env, desc, lde_b,
440                  0, evl, GETPC());
441 }
442 
443 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
444                     CPURISCVState *env, uint32_t desc)
445 {
446     /* evl = ceil(vl/8) */
447     uint8_t evl = (env->vl + 7) >> 3;
448     vext_ldst_us(vd, base, env, desc, ste_b,
449                  0, evl, GETPC());
450 }
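
/*
 * Example of the EEW = 1 length conversion (arbitrary vl): for vl = 17,
 * evl = (17 + 7) >> 3 = 3, so only the 3 bytes holding mask bits 0..16
 * (plus the tail bits of byte 2) are transferred.
 */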
451 
452 /*
453  * index: access vector elements from indexed memory
454  */
455 typedef target_ulong vext_get_index_addr(target_ulong base,
456         uint32_t idx, void *vs2);
457 
458 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
459 static target_ulong NAME(target_ulong base,            \
460                          uint32_t idx, void *vs2)      \
461 {                                                      \
462     return (base + *((ETYPE *)vs2 + H(idx)));          \
463 }
464 
465 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
466 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
467 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
468 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
469 
470 static inline void
471 vext_ldst_index(void *vd, void *v0, target_ulong base,
472                 void *vs2, CPURISCVState *env, uint32_t desc,
473                 vext_get_index_addr get_index_addr,
474                 vext_ldst_elem_fn *ldst_elem,
475                 uint32_t log2_esz, uintptr_t ra)
476 {
477     uint32_t i, k;
478     uint32_t nf = vext_nf(desc);
479     uint32_t vm = vext_vm(desc);
480     uint32_t max_elems = vext_max_elems(desc, log2_esz);
481     uint32_t esz = 1 << log2_esz;
482     uint32_t vma = vext_vma(desc);
483 
484     /* load or store elements from/to guest memory */
485     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
486         k = 0;
487         while (k < nf) {
488             if (!vm && !vext_elem_mask(v0, i)) {
489                 /* set masked-off elements to 1s */
490                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
491                                   (i + k * max_elems + 1) * esz);
492                 k++;
493                 continue;
494             }
495             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
496             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
497             k++;
498         }
499     }
500     env->vstart = 0;
501 
502     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
503 }
504 
505 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
506 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
507                   void *vs2, CPURISCVState *env, uint32_t desc)            \
508 {                                                                          \
509     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
510                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
511 }
512 
513 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
514 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
515 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
516 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
517 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
518 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
519 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
520 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
521 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
522 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
523 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
524 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
525 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
526 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
527 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
528 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
529 
530 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
531 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
532                   void *vs2, CPURISCVState *env, uint32_t desc)  \
533 {                                                                \
534     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
535                     STORE_FN, ctzl(sizeof(ETYPE)),               \
536                     GETPC());                                    \
537 }
538 
539 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
540 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
541 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
542 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
543 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
544 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
545 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
546 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
547 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
548 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
549 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
550 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
551 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
552 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
553 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
554 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
555 
556 /*
557  * unit-stride fault-only-first load instructions
558  */
559 static inline void
560 vext_ldff(void *vd, void *v0, target_ulong base,
561           CPURISCVState *env, uint32_t desc,
562           vext_ldst_elem_fn *ldst_elem,
563           uint32_t log2_esz, uintptr_t ra)
564 {
565     void *host;
566     uint32_t i, k, vl = 0;
567     uint32_t nf = vext_nf(desc);
568     uint32_t vm = vext_vm(desc);
569     uint32_t max_elems = vext_max_elems(desc, log2_esz);
570     uint32_t esz = 1 << log2_esz;
571     uint32_t vma = vext_vma(desc);
572     target_ulong addr, offset, remain;
573 
574     /* probe every access */
575     for (i = env->vstart; i < env->vl; i++) {
576         if (!vm && !vext_elem_mask(v0, i)) {
577             continue;
578         }
579         addr = adjust_addr(env, base + i * (nf << log2_esz));
580         if (i == 0) {
581             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
582         } else {
583             /* if it triggers an exception, no need to check watchpoint */
584             remain = nf << log2_esz;
585             while (remain > 0) {
586                 offset = -(addr | TARGET_PAGE_MASK);
587                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
588                                          cpu_mmu_index(env, false));
589                 if (host) {
590 #ifdef CONFIG_USER_ONLY
591                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
592                         vl = i;
593                         goto ProbeSuccess;
594                     }
595 #else
596                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
597 #endif
598                 } else {
599                     vl = i;
600                     goto ProbeSuccess;
601                 }
602                 if (remain <=  offset) {
603                     break;
604                 }
605                 remain -= offset;
606                 addr = adjust_addr(env, addr + offset);
607             }
608         }
609     }
610 ProbeSuccess:
611     /* load bytes from guest memory */
612     if (vl != 0) {
613         env->vl = vl;
614     }
615     for (i = env->vstart; i < env->vl; i++) {
616         k = 0;
617         while (k < nf) {
618             if (!vm && !vext_elem_mask(v0, i)) {
619                 /* set masked-off elements to 1s */
620                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
621                                   (i + k * max_elems + 1) * esz);
622                 k++;
623                 continue;
624             }
625             target_ulong addr = base + ((i * nf + k) << log2_esz);
626             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
627             k++;
628         }
629     }
630     env->vstart = 0;
631 
632     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
633 }
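
/*
 * Fault-only-first example (hypothetical memory layout): if element 0 is
 * readable but the page holding element 5 is unmapped, the probe loop above
 * stops at i = 5 and shrinks env->vl to 5; elements 0..4 are then loaded
 * normally and no exception is raised for the faulting element.
 */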
634 
635 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
636 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
637                   CPURISCVState *env, uint32_t desc)      \
638 {                                                         \
639     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
640               ctzl(sizeof(ETYPE)), GETPC());              \
641 }
642 
643 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
644 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
645 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
646 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
647 
648 #define DO_SWAP(N, M) (M)
649 #define DO_AND(N, M)  (N & M)
650 #define DO_XOR(N, M)  (N ^ M)
651 #define DO_OR(N, M)   (N | M)
652 #define DO_ADD(N, M)  (N + M)
653 
654 /* Signed min/max */
655 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
656 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
657 
658 /* Unsigned min/max */
659 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
660 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
661 
662 /*
663  * load and store whole register instructions
664  */
665 static void
666 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
667                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
668 {
669     uint32_t i, k, off, pos;
670     uint32_t nf = vext_nf(desc);
671     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
672     uint32_t max_elems = vlenb >> log2_esz;
673 
674     k = env->vstart / max_elems;
675     off = env->vstart % max_elems;
676 
677     if (off) {
678         /* load/store rest of elements of the segment pointed to by vstart */
679         for (pos = off; pos < max_elems; pos++, env->vstart++) {
680             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
681             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
682         }
683         k++;
684     }
685 
686     /* load/store elements for rest of segments */
687     for (; k < nf; k++) {
688         for (i = 0; i < max_elems; i++, env->vstart++) {
689             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
690             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
691         }
692     }
693 
694     env->vstart = 0;
695 }
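
/*
 * Restart example (hypothetical state): with max_elems = 16 and
 * env->vstart = 20 after an interrupted vl2re* access, k = 1 and off = 4,
 * so the first loop finishes elements 4..15 of register 1; k then equals
 * nf and the whole-register transfer is complete.
 */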
696 
697 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
698 void HELPER(NAME)(void *vd, target_ulong base,       \
699                   CPURISCVState *env, uint32_t desc) \
700 {                                                    \
701     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
702                     ctzl(sizeof(ETYPE)), GETPC());   \
703 }
704 
705 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
706 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
707 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
708 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
709 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
710 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
711 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
712 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
713 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
714 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
715 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
716 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
717 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
718 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
719 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
720 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
721 
722 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
723 void HELPER(NAME)(void *vd, target_ulong base,       \
724                   CPURISCVState *env, uint32_t desc) \
725 {                                                    \
726     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
727                     ctzl(sizeof(ETYPE)), GETPC());   \
728 }
729 
730 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
731 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
732 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
733 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
734 
735 /*
736  * Vector Integer Arithmetic Instructions
737  */
738 
739 /* expand macro args before macro */
740 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
741 
742 /* (TD, T1, T2, TX1, TX2) */
743 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
744 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
745 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
746 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
747 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
748 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
749 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
750 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
751 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
752 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
753 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
754 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
755 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
756 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
757 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
758 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
759 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
760 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
761 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
762 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
763 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
764 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
765 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
766 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
767 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
768 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
769 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
770 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
771 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
772 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
773 
774 /* operation of two vector elements */
775 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
776 
777 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
778 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
779 {                                                               \
780     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
781     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
782     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
783 }
784 #define DO_SUB(N, M) (N - M)
785 #define DO_RSUB(N, M) (M - N)
786 
787 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
788 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
789 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
790 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
791 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
792 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
793 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
794 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
795 
796 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
797                        CPURISCVState *env, uint32_t desc,
798                        opivv2_fn *fn, uint32_t esz)
799 {
800     uint32_t vm = vext_vm(desc);
801     uint32_t vl = env->vl;
802     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
803     uint32_t vta = vext_vta(desc);
804     uint32_t vma = vext_vma(desc);
805     uint32_t i;
806 
807     for (i = env->vstart; i < vl; i++) {
808         if (!vm && !vext_elem_mask(v0, i)) {
809             /* set masked-off elements to 1s */
810             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
811             continue;
812         }
813         fn(vd, vs1, vs2, i);
814     }
815     env->vstart = 0;
816     /* set tail elements to 1s */
817     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
818 }
819 
820 /* generate the helpers for OPIVV */
821 #define GEN_VEXT_VV(NAME, ESZ)                            \
822 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
823                   void *vs2, CPURISCVState *env,          \
824                   uint32_t desc)                          \
825 {                                                         \
826     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
827                do_##NAME, ESZ);                           \
828 }
829 
830 GEN_VEXT_VV(vadd_vv_b, 1)
831 GEN_VEXT_VV(vadd_vv_h, 2)
832 GEN_VEXT_VV(vadd_vv_w, 4)
833 GEN_VEXT_VV(vadd_vv_d, 8)
834 GEN_VEXT_VV(vsub_vv_b, 1)
835 GEN_VEXT_VV(vsub_vv_h, 2)
836 GEN_VEXT_VV(vsub_vv_w, 4)
837 GEN_VEXT_VV(vsub_vv_d, 8)
838 
839 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
840 
841 /*
842  * (T1)s1 gives the real operand type.
843  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
844  */
845 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
846 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
847 {                                                                   \
848     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
849     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
850 }
851 
852 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
853 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
854 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
855 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
856 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
857 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
858 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
859 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
860 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
861 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
862 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
863 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
864 
865 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
866                        CPURISCVState *env, uint32_t desc,
867                        opivx2_fn fn, uint32_t esz)
868 {
869     uint32_t vm = vext_vm(desc);
870     uint32_t vl = env->vl;
871     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
872     uint32_t vta = vext_vta(desc);
873     uint32_t vma = vext_vma(desc);
874     uint32_t i;
875 
876     for (i = env->vstart; i < vl; i++) {
877         if (!vm && !vext_elem_mask(v0, i)) {
878             /* set masked-off elements to 1s */
879             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
880             continue;
881         }
882         fn(vd, s1, vs2, i);
883     }
884     env->vstart = 0;
885     /* set tail elements to 1s */
886     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
887 }
888 
889 /* generate the helpers for OPIVX */
890 #define GEN_VEXT_VX(NAME, ESZ)                            \
891 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
892                   void *vs2, CPURISCVState *env,          \
893                   uint32_t desc)                          \
894 {                                                         \
895     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
896                do_##NAME, ESZ);                           \
897 }
898 
899 GEN_VEXT_VX(vadd_vx_b, 1)
900 GEN_VEXT_VX(vadd_vx_h, 2)
901 GEN_VEXT_VX(vadd_vx_w, 4)
902 GEN_VEXT_VX(vadd_vx_d, 8)
903 GEN_VEXT_VX(vsub_vx_b, 1)
904 GEN_VEXT_VX(vsub_vx_h, 2)
905 GEN_VEXT_VX(vsub_vx_w, 4)
906 GEN_VEXT_VX(vsub_vx_d, 8)
907 GEN_VEXT_VX(vrsub_vx_b, 1)
908 GEN_VEXT_VX(vrsub_vx_h, 2)
909 GEN_VEXT_VX(vrsub_vx_w, 4)
910 GEN_VEXT_VX(vrsub_vx_d, 8)
911 
912 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
913 {
914     intptr_t oprsz = simd_oprsz(desc);
915     intptr_t i;
916 
917     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
918         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
919     }
920 }
921 
922 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
923 {
924     intptr_t oprsz = simd_oprsz(desc);
925     intptr_t i;
926 
927     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
928         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
929     }
930 }
931 
932 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
933 {
934     intptr_t oprsz = simd_oprsz(desc);
935     intptr_t i;
936 
937     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
938         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
939     }
940 }
941 
942 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
943 {
944     intptr_t oprsz = simd_oprsz(desc);
945     intptr_t i;
946 
947     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
948         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
949     }
950 }
951 
952 /* Vector Widening Integer Add/Subtract */
953 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
954 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
955 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
956 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
957 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
958 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
959 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
960 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
961 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
962 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
963 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
964 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
965 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
966 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
967 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
968 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
969 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
970 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
971 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
972 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
973 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
974 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
975 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
976 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
977 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
978 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
979 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
980 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
981 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
982 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
983 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
984 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
985 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
986 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
987 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
988 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
989 GEN_VEXT_VV(vwaddu_vv_b, 2)
990 GEN_VEXT_VV(vwaddu_vv_h, 4)
991 GEN_VEXT_VV(vwaddu_vv_w, 8)
992 GEN_VEXT_VV(vwsubu_vv_b, 2)
993 GEN_VEXT_VV(vwsubu_vv_h, 4)
994 GEN_VEXT_VV(vwsubu_vv_w, 8)
995 GEN_VEXT_VV(vwadd_vv_b, 2)
996 GEN_VEXT_VV(vwadd_vv_h, 4)
997 GEN_VEXT_VV(vwadd_vv_w, 8)
998 GEN_VEXT_VV(vwsub_vv_b, 2)
999 GEN_VEXT_VV(vwsub_vv_h, 4)
1000 GEN_VEXT_VV(vwsub_vv_w, 8)
1001 GEN_VEXT_VV(vwaddu_wv_b, 2)
1002 GEN_VEXT_VV(vwaddu_wv_h, 4)
1003 GEN_VEXT_VV(vwaddu_wv_w, 8)
1004 GEN_VEXT_VV(vwsubu_wv_b, 2)
1005 GEN_VEXT_VV(vwsubu_wv_h, 4)
1006 GEN_VEXT_VV(vwsubu_wv_w, 8)
1007 GEN_VEXT_VV(vwadd_wv_b, 2)
1008 GEN_VEXT_VV(vwadd_wv_h, 4)
1009 GEN_VEXT_VV(vwadd_wv_w, 8)
1010 GEN_VEXT_VV(vwsub_wv_b, 2)
1011 GEN_VEXT_VV(vwsub_wv_h, 4)
1012 GEN_VEXT_VV(vwsub_wv_w, 8)
1013 
1014 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1015 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1016 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1017 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1018 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1019 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1020 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1021 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1022 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1023 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1024 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1025 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1026 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1027 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1028 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1029 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1030 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1031 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1032 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1033 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1034 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1035 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1036 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1037 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1038 GEN_VEXT_VX(vwaddu_vx_b, 2)
1039 GEN_VEXT_VX(vwaddu_vx_h, 4)
1040 GEN_VEXT_VX(vwaddu_vx_w, 8)
1041 GEN_VEXT_VX(vwsubu_vx_b, 2)
1042 GEN_VEXT_VX(vwsubu_vx_h, 4)
1043 GEN_VEXT_VX(vwsubu_vx_w, 8)
1044 GEN_VEXT_VX(vwadd_vx_b, 2)
1045 GEN_VEXT_VX(vwadd_vx_h, 4)
1046 GEN_VEXT_VX(vwadd_vx_w, 8)
1047 GEN_VEXT_VX(vwsub_vx_b, 2)
1048 GEN_VEXT_VX(vwsub_vx_h, 4)
1049 GEN_VEXT_VX(vwsub_vx_w, 8)
1050 GEN_VEXT_VX(vwaddu_wx_b, 2)
1051 GEN_VEXT_VX(vwaddu_wx_h, 4)
1052 GEN_VEXT_VX(vwaddu_wx_w, 8)
1053 GEN_VEXT_VX(vwsubu_wx_b, 2)
1054 GEN_VEXT_VX(vwsubu_wx_h, 4)
1055 GEN_VEXT_VX(vwsubu_wx_w, 8)
1056 GEN_VEXT_VX(vwadd_wx_b, 2)
1057 GEN_VEXT_VX(vwadd_wx_h, 4)
1058 GEN_VEXT_VX(vwadd_wx_w, 8)
1059 GEN_VEXT_VX(vwsub_wx_b, 2)
1060 GEN_VEXT_VX(vwsub_wx_h, 4)
1061 GEN_VEXT_VX(vwsub_wx_w, 8)
1062 
1063 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1064 #define DO_VADC(N, M, C) (N + M + C)
1065 #define DO_VSBC(N, M, C) (N - M - C)
1066 
1067 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1068 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1069                   CPURISCVState *env, uint32_t desc)          \
1070 {                                                             \
1071     uint32_t vl = env->vl;                                    \
1072     uint32_t esz = sizeof(ETYPE);                             \
1073     uint32_t total_elems =                                    \
1074         vext_get_total_elems(env, desc, esz);                 \
1075     uint32_t vta = vext_vta(desc);                            \
1076     uint32_t i;                                               \
1077                                                               \
1078     for (i = env->vstart; i < vl; i++) {                      \
1079         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1080         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1081         ETYPE carry = vext_elem_mask(v0, i);                  \
1082                                                               \
1083         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1084     }                                                         \
1085     env->vstart = 0;                                          \
1086     /* set tail elements to 1s */                             \
1087     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1088 }
1089 
1090 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1091 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1092 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1093 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1094 
1095 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1096 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1097 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1098 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1099 
1100 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1101 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1102                   CPURISCVState *env, uint32_t desc)                     \
1103 {                                                                        \
1104     uint32_t vl = env->vl;                                               \
1105     uint32_t esz = sizeof(ETYPE);                                        \
1106     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1107     uint32_t vta = vext_vta(desc);                                       \
1108     uint32_t i;                                                          \
1109                                                                          \
1110     for (i = env->vstart; i < vl; i++) {                                 \
1111         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1112         ETYPE carry = vext_elem_mask(v0, i);                             \
1113                                                                          \
1114         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1115     }                                                                    \
1116     env->vstart = 0;                                                     \
1117     /* set tail elements to 1s */                                        \
1118     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1119 }
1120 
1121 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1122 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1123 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1124 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1125 
1126 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1127 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1128 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1129 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1130 
1131 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1132                           (__typeof(N))(N + M) < N)
1133 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
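
/*
 * Carry/borrow examples at 8 bits (arbitrary values): DO_MADC(200, 100, 0)
 * evaluates (uint8_t)(200 + 100) = 44 < 200, so the carry-out is 1;
 * DO_MADC(200, 55, 1) evaluates (uint8_t)(200 + 55 + 1) = 0 <= 200, also 1.
 * DO_MSBC(5, 9, 0) is 5 < 9, i.e. a borrow is generated.
 */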
1134 
1135 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1136 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1137                   CPURISCVState *env, uint32_t desc)          \
1138 {                                                             \
1139     uint32_t vl = env->vl;                                    \
1140     uint32_t vm = vext_vm(desc);                              \
1141     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1142     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1143     uint32_t i;                                               \
1144                                                               \
1145     for (i = env->vstart; i < vl; i++) {                      \
1146         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1147         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1148         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1149         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1150     }                                                         \
1151     env->vstart = 0;                                          \
1152     /*
1153      * mask destination register is always tail-agnostic
1154      * set tail elements to 1s
1155      */                                                       \
1156     if (vta_all_1s) {                                         \
1157         for (; i < total_elems; i++) {                        \
1158             vext_set_elem_mask(vd, i, 1);                     \
1159         }                                                     \
1160     }                                                         \
1161 }
1162 
1163 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1164 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1165 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1167 
1168 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1169 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1170 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1172 
1173 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1174 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1175                   void *vs2, CPURISCVState *env, uint32_t desc) \
1176 {                                                               \
1177     uint32_t vl = env->vl;                                      \
1178     uint32_t vm = vext_vm(desc);                                \
1179     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
1180     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1181     uint32_t i;                                                 \
1182                                                                 \
1183     for (i = env->vstart; i < vl; i++) {                        \
1184         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1185         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1186         vext_set_elem_mask(vd, i,                               \
1187                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1188     }                                                           \
1189     env->vstart = 0;                                            \
1190     /*
1191      * mask destination register is always tail-agnostic
1192      * set tail elements to 1s
1193      */                                                         \
1194     if (vta_all_1s) {                                           \
1195         for (; i < total_elems; i++) {                          \
1196             vext_set_elem_mask(vd, i, 1);                       \
1197         }                                                       \
1198     }                                                           \
1199 }
1200 
1201 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1204 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1205 
1206 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1209 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1210 
1211 /* Vector Bitwise Logical Instructions */
1212 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1213 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1215 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1216 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1217 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1219 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1220 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1221 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1223 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1224 GEN_VEXT_VV(vand_vv_b, 1)
1225 GEN_VEXT_VV(vand_vv_h, 2)
1226 GEN_VEXT_VV(vand_vv_w, 4)
1227 GEN_VEXT_VV(vand_vv_d, 8)
1228 GEN_VEXT_VV(vor_vv_b, 1)
1229 GEN_VEXT_VV(vor_vv_h, 2)
1230 GEN_VEXT_VV(vor_vv_w, 4)
1231 GEN_VEXT_VV(vor_vv_d, 8)
1232 GEN_VEXT_VV(vxor_vv_b, 1)
1233 GEN_VEXT_VV(vxor_vv_h, 2)
1234 GEN_VEXT_VV(vxor_vv_w, 4)
1235 GEN_VEXT_VV(vxor_vv_d, 8)
1236 
1237 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1238 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1240 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1241 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1242 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1244 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1245 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1246 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1248 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1249 GEN_VEXT_VX(vand_vx_b, 1)
1250 GEN_VEXT_VX(vand_vx_h, 2)
1251 GEN_VEXT_VX(vand_vx_w, 4)
1252 GEN_VEXT_VX(vand_vx_d, 8)
1253 GEN_VEXT_VX(vor_vx_b, 1)
1254 GEN_VEXT_VX(vor_vx_h, 2)
1255 GEN_VEXT_VX(vor_vx_w, 4)
1256 GEN_VEXT_VX(vor_vx_d, 8)
1257 GEN_VEXT_VX(vxor_vx_b, 1)
1258 GEN_VEXT_VX(vxor_vx_h, 2)
1259 GEN_VEXT_VX(vxor_vx_w, 4)
1260 GEN_VEXT_VX(vxor_vx_d, 8)
1261 
1262 /* Vector Single-Width Bit Shift Instructions */
1263 #define DO_SLL(N, M)  (N << (M))
1264 #define DO_SRL(N, M)  (N >> (M))
1265 
1266 /* generate the helpers for shift instructions with two vector operands */
1267 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1268 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1269                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1270 {                                                                         \
1271     uint32_t vm = vext_vm(desc);                                          \
1272     uint32_t vl = env->vl;                                                \
1273     uint32_t esz = sizeof(TS1);                                           \
1274     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1275     uint32_t vta = vext_vta(desc);                                        \
1276     uint32_t vma = vext_vma(desc);                                        \
1277     uint32_t i;                                                           \
1278                                                                           \
1279     for (i = env->vstart; i < vl; i++) {                                  \
1280         if (!vm && !vext_elem_mask(v0, i)) {                              \
1281             /* set masked-off elements to 1s */                           \
1282             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1283             continue;                                                     \
1284         }                                                                 \
1285         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1286         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1287         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1288     }                                                                     \
1289     env->vstart = 0;                                                      \
1290     /* set tail elements to 1s */                                         \
1291     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1292 }
1293 
1294 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1295 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1296 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1297 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1298 
1299 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1300 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1301 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1302 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1303 
1304 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1305 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1306 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1307 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
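
/*
 * Illustrative sketch (hypothetical helper, not part of the upstream
 * file): what GEN_VEXT_SHIFT_VV generates for one element of vsra.vv
 * at SEW=8.  The MASK argument truncates the shift amount to
 * log2(SEW) bits as the RVV spec requires, and the signed TS2 type
 * makes DO_SRL an arithmetic shift on the compilers QEMU supports.
 * The narrowing variants further down reuse the same macro with a
 * wider source type and a (2 * SEW - 1) mask.
 */
static inline int8_t example_vsra_b_element(int8_t s2, uint8_t s1)
{
    return DO_SRL(s2, s1 & 0x7);    /* expands to s2 >> (s1 & 0x7) */
}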
1308 
1309 /* generate the helpers for shift instructions with one vector and one scalar */
1310 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1311 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1312                   void *vs2, CPURISCVState *env,            \
1313                   uint32_t desc)                            \
1314 {                                                           \
1315     uint32_t vm = vext_vm(desc);                            \
1316     uint32_t vl = env->vl;                                  \
1317     uint32_t esz = sizeof(TD);                              \
1318     uint32_t total_elems =                                  \
1319         vext_get_total_elems(env, desc, esz);               \
1320     uint32_t vta = vext_vta(desc);                          \
1321     uint32_t vma = vext_vma(desc);                          \
1322     uint32_t i;                                             \
1323                                                             \
1324     for (i = env->vstart; i < vl; i++) {                    \
1325         if (!vm && !vext_elem_mask(v0, i)) {                \
1326             /* set masked-off elements to 1s */             \
1327             vext_set_elems_1s(vd, vma, i * esz,             \
1328                               (i + 1) * esz);               \
1329             continue;                                       \
1330         }                                                   \
1331         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1332         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1333     }                                                       \
1334     env->vstart = 0;                                        \
1335     /* set tail elements to 1s */                           \
1336     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1337 }
1338 
1339 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1340 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1341 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1342 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1343 
1344 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1345 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1346 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1347 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1348 
1349 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1350 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1351 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1352 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1353 
1354 /* Vector Narrowing Integer Right Shift Instructions */
1355 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1356 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1357 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1358 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1359 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1360 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1361 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1362 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1363 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1364 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1367 
1368 /* Vector Integer Comparison Instructions */
1369 #define DO_MSEQ(N, M) (N == M)
1370 #define DO_MSNE(N, M) (N != M)
1371 #define DO_MSLT(N, M) (N < M)
1372 #define DO_MSLE(N, M) (N <= M)
1373 #define DO_MSGT(N, M) (N > M)
1374 
1375 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1376 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1377                   CPURISCVState *env, uint32_t desc)          \
1378 {                                                             \
1379     uint32_t vm = vext_vm(desc);                              \
1380     uint32_t vl = env->vl;                                    \
1381     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1382     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1383     uint32_t vma = vext_vma(desc);                            \
1384     uint32_t i;                                               \
1385                                                               \
1386     for (i = env->vstart; i < vl; i++) {                      \
1387         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1388         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1389         if (!vm && !vext_elem_mask(v0, i)) {                  \
1390             /* set masked-off elements to 1s */               \
1391             if (vma) {                                        \
1392                 vext_set_elem_mask(vd, i, 1);                 \
1393             }                                                 \
1394             continue;                                         \
1395         }                                                     \
1396         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1397     }                                                         \
1398     env->vstart = 0;                                          \
1399     /*
1400      * mask destination register is always tail-agnostic
1401      * set tail elements to 1s
1402      */                                                       \
1403     if (vta_all_1s) {                                         \
1404         for (; i < total_elems; i++) {                        \
1405             vext_set_elem_mask(vd, i, 1);                     \
1406         }                                                     \
1407     }                                                         \
1408 }
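
/*
 * Illustrative sketch (hypothetical helper, not part of the upstream
 * file): unlike the arithmetic helpers, the compare helpers produce a
 * single mask *bit* per element, which is why the tail above is
 * filled bit by bit when vta_all_1s is set.  One loop iteration of
 * vmseq.vv at SEW=8 boils down to:
 */
static inline void
example_vmseq_b_element(void *vd, void *vs1, void *vs2, int i)
{
    uint8_t s1 = *((uint8_t *)vs1 + H1(i));
    uint8_t s2 = *((uint8_t *)vs2 + H1(i));

    vext_set_elem_mask(vd, i, DO_MSEQ(s2, s1));
}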
1409 
1410 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1411 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1412 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1413 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1414 
1415 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1416 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1417 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1418 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1419 
1420 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1421 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1422 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1423 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1424 
1425 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1426 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1427 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1428 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1429 
1430 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1431 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1432 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1433 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1434 
1435 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1436 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1437 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1438 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1439 
1440 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1441 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1442                   CPURISCVState *env, uint32_t desc)                \
1443 {                                                                   \
1444     uint32_t vm = vext_vm(desc);                                    \
1445     uint32_t vl = env->vl;                                          \
1446     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1447     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1448     uint32_t vma = vext_vma(desc);                                  \
1449     uint32_t i;                                                     \
1450                                                                     \
1451     for (i = env->vstart; i < vl; i++) {                            \
1452         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1453         if (!vm && !vext_elem_mask(v0, i)) {                        \
1454             /* set masked-off elements to 1s */                     \
1455             if (vma) {                                              \
1456                 vext_set_elem_mask(vd, i, 1);                       \
1457             }                                                       \
1458             continue;                                               \
1459         }                                                           \
1460         vext_set_elem_mask(vd, i,                                   \
1461                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1462     }                                                               \
1463     env->vstart = 0;                                                \
1464     /*
1465      * mask destination register is always tail-agnostic
1466      * set tail elements to 1s
1467      */                                                             \
1468     if (vta_all_1s) {                                               \
1469         for (; i < total_elems; i++) {                              \
1470             vext_set_elem_mask(vd, i, 1);                           \
1471         }                                                           \
1472     }                                                               \
1473 }
1474 
1475 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1476 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1477 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1478 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1479 
1480 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1481 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1482 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1483 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1484 
1485 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1486 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1487 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1488 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1489 
1490 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1491 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1492 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1493 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1494 
1495 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1496 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1497 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1498 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1499 
1500 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1501 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1502 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1503 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1504 
1505 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1506 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1507 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1508 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1509 
1510 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1511 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1512 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1513 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1514 
1515 /* Vector Integer Min/Max Instructions */
1516 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1517 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1518 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1519 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1520 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1521 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1522 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1523 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1524 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1525 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1526 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1527 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1528 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1529 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1530 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1531 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1532 GEN_VEXT_VV(vminu_vv_b, 1)
1533 GEN_VEXT_VV(vminu_vv_h, 2)
1534 GEN_VEXT_VV(vminu_vv_w, 4)
1535 GEN_VEXT_VV(vminu_vv_d, 8)
1536 GEN_VEXT_VV(vmin_vv_b, 1)
1537 GEN_VEXT_VV(vmin_vv_h, 2)
1538 GEN_VEXT_VV(vmin_vv_w, 4)
1539 GEN_VEXT_VV(vmin_vv_d, 8)
1540 GEN_VEXT_VV(vmaxu_vv_b, 1)
1541 GEN_VEXT_VV(vmaxu_vv_h, 2)
1542 GEN_VEXT_VV(vmaxu_vv_w, 4)
1543 GEN_VEXT_VV(vmaxu_vv_d, 8)
1544 GEN_VEXT_VV(vmax_vv_b, 1)
1545 GEN_VEXT_VV(vmax_vv_h, 2)
1546 GEN_VEXT_VV(vmax_vv_w, 4)
1547 GEN_VEXT_VV(vmax_vv_d, 8)
1548 
1549 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1550 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1551 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1552 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1553 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1554 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1555 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1556 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1557 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1558 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1559 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1560 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1561 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1562 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1563 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1564 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1565 GEN_VEXT_VX(vminu_vx_b, 1)
1566 GEN_VEXT_VX(vminu_vx_h, 2)
1567 GEN_VEXT_VX(vminu_vx_w, 4)
1568 GEN_VEXT_VX(vminu_vx_d, 8)
1569 GEN_VEXT_VX(vmin_vx_b, 1)
1570 GEN_VEXT_VX(vmin_vx_h, 2)
1571 GEN_VEXT_VX(vmin_vx_w, 4)
1572 GEN_VEXT_VX(vmin_vx_d, 8)
1573 GEN_VEXT_VX(vmaxu_vx_b, 1)
1574 GEN_VEXT_VX(vmaxu_vx_h, 2)
1575 GEN_VEXT_VX(vmaxu_vx_w, 4)
1576 GEN_VEXT_VX(vmaxu_vx_d, 8)
1577 GEN_VEXT_VX(vmax_vx_b, 1)
1578 GEN_VEXT_VX(vmax_vx_h, 2)
1579 GEN_VEXT_VX(vmax_vx_w, 4)
1580 GEN_VEXT_VX(vmax_vx_d, 8)
1581 
1582 /* Vector Single-Width Integer Multiply Instructions */
1583 #define DO_MUL(N, M) (N * M)
1584 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1585 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1586 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1587 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1588 GEN_VEXT_VV(vmul_vv_b, 1)
1589 GEN_VEXT_VV(vmul_vv_h, 2)
1590 GEN_VEXT_VV(vmul_vv_w, 4)
1591 GEN_VEXT_VV(vmul_vv_d, 8)
1592 
1593 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1594 {
1595     return (int16_t)s2 * (int16_t)s1 >> 8;
1596 }
1597 
1598 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1599 {
1600     return (int32_t)s2 * (int32_t)s1 >> 16;
1601 }
1602 
1603 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1604 {
1605     return (int64_t)s2 * (int64_t)s1 >> 32;
1606 }
1607 
1608 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1609 {
1610     uint64_t hi_64, lo_64;
1611 
1612     muls64(&lo_64, &hi_64, s1, s2);
1613     return hi_64;
1614 }
1615 
1616 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1617 {
1618     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1619 }
1620 
1621 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1622 {
1623     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1624 }
1625 
1626 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1627 {
1628     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1629 }
1630 
1631 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1632 {
1633     uint64_t hi_64, lo_64;
1634 
1635     mulu64(&lo_64, &hi_64, s2, s1);
1636     return hi_64;
1637 }
1638 
1639 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1640 {
1641     return (int16_t)s2 * (uint16_t)s1 >> 8;
1642 }
1643 
1644 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1645 {
1646     return (int32_t)s2 * (uint32_t)s1 >> 16;
1647 }
1648 
1649 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1650 {
1651     return (int64_t)s2 * (uint64_t)s1 >> 32;
1652 }
1653 
1654 /*
1655  * Let  A = signed operand (s2),
1656  *      B = unsigned operand (s1),
1657  *      P = mulu64(A, B), the unsigned product of the raw bit patterns
1658  *
1659  * When A < 0, the unsigned bit pattern of A is 2 ** 64 + A, so
1660  *
1661  *      P  = (2 ** 64 + A) * B
1662  *         = A * B + 2 ** 64 * B
1663  *
1664  * and the signed product is
1665  *
1666  *      SP = A * B
1667  *         = P - 2 ** 64 * B
1668  *
1669  * i.e. subtract B from the high 64 bits of P when A is negative:
1670  *      HI_P -= (A < 0 ? B : 0)
1671  */
1672 
1673 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1674 {
1675     uint64_t hi_64, lo_64;
1676 
1677     mulu64(&lo_64, &hi_64, s2, s1);
1678 
1679     hi_64 -= s2 < 0 ? s1 : 0;
1680     return hi_64;
1681 }
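
/*
 * Worked example (illustrative, not part of the upstream file): for
 * s2 = -2 and s1 = 3, mulu64() sees the bit pattern 2 ** 64 - 2 and
 * returns hi_64 = 2, lo_64 = 2 ** 64 - 6.  Because s2 < 0 we subtract
 * s1, giving hi_64 = -1, the correct high half of the signed
 * product -6.
 */
static inline int64_t example_mulhsu_d_check(void)
{
    return do_mulhsu_d(-2, 3);    /* == -1 */
}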
1682 
1683 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1684 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1685 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1686 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1687 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1688 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1689 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1690 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1691 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1692 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1693 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1694 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1695 GEN_VEXT_VV(vmulh_vv_b, 1)
1696 GEN_VEXT_VV(vmulh_vv_h, 2)
1697 GEN_VEXT_VV(vmulh_vv_w, 4)
1698 GEN_VEXT_VV(vmulh_vv_d, 8)
1699 GEN_VEXT_VV(vmulhu_vv_b, 1)
1700 GEN_VEXT_VV(vmulhu_vv_h, 2)
1701 GEN_VEXT_VV(vmulhu_vv_w, 4)
1702 GEN_VEXT_VV(vmulhu_vv_d, 8)
1703 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1704 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1705 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1706 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1707 
1708 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1709 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1710 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1711 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1712 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1713 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1714 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1715 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1716 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1717 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1718 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1719 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1720 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1721 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1722 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1723 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1724 GEN_VEXT_VX(vmul_vx_b, 1)
1725 GEN_VEXT_VX(vmul_vx_h, 2)
1726 GEN_VEXT_VX(vmul_vx_w, 4)
1727 GEN_VEXT_VX(vmul_vx_d, 8)
1728 GEN_VEXT_VX(vmulh_vx_b, 1)
1729 GEN_VEXT_VX(vmulh_vx_h, 2)
1730 GEN_VEXT_VX(vmulh_vx_w, 4)
1731 GEN_VEXT_VX(vmulh_vx_d, 8)
1732 GEN_VEXT_VX(vmulhu_vx_b, 1)
1733 GEN_VEXT_VX(vmulhu_vx_h, 2)
1734 GEN_VEXT_VX(vmulhu_vx_w, 4)
1735 GEN_VEXT_VX(vmulhu_vx_d, 8)
1736 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1737 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1738 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1739 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1740 
1741 /* Vector Integer Divide Instructions */
1742 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1743 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1744 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1745         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1746 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1747         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
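
/*
 * Illustrative sketch (hypothetical helper, not part of the upstream
 * file): the macros above encode the RVV results for the special
 * cases instead of trapping -- division by zero yields all ones
 * (unsigned) or -1 (signed) with the remainder left equal to the
 * dividend, and signed overflow (INTn_MIN / -1) yields the dividend
 * with a remainder of 0.
 */
static inline void example_rvv_div_by_zero(uint8_t n)
{
    uint8_t zero = 0;
    uint8_t q = DO_DIVU(n, zero);    /* == UINT8_MAX */
    uint8_t r = DO_REMU(n, zero);    /* == n */

    (void)q;
    (void)r;
}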
1748 
1749 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1750 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1751 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1752 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1753 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1754 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1755 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1756 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1757 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1758 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1759 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1760 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1761 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1762 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1763 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1764 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1765 GEN_VEXT_VV(vdivu_vv_b, 1)
1766 GEN_VEXT_VV(vdivu_vv_h, 2)
1767 GEN_VEXT_VV(vdivu_vv_w, 4)
1768 GEN_VEXT_VV(vdivu_vv_d, 8)
1769 GEN_VEXT_VV(vdiv_vv_b, 1)
1770 GEN_VEXT_VV(vdiv_vv_h, 2)
1771 GEN_VEXT_VV(vdiv_vv_w, 4)
1772 GEN_VEXT_VV(vdiv_vv_d, 8)
1773 GEN_VEXT_VV(vremu_vv_b, 1)
1774 GEN_VEXT_VV(vremu_vv_h, 2)
1775 GEN_VEXT_VV(vremu_vv_w, 4)
1776 GEN_VEXT_VV(vremu_vv_d, 8)
1777 GEN_VEXT_VV(vrem_vv_b, 1)
1778 GEN_VEXT_VV(vrem_vv_h, 2)
1779 GEN_VEXT_VV(vrem_vv_w, 4)
1780 GEN_VEXT_VV(vrem_vv_d, 8)
1781 
1782 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1783 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1784 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1785 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1786 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1787 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1788 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1789 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1790 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1791 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1792 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1793 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1794 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1795 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1796 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1797 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1798 GEN_VEXT_VX(vdivu_vx_b, 1)
1799 GEN_VEXT_VX(vdivu_vx_h, 2)
1800 GEN_VEXT_VX(vdivu_vx_w, 4)
1801 GEN_VEXT_VX(vdivu_vx_d, 8)
1802 GEN_VEXT_VX(vdiv_vx_b, 1)
1803 GEN_VEXT_VX(vdiv_vx_h, 2)
1804 GEN_VEXT_VX(vdiv_vx_w, 4)
1805 GEN_VEXT_VX(vdiv_vx_d, 8)
1806 GEN_VEXT_VX(vremu_vx_b, 1)
1807 GEN_VEXT_VX(vremu_vx_h, 2)
1808 GEN_VEXT_VX(vremu_vx_w, 4)
1809 GEN_VEXT_VX(vremu_vx_d, 8)
1810 GEN_VEXT_VX(vrem_vx_b, 1)
1811 GEN_VEXT_VX(vrem_vx_h, 2)
1812 GEN_VEXT_VX(vrem_vx_w, 4)
1813 GEN_VEXT_VX(vrem_vx_d, 8)
1814 
1815 /* Vector Widening Integer Multiply Instructions */
1816 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1817 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1818 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1819 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1820 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1821 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1822 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1823 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1824 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1825 GEN_VEXT_VV(vwmul_vv_b, 2)
1826 GEN_VEXT_VV(vwmul_vv_h, 4)
1827 GEN_VEXT_VV(vwmul_vv_w, 8)
1828 GEN_VEXT_VV(vwmulu_vv_b, 2)
1829 GEN_VEXT_VV(vwmulu_vv_h, 4)
1830 GEN_VEXT_VV(vwmulu_vv_w, 8)
1831 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1832 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1833 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1834 
1835 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1836 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1837 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1838 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1839 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1840 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1841 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1842 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1843 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1844 GEN_VEXT_VX(vwmul_vx_b, 2)
1845 GEN_VEXT_VX(vwmul_vx_h, 4)
1846 GEN_VEXT_VX(vwmul_vx_w, 8)
1847 GEN_VEXT_VX(vwmulu_vx_b, 2)
1848 GEN_VEXT_VX(vwmulu_vx_h, 4)
1849 GEN_VEXT_VX(vwmulu_vx_w, 8)
1850 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1851 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1852 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1853 
1854 /* Vector Single-Width Integer Multiply-Add Instructions */
1855 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1856 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1857 {                                                                  \
1858     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1859     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1860     TD d = *((TD *)vd + HD(i));                                    \
1861     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1862 }
1863 
1864 #define DO_MACC(N, M, D) (M * N + D)
1865 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1866 #define DO_MADD(N, M, D) (M * D + N)
1867 #define DO_NMSUB(N, M, D) (-(M * D) + N)
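
/*
 * Illustrative sketch (hypothetical helper, not part of the upstream
 * file): with the OP(s2, s1, d) argument order used above, DO_MACC
 * computes vd = (vs1 * vs2) + vd while DO_MADD computes
 * vd = (vs1 * vd) + vs2, matching the two RVV multiply-add forms.
 */
static inline int32_t example_vmacc_w_element(int32_t d, int32_t s1, int32_t s2)
{
    return DO_MACC(s2, s1, d);    /* s1 * s2 + d */
}
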
1868 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1869 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1870 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1871 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1872 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1873 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1874 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1875 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1876 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1877 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1878 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1879 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1880 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1881 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1882 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1883 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1884 GEN_VEXT_VV(vmacc_vv_b, 1)
1885 GEN_VEXT_VV(vmacc_vv_h, 2)
1886 GEN_VEXT_VV(vmacc_vv_w, 4)
1887 GEN_VEXT_VV(vmacc_vv_d, 8)
1888 GEN_VEXT_VV(vnmsac_vv_b, 1)
1889 GEN_VEXT_VV(vnmsac_vv_h, 2)
1890 GEN_VEXT_VV(vnmsac_vv_w, 4)
1891 GEN_VEXT_VV(vnmsac_vv_d, 8)
1892 GEN_VEXT_VV(vmadd_vv_b, 1)
1893 GEN_VEXT_VV(vmadd_vv_h, 2)
1894 GEN_VEXT_VV(vmadd_vv_w, 4)
1895 GEN_VEXT_VV(vmadd_vv_d, 8)
1896 GEN_VEXT_VV(vnmsub_vv_b, 1)
1897 GEN_VEXT_VV(vnmsub_vv_h, 2)
1898 GEN_VEXT_VV(vnmsub_vv_w, 4)
1899 GEN_VEXT_VV(vnmsub_vv_d, 8)
1900 
1901 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1902 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1903 {                                                                   \
1904     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1905     TD d = *((TD *)vd + HD(i));                                     \
1906     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1907 }
1908 
1909 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1910 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1911 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1912 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1913 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1914 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1915 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1916 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1917 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1918 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1919 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1920 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1921 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1922 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1923 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1924 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1925 GEN_VEXT_VX(vmacc_vx_b, 1)
1926 GEN_VEXT_VX(vmacc_vx_h, 2)
1927 GEN_VEXT_VX(vmacc_vx_w, 4)
1928 GEN_VEXT_VX(vmacc_vx_d, 8)
1929 GEN_VEXT_VX(vnmsac_vx_b, 1)
1930 GEN_VEXT_VX(vnmsac_vx_h, 2)
1931 GEN_VEXT_VX(vnmsac_vx_w, 4)
1932 GEN_VEXT_VX(vnmsac_vx_d, 8)
1933 GEN_VEXT_VX(vmadd_vx_b, 1)
1934 GEN_VEXT_VX(vmadd_vx_h, 2)
1935 GEN_VEXT_VX(vmadd_vx_w, 4)
1936 GEN_VEXT_VX(vmadd_vx_d, 8)
1937 GEN_VEXT_VX(vnmsub_vx_b, 1)
1938 GEN_VEXT_VX(vnmsub_vx_h, 2)
1939 GEN_VEXT_VX(vnmsub_vx_w, 4)
1940 GEN_VEXT_VX(vnmsub_vx_d, 8)
1941 
1942 /* Vector Widening Integer Multiply-Add Instructions */
1943 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1944 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1945 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1946 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1947 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1948 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1949 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1950 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1951 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1952 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1953 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1954 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1955 GEN_VEXT_VV(vwmacc_vv_b, 2)
1956 GEN_VEXT_VV(vwmacc_vv_h, 4)
1957 GEN_VEXT_VV(vwmacc_vv_w, 8)
1958 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1959 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1960 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1961 
1962 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1963 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1964 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1965 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1968 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1969 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1970 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1971 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1972 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1973 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1974 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1975 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1976 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1977 GEN_VEXT_VX(vwmacc_vx_b, 2)
1978 GEN_VEXT_VX(vwmacc_vx_h, 4)
1979 GEN_VEXT_VX(vwmacc_vx_w, 8)
1980 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1981 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1982 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1983 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1984 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1985 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1986 
1987 /* Vector Integer Merge and Move Instructions */
1988 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1989 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1990                   uint32_t desc)                                     \
1991 {                                                                    \
1992     uint32_t vl = env->vl;                                           \
1993     uint32_t esz = sizeof(ETYPE);                                    \
1994     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1995     uint32_t vta = vext_vta(desc);                                   \
1996     uint32_t i;                                                      \
1997                                                                      \
1998     for (i = env->vstart; i < vl; i++) {                             \
1999         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2000         *((ETYPE *)vd + H(i)) = s1;                                  \
2001     }                                                                \
2002     env->vstart = 0;                                                 \
2003     /* set tail elements to 1s */                                    \
2004     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2005 }
2006 
2007 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2008 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2009 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2010 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2011 
2012 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2013 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2014                   uint32_t desc)                                     \
2015 {                                                                    \
2016     uint32_t vl = env->vl;                                           \
2017     uint32_t esz = sizeof(ETYPE);                                    \
2018     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2019     uint32_t vta = vext_vta(desc);                                   \
2020     uint32_t i;                                                      \
2021                                                                      \
2022     for (i = env->vstart; i < vl; i++) {                             \
2023         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2024     }                                                                \
2025     env->vstart = 0;                                                 \
2026     /* set tail elements to 1s */                                    \
2027     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2028 }
2029 
2030 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2031 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2032 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2033 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2034 
2035 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2036 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2037                   CPURISCVState *env, uint32_t desc)                 \
2038 {                                                                    \
2039     uint32_t vl = env->vl;                                           \
2040     uint32_t esz = sizeof(ETYPE);                                    \
2041     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2042     uint32_t vta = vext_vta(desc);                                   \
2043     uint32_t i;                                                      \
2044                                                                      \
2045     for (i = env->vstart; i < vl; i++) {                             \
2046         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2047         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2048     }                                                                \
2049     env->vstart = 0;                                                 \
2050     /* set tail elements to 1s */                                    \
2051     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2052 }
2053 
2054 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2055 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2056 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2057 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2058 
2059 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2060 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2061                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2062 {                                                                    \
2063     uint32_t vl = env->vl;                                           \
2064     uint32_t esz = sizeof(ETYPE);                                    \
2065     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2066     uint32_t vta = vext_vta(desc);                                   \
2067     uint32_t i;                                                      \
2068                                                                      \
2069     for (i = env->vstart; i < vl; i++) {                             \
2070         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2071         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2072                    (ETYPE)(target_long)s1);                          \
2073         *((ETYPE *)vd + H(i)) = d;                                   \
2074     }                                                                \
2075     env->vstart = 0;                                                 \
2076     /* set tail elements to 1s */                                    \
2077     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2078 }
2079 
2080 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2081 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2082 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2083 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2084 
2085 /*
2086  * Vector Fixed-Point Arithmetic Instructions
2087  */
2088 
2089 /* Vector Single-Width Saturating Add and Subtract */
2090 
2091 /*
2092  * Fixed-point instructions need a rounding mode and may saturate, so
2093  * define the common fixed-point macros here.
2094  */
2095 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2096                           CPURISCVState *env, int vxrm);
2097 
2098 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2099 static inline void                                                  \
2100 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2101           CPURISCVState *env, int vxrm)                             \
2102 {                                                                   \
2103     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2104     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2105     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2106 }
2107 
2108 static inline void
2109 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2110              CPURISCVState *env,
2111              uint32_t vl, uint32_t vm, int vxrm,
2112              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2113 {
2114     for (uint32_t i = env->vstart; i < vl; i++) {
2115         if (!vm && !vext_elem_mask(v0, i)) {
2116             /* set masked-off elements to 1s */
2117             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2118             continue;
2119         }
2120         fn(vd, vs1, vs2, i, env, vxrm);
2121     }
2122     env->vstart = 0;
2123 }
2124 
2125 static inline void
2126 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2127              CPURISCVState *env,
2128              uint32_t desc,
2129              opivv2_rm_fn *fn, uint32_t esz)
2130 {
2131     uint32_t vm = vext_vm(desc);
2132     uint32_t vl = env->vl;
2133     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2134     uint32_t vta = vext_vta(desc);
2135     uint32_t vma = vext_vma(desc);
2136 
2137     switch (env->vxrm) {
2138     case 0: /* rnu */
2139         vext_vv_rm_1(vd, v0, vs1, vs2,
2140                      env, vl, vm, 0, fn, vma, esz);
2141         break;
2142     case 1: /* rne */
2143         vext_vv_rm_1(vd, v0, vs1, vs2,
2144                      env, vl, vm, 1, fn, vma, esz);
2145         break;
2146     case 2: /* rdn */
2147         vext_vv_rm_1(vd, v0, vs1, vs2,
2148                      env, vl, vm, 2, fn, vma, esz);
2149         break;
2150     default: /* rod */
2151         vext_vv_rm_1(vd, v0, vs1, vs2,
2152                      env, vl, vm, 3, fn, vma, esz);
2153         break;
2154     }
2155     /* set tail elements to 1s */
2156     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2157 }
2158 
2159 /* generate helpers for fixed point instructions with OPIVV format */
2160 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2161 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2162                   CPURISCVState *env, uint32_t desc)            \
2163 {                                                               \
2164     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2165                  do_##NAME, ESZ);                               \
2166 }
2167 
2168 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2169 {
2170     uint8_t res = a + b;
2171     if (res < a) {
2172         res = UINT8_MAX;
2173         env->vxsat = 0x1;
2174     }
2175     return res;
2176 }
2177 
2178 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2179                                uint16_t b)
2180 {
2181     uint16_t res = a + b;
2182     if (res < a) {
2183         res = UINT16_MAX;
2184         env->vxsat = 0x1;
2185     }
2186     return res;
2187 }
2188 
2189 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2190                                uint32_t b)
2191 {
2192     uint32_t res = a + b;
2193     if (res < a) {
2194         res = UINT32_MAX;
2195         env->vxsat = 0x1;
2196     }
2197     return res;
2198 }
2199 
2200 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2201                                uint64_t b)
2202 {
2203     uint64_t res = a + b;
2204     if (res < a) {
2205         res = UINT64_MAX;
2206         env->vxsat = 0x1;
2207     }
2208     return res;
2209 }
2210 
2211 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2212 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2213 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2214 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2215 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2216 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2217 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2218 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
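
/*
 * Illustrative sketch (hypothetical, not part of the upstream file):
 * what the two macro layers expand to for vsaddu_vv_b.  OPIVV2_RM
 * produces a per-element function like the one below (named
 * do_vsaddu_vv_b in the real expansion), and GEN_VEXT_VV_RM wraps it
 * in the vext_vv_rm_2() rounding-mode dispatch and tail handling.
 */
static inline void example_do_vsaddu_vv_b(void *vd, void *vs1, void *vs2,
                                          int i, CPURISCVState *env, int vxrm)
{
    uint8_t s1 = *((uint8_t *)vs1 + H1(i));
    uint8_t s2 = *((uint8_t *)vs2 + H1(i));

    *((uint8_t *)vd + H1(i)) = saddu8(env, vxrm, s2, s1);
}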
2219 
2220 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2221                           CPURISCVState *env, int vxrm);
2222 
2223 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2224 static inline void                                                  \
2225 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2226           CPURISCVState *env, int vxrm)                             \
2227 {                                                                   \
2228     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2229     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2230 }
2231 
2232 static inline void
2233 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2234              CPURISCVState *env,
2235              uint32_t vl, uint32_t vm, int vxrm,
2236              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2237 {
2238     for (uint32_t i = env->vstart; i < vl; i++) {
2239         if (!vm && !vext_elem_mask(v0, i)) {
2240             /* set masked-off elements to 1s */
2241             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2242             continue;
2243         }
2244         fn(vd, s1, vs2, i, env, vxrm);
2245     }
2246     env->vstart = 0;
2247 }
2248 
2249 static inline void
2250 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2251              CPURISCVState *env,
2252              uint32_t desc,
2253              opivx2_rm_fn *fn, uint32_t esz)
2254 {
2255     uint32_t vm = vext_vm(desc);
2256     uint32_t vl = env->vl;
2257     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2258     uint32_t vta = vext_vta(desc);
2259     uint32_t vma = vext_vma(desc);
2260 
2261     switch (env->vxrm) {
2262     case 0: /* rnu */
2263         vext_vx_rm_1(vd, v0, s1, vs2,
2264                      env, vl, vm, 0, fn, vma, esz);
2265         break;
2266     case 1: /* rne */
2267         vext_vx_rm_1(vd, v0, s1, vs2,
2268                      env, vl, vm, 1, fn, vma, esz);
2269         break;
2270     case 2: /* rdn */
2271         vext_vx_rm_1(vd, v0, s1, vs2,
2272                      env, vl, vm, 2, fn, vma, esz);
2273         break;
2274     default: /* rod */
2275         vext_vx_rm_1(vd, v0, s1, vs2,
2276                      env, vl, vm, 3, fn, vma, esz);
2277         break;
2278     }
2279     /* set tail elements to 1s */
2280     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2281 }
2282 
2283 /* generate helpers for fixed point instructions with OPIVX format */
2284 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2285 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2286                   void *vs2, CPURISCVState *env,          \
2287                   uint32_t desc)                          \
2288 {                                                         \
2289     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2290                  do_##NAME, ESZ);                         \
2291 }
2292 
2293 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2294 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2295 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2296 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2297 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2298 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2299 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2300 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2301 
2302 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2303 {
2304     int8_t res = a + b;
2305     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2306         res = a > 0 ? INT8_MAX : INT8_MIN;
2307         env->vxsat = 0x1;
2308     }
2309     return res;
2310 }
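
/*
 * Illustrative sketch (hypothetical helper, not part of the upstream
 * file): (res ^ a) & (res ^ b) has its sign bit set exactly when a
 * and b share a sign that the wrapped sum does not, i.e. on signed
 * overflow.  For example:
 */
static inline int8_t example_sadd8_saturate(CPURISCVState *env)
{
    return sadd8(env, 0, 100, 100);    /* == INT8_MAX, sets vxsat */
}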
2311 
2312 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2313 {
2314     int16_t res = a + b;
2315     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2316         res = a > 0 ? INT16_MAX : INT16_MIN;
2317         env->vxsat = 0x1;
2318     }
2319     return res;
2320 }
2321 
2322 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2323 {
2324     int32_t res = a + b;
2325     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2326         res = a > 0 ? INT32_MAX : INT32_MIN;
2327         env->vxsat = 0x1;
2328     }
2329     return res;
2330 }
2331 
2332 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2333 {
2334     int64_t res = a + b;
2335     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2336         res = a > 0 ? INT64_MAX : INT64_MIN;
2337         env->vxsat = 0x1;
2338     }
2339     return res;
2340 }
2341 
2342 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2343 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2344 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2345 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2346 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2347 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2348 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2349 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2350 
2351 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2352 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2353 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2354 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2355 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2356 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2357 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2358 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2359 
2360 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2361 {
2362     uint8_t res = a - b;
2363     if (res > a) {
2364         res = 0;
2365         env->vxsat = 0x1;
2366     }
2367     return res;
2368 }
2369 
2370 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2371                                uint16_t b)
2372 {
2373     uint16_t res = a - b;
2374     if (res > a) {
2375         res = 0;
2376         env->vxsat = 0x1;
2377     }
2378     return res;
2379 }
2380 
2381 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2382                                uint32_t b)
2383 {
2384     uint32_t res = a - b;
2385     if (res > a) {
2386         res = 0;
2387         env->vxsat = 0x1;
2388     }
2389     return res;
2390 }
2391 
2392 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2393                                uint64_t b)
2394 {
2395     uint64_t res = a - b;
2396     if (res > a) {
2397         res = 0;
2398         env->vxsat = 0x1;
2399     }
2400     return res;
2401 }
2402 
2403 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2404 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2405 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2406 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2407 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2408 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2409 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2410 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2411 
2412 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2413 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2414 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2415 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2416 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2417 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2418 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2419 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2420 
2421 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2422 {
2423     int8_t res = a - b;
2424     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2425         res = a >= 0 ? INT8_MAX : INT8_MIN;
2426         env->vxsat = 0x1;
2427     }
2428     return res;
2429 }
2430 
2431 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2432 {
2433     int16_t res = a - b;
2434     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2435         res = a >= 0 ? INT16_MAX : INT16_MIN;
2436         env->vxsat = 0x1;
2437     }
2438     return res;
2439 }
2440 
2441 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2442 {
2443     int32_t res = a - b;
2444     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2445         res = a >= 0 ? INT32_MAX : INT32_MIN;
2446         env->vxsat = 0x1;
2447     }
2448     return res;
2449 }
2450 
2451 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2452 {
2453     int64_t res = a - b;
2454     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2455         res = a >= 0 ? INT64_MAX : INT64_MIN;
2456         env->vxsat = 0x1;
2457     }
2458     return res;
2459 }
2460 
2461 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2462 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2463 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2464 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2465 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2466 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2467 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2468 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2469 
2470 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2471 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2472 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2473 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2474 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2475 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2476 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2477 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2478 
2479 /* Vector Single-Width Averaging Add and Subtract */
2480 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2481 {
2482     uint8_t d = extract64(v, shift, 1);
2483     uint8_t d1;
2484     uint64_t D1, D2;
2485 
2486     if (shift == 0 || shift > 64) {
2487         return 0;
2488     }
2489 
2490     d1 = extract64(v, shift - 1, 1);
2491     D1 = extract64(v, 0, shift);
2492     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2493         return d1;
2494     } else if (vxrm == 1) { /* round-to-nearest-even */
2495         if (shift > 1) {
2496             D2 = extract64(v, 0, shift - 1);
2497             return d1 & ((D2 != 0) | d);
2498         } else {
2499             return d1 & d;
2500         }
2501     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2502         return !d & (D1 != 0);
2503     }
2504     return 0; /* round-down (truncate) */
2505 }
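
/*
 * Worked example (illustrative, not part of the upstream file):
 * shifting v = 0b1011 right by 2 is 2.75, so
 * (v >> 2) + get_round(vxrm, v, 2) gives 3 for rnu, 3 for rne,
 * 2 for rdn (truncate) and 3 for rod (jam into the LSB).  The
 * averaging helpers below all round this way.
 */
static inline int64_t example_round_shift(int vxrm, int64_t v, uint8_t shift)
{
    uint8_t round = get_round(vxrm, v, shift);

    return (v >> shift) + round;
}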
2506 
2507 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2508 {
2509     int64_t res = (int64_t)a + b;
2510     uint8_t round = get_round(vxrm, res, 1);
2511 
2512     return (res >> 1) + round;
2513 }
2514 
2515 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2516 {
2517     int64_t res = a + b;
2518     uint8_t round = get_round(vxrm, res, 1);
2519     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2520 
2521     /* With signed overflow, bit 64 is inverse of bit 63. */
2522     return ((res >> 1) ^ over) + round;
2523 }
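
/*
 * Worked check (illustrative, not part of the upstream file): for
 * a = b = INT64_MAX the unwrapped sum is 2 ** 64 - 2, so the true
 * average is INT64_MAX.  The wrapped sum is -2 and (res >> 1) is -1;
 * XORing with 'over' (INT64_MIN here) restores bit 63, giving
 * INT64_MAX as required.
 */
static inline int64_t example_aadd64_check(CPURISCVState *env)
{
    return aadd64(env, 0, INT64_MAX, INT64_MAX);    /* == INT64_MAX */
}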
2524 
2525 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2526 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2527 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2528 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2529 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2530 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2531 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2532 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2533 
2534 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2535 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2536 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2537 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2538 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2539 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2540 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2541 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2542 
2543 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2544                                uint32_t a, uint32_t b)
2545 {
2546     uint64_t res = (uint64_t)a + b;
2547     uint8_t round = get_round(vxrm, res, 1);
2548 
2549     return (res >> 1) + round;
2550 }
2551 
2552 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2553                                uint64_t a, uint64_t b)
2554 {
2555     uint64_t res = a + b;
2556     uint8_t round = get_round(vxrm, res, 1);
2557     uint64_t over = (uint64_t)(res < a) << 63;
2558 
2559     return ((res >> 1) | over) + round;
2560 }
2561 
2562 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2563 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2564 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2565 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2566 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2567 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2568 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2569 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2570 
2571 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2572 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2573 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2574 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2575 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2576 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2577 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2578 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2579 
2580 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2581 {
2582     int64_t res = (int64_t)a - b;
2583     uint8_t round = get_round(vxrm, res, 1);
2584 
2585     return (res >> 1) + round;
2586 }
2587 
2588 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2589 {
2590     int64_t res = (int64_t)a - b;
2591     uint8_t round = get_round(vxrm, res, 1);
2592     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2593 
2594     /* With signed overflow, bit 64 is inverse of bit 63. */
2595     return ((res >> 1) ^ over) + round;
2596 }
2597 
2598 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2599 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2600 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2601 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2602 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2603 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2604 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2605 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2606 
2607 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2608 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2609 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2610 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2611 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2612 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2613 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2614 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2615 
2616 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2617                                uint32_t a, uint32_t b)
2618 {
2619     int64_t res = (int64_t)a - b;
2620     uint8_t round = get_round(vxrm, res, 1);
2621 
2622     return (res >> 1) + round;
2623 }
2624 
2625 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2626                                uint64_t a, uint64_t b)
2627 {
2628     uint64_t res = (uint64_t)a - b;
2629     uint8_t round = get_round(vxrm, res, 1);
2630     uint64_t over = (uint64_t)(res > a) << 63;
2631 
2632     return ((res >> 1) | over) + round;
2633 }
2634 
2635 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2636 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2637 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2638 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2639 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2640 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2641 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2642 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2643 
2644 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2645 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2646 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2647 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2648 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2649 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2650 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2651 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2652 
2653 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2654 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2655 {
2656     uint8_t round;
2657     int16_t res;
2658 
2659     res = (int16_t)a * (int16_t)b;
2660     round = get_round(vxrm, res, 7);
2661     res = (res >> 7) + round;
2662 
2663     if (res > INT8_MAX) {
2664         env->vxsat = 0x1;
2665         return INT8_MAX;
2666     } else if (res < INT8_MIN) {
2667         env->vxsat = 0x1;
2668         return INT8_MIN;
2669     } else {
2670         return res;
2671     }
2672 }
2673 
2674 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2675 {
2676     uint8_t round;
2677     int32_t res;
2678 
2679     res = (int32_t)a * (int32_t)b;
2680     round = get_round(vxrm, res, 15);
2681     res = (res >> 15) + round;
2682 
2683     if (res > INT16_MAX) {
2684         env->vxsat = 0x1;
2685         return INT16_MAX;
2686     } else if (res < INT16_MIN) {
2687         env->vxsat = 0x1;
2688         return INT16_MIN;
2689     } else {
2690         return res;
2691     }
2692 }
2693 
2694 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2695 {
2696     uint8_t round;
2697     int64_t res;
2698 
2699     res = (int64_t)a * (int64_t)b;
2700     round = get_round(vxrm, res, 31);
2701     res = (res >> 31) + round;
2702 
2703     if (res > INT32_MAX) {
2704         env->vxsat = 0x1;
2705         return INT32_MAX;
2706     } else if (res < INT32_MIN) {
2707         env->vxsat = 0x1;
2708         return INT32_MIN;
2709     } else {
2710         return res;
2711     }
2712 }
2713 
2714 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2715 {
2716     uint8_t round;
2717     uint64_t hi_64, lo_64;
2718     int64_t res;
2719 
2720     if (a == INT64_MIN && b == INT64_MIN) {
2721         env->vxsat = 1;
2722         return INT64_MAX;
2723     }
2724 
2725     muls64(&lo_64, &hi_64, a, b);
2726     round = get_round(vxrm, lo_64, 63);
2727     /*
2728      * Cannot overflow, as there are always
2729      * 2 sign bits after multiply.
2730      */
2731     res = (hi_64 << 1) | (lo_64 >> 63);
2732     if (round) {
2733         if (res == INT64_MAX) {
2734             env->vxsat = 1;
2735         } else {
2736             res += 1;
2737         }
2738     }
2739     return res;
2740 }
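
/*
 * vsmul is a signed fixed-point multiply: the 2*SEW-bit product is shifted
 * right by SEW-1 with vxrm rounding and saturated to SEW bits.  For the
 * 64-bit case the product lives in hi_64:lo_64, so bits [126:63] are
 * repacked with (hi_64 << 1) | (lo_64 >> 63) and the 63 discarded low
 * bits feed get_round().  Only INT64_MIN * INT64_MIN can overflow that
 * repacking, which is why it is special-cased above.
 */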
2741 
2742 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2743 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2744 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2745 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2746 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2747 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2748 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2749 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2750 
2751 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2752 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2753 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2754 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2755 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2756 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2757 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2758 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2759 
2760 /* Vector Single-Width Scaling Shift Instructions */
2761 static inline uint8_t
2762 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2763 {
2764     uint8_t round, shift = b & 0x7;
2765     uint8_t res;
2766 
2767     round = get_round(vxrm, a, shift);
2768     res = (a >> shift) + round;
2769     return res;
2770 }
2771 static inline uint16_t
2772 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2773 {
2774     uint8_t round, shift = b & 0xf;
2775 
2776     round = get_round(vxrm, a, shift);
2777     return (a >> shift) + round;
2778 }
2779 static inline uint32_t
2780 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2781 {
2782     uint8_t round, shift = b & 0x1f;
2783 
2784     round = get_round(vxrm, a, shift);
2785     return (a >> shift) + round;
2786 }
2787 static inline uint64_t
2788 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2789 {
2790     uint8_t round, shift = b & 0x3f;
2791 
2792     round = get_round(vxrm, a, shift);
2793     return (a >> shift) + round;
2794 }
2795 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2796 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2797 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2798 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2799 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2800 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2801 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2802 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2803 
2804 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2805 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2806 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2807 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2808 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2809 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2810 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2811 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2812 
2813 static inline int8_t
2814 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2815 {
2816     uint8_t round, shift = b & 0x7;
2817 
2818     round = get_round(vxrm, a, shift);
2819     return (a >> shift) + round;
2820 }
2821 static inline int16_t
2822 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2823 {
2824     uint8_t round, shift = b & 0xf;
2825 
2826     round = get_round(vxrm, a, shift);
2827     return (a >> shift) + round;
2828 }
2829 static inline int32_t
2830 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2831 {
2832     uint8_t round, shift = b & 0x1f;
2833 
2834     round = get_round(vxrm, a, shift);
2835     return (a >> shift) + round;
2836 }
2837 static inline int64_t
2838 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2839 {
2840     uint8_t round, shift = b & 0x3f;
2841 
2842     round = get_round(vxrm, a, shift);
2843     return (a >> shift) + round;
2844 }
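
/*
 * Example for the signed scaling shift: vssra8 with a = -7 and shift = 1
 * computes the arithmetic shift (-7 >> 1) = -4 plus the vxrm rounding
 * bit; under rnu the discarded bit is 1, so the result is -3
 * (i.e. -3.5 rounded by adding +0.5 LSB).
 */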
2845 
2846 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2847 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2848 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2849 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2850 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2851 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2852 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2853 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2854 
2855 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2856 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2857 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2858 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2859 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2860 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2861 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2862 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2863 
2864 /* Vector Narrowing Fixed-Point Clip Instructions */
2865 static inline int8_t
2866 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2867 {
2868     uint8_t round, shift = b & 0xf;
2869     int16_t res;
2870 
2871     round = get_round(vxrm, a, shift);
2872     res = (a >> shift) + round;
2873     if (res > INT8_MAX) {
2874         env->vxsat = 0x1;
2875         return INT8_MAX;
2876     } else if (res < INT8_MIN) {
2877         env->vxsat = 0x1;
2878         return INT8_MIN;
2879     } else {
2880         return res;
2881     }
2882 }
2883 
2884 static inline int16_t
2885 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2886 {
2887     uint8_t round, shift = b & 0x1f;
2888     int32_t res;
2889 
2890     round = get_round(vxrm, a, shift);
2891     res = (a >> shift) + round;
2892     if (res > INT16_MAX) {
2893         env->vxsat = 0x1;
2894         return INT16_MAX;
2895     } else if (res < INT16_MIN) {
2896         env->vxsat = 0x1;
2897         return INT16_MIN;
2898     } else {
2899         return res;
2900     }
2901 }
2902 
2903 static inline int32_t
2904 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2905 {
2906     uint8_t round, shift = b & 0x3f;
2907     int64_t res;
2908 
2909     round = get_round(vxrm, a, shift);
2910     res = (a >> shift) + round;
2911     if (res > INT32_MAX) {
2912         env->vxsat = 0x1;
2913         return INT32_MAX;
2914     } else if (res < INT32_MIN) {
2915         env->vxsat = 0x1;
2916         return INT32_MIN;
2917     } else {
2918         return res;
2919     }
2920 }
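
/*
 * vnclip example: narrowing a = 1000 (int16_t) to int8_t with shift = 2
 * under rnu gives (1000 >> 2) = 250, which exceeds INT8_MAX, so the
 * result saturates to 127 and vxsat is set.
 */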
2921 
2922 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2923 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2924 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2925 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2926 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2927 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2928 
2929 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2930 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2931 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2932 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2933 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2934 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2935 
2936 static inline uint8_t
2937 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2938 {
2939     uint8_t round, shift = b & 0xf;
2940     uint16_t res;
2941 
2942     round = get_round(vxrm, a, shift);
2943     res = (a >> shift) + round;
2944     if (res > UINT8_MAX) {
2945         env->vxsat = 0x1;
2946         return UINT8_MAX;
2947     } else {
2948         return res;
2949     }
2950 }
2951 
2952 static inline uint16_t
2953 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2954 {
2955     uint8_t round, shift = b & 0x1f;
2956     uint32_t res;
2957 
2958     round = get_round(vxrm, a, shift);
2959     res = (a >> shift) + round;
2960     if (res > UINT16_MAX) {
2961         env->vxsat = 0x1;
2962         return UINT16_MAX;
2963     } else {
2964         return res;
2965     }
2966 }
2967 
2968 static inline uint32_t
2969 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2970 {
2971     uint8_t round, shift = b & 0x3f;
2972     uint64_t res;
2973 
2974     round = get_round(vxrm, a, shift);
2975     res = (a >> shift) + round;
2976     if (res > UINT32_MAX) {
2977         env->vxsat = 0x1;
2978         return UINT32_MAX;
2979     } else {
2980         return res;
2981     }
2982 }
2983 
2984 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2985 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2986 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2987 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2988 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2989 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2990 
2991 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2992 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2993 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2994 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2995 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2996 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2997 
2998 /*
2999  * Vector Floating-Point Arithmetic Instructions
3000  */
3001 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3002 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3003 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3004                       CPURISCVState *env)                      \
3005 {                                                              \
3006     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3007     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3008     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3009 }
3010 
3011 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3012 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3013                   void *vs2, CPURISCVState *env,          \
3014                   uint32_t desc)                          \
3015 {                                                         \
3016     uint32_t vm = vext_vm(desc);                          \
3017     uint32_t vl = env->vl;                                \
3018     uint32_t total_elems =                                \
3019         vext_get_total_elems(env, desc, ESZ);             \
3020     uint32_t vta = vext_vta(desc);                        \
3021     uint32_t vma = vext_vma(desc);                        \
3022     uint32_t i;                                           \
3023                                                           \
3024     for (i = env->vstart; i < vl; i++) {                  \
3025         if (!vm && !vext_elem_mask(v0, i)) {              \
3026             /* set masked-off elements to 1s */           \
3027             vext_set_elems_1s(vd, vma, i * ESZ,           \
3028                               (i + 1) * ESZ);             \
3029             continue;                                     \
3030         }                                                 \
3031         do_##NAME(vd, vs1, vs2, i, env);                  \
3032     }                                                     \
3033     env->vstart = 0;                                      \
3034     /* set tail elements to 1s */                         \
3035     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3036                       total_elems * ESZ);                 \
3037 }
3038 
3039 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3040 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3041 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3042 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3043 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3044 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
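
/*
 * For reference, RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2,
 * float16_add) expands (roughly, given the OP_UUU_H type list defined
 * earlier in this file) to a per-element helper along the lines of:
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * while GEN_VEXT_VV_ENV(vfadd_vv_h, 2) emits the masked/tail-handling
 * loop that calls it.
 */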
3045 
3046 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3047 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3048                       CPURISCVState *env)                      \
3049 {                                                              \
3050     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3051     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3052 }
3053 
3054 #define GEN_VEXT_VF(NAME, ESZ)                            \
3055 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3056                   void *vs2, CPURISCVState *env,          \
3057                   uint32_t desc)                          \
3058 {                                                         \
3059     uint32_t vm = vext_vm(desc);                          \
3060     uint32_t vl = env->vl;                                \
3061     uint32_t total_elems =                                \
3062         vext_get_total_elems(env, desc, ESZ);             \
3063     uint32_t vta = vext_vta(desc);                        \
3064     uint32_t vma = vext_vma(desc);                        \
3065     uint32_t i;                                           \
3066                                                           \
3067     for (i = env->vstart; i < vl; i++) {                  \
3068         if (!vm && !vext_elem_mask(v0, i)) {              \
3069             /* set masked-off elements to 1s */           \
3070             vext_set_elems_1s(vd, vma, i * ESZ,           \
3071                               (i + 1) * ESZ);             \
3072             continue;                                     \
3073         }                                                 \
3074         do_##NAME(vd, s1, vs2, i, env);                   \
3075     }                                                     \
3076     env->vstart = 0;                                      \
3077     /* set tail elements to 1s */                         \
3078     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3079                       total_elems * ESZ);                 \
3080 }
3081 
3082 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3083 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3084 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3085 GEN_VEXT_VF(vfadd_vf_h, 2)
3086 GEN_VEXT_VF(vfadd_vf_w, 4)
3087 GEN_VEXT_VF(vfadd_vf_d, 8)
3088 
3089 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3090 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3091 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3092 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3093 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3094 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3095 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3096 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3097 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3098 GEN_VEXT_VF(vfsub_vf_h, 2)
3099 GEN_VEXT_VF(vfsub_vf_w, 4)
3100 GEN_VEXT_VF(vfsub_vf_d, 8)
3101 
3102 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3103 {
3104     return float16_sub(b, a, s);
3105 }
3106 
3107 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3108 {
3109     return float32_sub(b, a, s);
3110 }
3111 
3112 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3113 {
3114     return float64_sub(b, a, s);
3115 }
3116 
3117 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3118 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3119 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3120 GEN_VEXT_VF(vfrsub_vf_h, 2)
3121 GEN_VEXT_VF(vfrsub_vf_w, 4)
3122 GEN_VEXT_VF(vfrsub_vf_d, 8)
3123 
3124 /* Vector Widening Floating-Point Add/Subtract Instructions */
3125 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3126 {
3127     return float32_add(float16_to_float32(a, true, s),
3128                        float16_to_float32(b, true, s), s);
3129 }
3130 
3131 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3132 {
3133     return float64_add(float32_to_float64(a, s),
3134                        float32_to_float64(b, s), s);
3136 }
3137 
3138 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3139 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3140 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3141 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3142 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3143 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3144 GEN_VEXT_VF(vfwadd_vf_h, 4)
3145 GEN_VEXT_VF(vfwadd_vf_w, 8)
3146 
3147 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3148 {
3149     return float32_sub(float16_to_float32(a, true, s),
3150                        float16_to_float32(b, true, s), s);
3151 }
3152 
3153 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3154 {
3155     return float64_sub(float32_to_float64(a, s),
3156                        float32_to_float64(b, s), s);
3158 }
3159 
3160 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3161 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3162 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3163 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3164 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3165 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3166 GEN_VEXT_VF(vfwsub_vf_h, 4)
3167 GEN_VEXT_VF(vfwsub_vf_w, 8)
3168 
3169 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3170 {
3171     return float32_add(a, float16_to_float32(b, true, s), s);
3172 }
3173 
3174 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3175 {
3176     return float64_add(a, float32_to_float64(b, s), s);
3177 }
3178 
3179 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3180 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3181 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3182 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3183 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3184 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3185 GEN_VEXT_VF(vfwadd_wf_h, 4)
3186 GEN_VEXT_VF(vfwadd_wf_w, 8)
3187 
3188 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3189 {
3190     return float32_sub(a, float16_to_float32(b, true, s), s);
3191 }
3192 
3193 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3194 {
3195     return float64_sub(a, float32_to_float64(b, s), s);
3196 }
3197 
3198 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3199 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3200 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3201 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3202 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3203 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3204 GEN_VEXT_VF(vfwsub_wf_h, 4)
3205 GEN_VEXT_VF(vfwsub_wf_w, 8)
3206 
3207 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3208 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3209 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3210 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3211 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3212 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3213 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3214 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3215 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3216 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3217 GEN_VEXT_VF(vfmul_vf_h, 2)
3218 GEN_VEXT_VF(vfmul_vf_w, 4)
3219 GEN_VEXT_VF(vfmul_vf_d, 8)
3220 
3221 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3222 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3223 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3224 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3225 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3226 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3227 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3228 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3229 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3230 GEN_VEXT_VF(vfdiv_vf_h, 2)
3231 GEN_VEXT_VF(vfdiv_vf_w, 4)
3232 GEN_VEXT_VF(vfdiv_vf_d, 8)
3233 
3234 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3235 {
3236     return float16_div(b, a, s);
3237 }
3238 
3239 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3240 {
3241     return float32_div(b, a, s);
3242 }
3243 
3244 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3245 {
3246     return float64_div(b, a, s);
3247 }
3248 
3249 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3250 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3251 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3252 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3253 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3254 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3255 
3256 /* Vector Widening Floating-Point Multiply */
3257 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3258 {
3259     return float32_mul(float16_to_float32(a, true, s),
3260                        float16_to_float32(b, true, s), s);
3261 }
3262 
3263 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3264 {
3265     return float64_mul(float32_to_float64(a, s),
3266                        float32_to_float64(b, s), s);
3268 }
3269 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3270 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3271 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3272 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3273 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3274 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3275 GEN_VEXT_VF(vfwmul_vf_h, 4)
3276 GEN_VEXT_VF(vfwmul_vf_w, 8)
3277 
3278 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3279 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3280 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3281                       CPURISCVState *env)                          \
3282 {                                                                  \
3283     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3284     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3285     TD d = *((TD *)vd + HD(i));                                    \
3286     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3287 }
3288 
3289 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3290 {
3291     return float16_muladd(a, b, d, 0, s);
3292 }
3293 
3294 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3295 {
3296     return float32_muladd(a, b, d, 0, s);
3297 }
3298 
3299 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3300 {
3301     return float64_muladd(a, b, d, 0, s);
3302 }
3303 
3304 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3305 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3306 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3307 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3308 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3309 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3310 
3311 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3312 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3313                       CPURISCVState *env)                         \
3314 {                                                                 \
3315     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3316     TD d = *((TD *)vd + HD(i));                                   \
3317     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3318 }
3319 
3320 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3321 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3322 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3323 GEN_VEXT_VF(vfmacc_vf_h, 2)
3324 GEN_VEXT_VF(vfmacc_vf_w, 4)
3325 GEN_VEXT_VF(vfmacc_vf_d, 8)
3326 
3327 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3328 {
3329     return float16_muladd(a, b, d, float_muladd_negate_c |
3330                                    float_muladd_negate_product, s);
3331 }
3332 
3333 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3334 {
3335     return float32_muladd(a, b, d, float_muladd_negate_c |
3336                                    float_muladd_negate_product, s);
3337 }
3338 
3339 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3340 {
3341     return float64_muladd(a, b, d, float_muladd_negate_c |
3342                                    float_muladd_negate_product, s);
3343 }
3344 
3345 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3346 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3347 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3348 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3349 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3350 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3351 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3352 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3353 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3354 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3355 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3356 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3357 
3358 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3359 {
3360     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3361 }
3362 
3363 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3364 {
3365     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3366 }
3367 
3368 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3369 {
3370     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3371 }
3372 
3373 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3374 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3375 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3376 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3377 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3378 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3379 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3380 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3381 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3382 GEN_VEXT_VF(vfmsac_vf_h, 2)
3383 GEN_VEXT_VF(vfmsac_vf_w, 4)
3384 GEN_VEXT_VF(vfmsac_vf_d, 8)
3385 
3386 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3387 {
3388     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3389 }
3390 
3391 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3392 {
3393     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3394 }
3395 
3396 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3397 {
3398     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3399 }
3400 
3401 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3402 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3403 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3404 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3405 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3406 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3407 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3408 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3409 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3410 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3411 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3412 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3413 
3414 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3415 {
3416     return float16_muladd(d, b, a, 0, s);
3417 }
3418 
3419 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3420 {
3421     return float32_muladd(d, b, a, 0, s);
3422 }
3423 
3424 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3425 {
3426     return float64_muladd(d, b, a, 0, s);
3427 }
3428 
3429 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3430 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3431 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3432 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3433 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3434 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3435 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3436 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3437 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3438 GEN_VEXT_VF(vfmadd_vf_h, 2)
3439 GEN_VEXT_VF(vfmadd_vf_w, 4)
3440 GEN_VEXT_VF(vfmadd_vf_d, 8)
3441 
3442 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3443 {
3444     return float16_muladd(d, b, a, float_muladd_negate_c |
3445                                    float_muladd_negate_product, s);
3446 }
3447 
3448 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3449 {
3450     return float32_muladd(d, b, a, float_muladd_negate_c |
3451                                    float_muladd_negate_product, s);
3452 }
3453 
3454 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3455 {
3456     return float64_muladd(d, b, a, float_muladd_negate_c |
3457                                    float_muladd_negate_product, s);
3458 }
3459 
3460 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3461 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3462 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3463 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3464 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3465 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3466 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3467 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3468 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3469 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3470 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3471 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3472 
3473 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3474 {
3475     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3476 }
3477 
3478 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3479 {
3480     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3481 }
3482 
3483 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3484 {
3485     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3486 }
3487 
3488 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3489 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3490 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3491 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3492 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3493 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3494 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3495 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3496 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3497 GEN_VEXT_VF(vfmsub_vf_h, 2)
3498 GEN_VEXT_VF(vfmsub_vf_w, 4)
3499 GEN_VEXT_VF(vfmsub_vf_d, 8)
3500 
3501 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3502 {
3503     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3504 }
3505 
3506 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3507 {
3508     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3509 }
3510 
3511 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3512 {
3513     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3514 }
3515 
3516 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3517 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3518 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3519 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3520 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3521 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3522 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3523 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3524 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3525 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3526 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3527 GEN_VEXT_VF(vfnmsub_vf_d, 8)
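
/*
 * Operand-order note: with the OP(s2, s1, d) calling convention above,
 * the *macc/*msac helpers compute vd = +-(vs1 * vs2) +- vd, while the
 * *madd/*msub helpers swap the accumulator into the product and compute
 * vd = +-(vs1 * vd) +- vs2, matching the two instruction families.
 */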
3528 
3529 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3530 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3531 {
3532     return float32_muladd(float16_to_float32(a, true, s),
3533                           float16_to_float32(b, true, s), d, 0, s);
3534 }
3535 
3536 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3537 {
3538     return float64_muladd(float32_to_float64(a, s),
3539                           float32_to_float64(b, s), d, 0, s);
3540 }
3541 
3542 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3543 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3544 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3545 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3546 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3547 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3548 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3549 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3550 
3551 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3552 {
3553     return float32_muladd(float16_to_float32(a, true, s),
3554                           float16_to_float32(b, true, s), d,
3555                           float_muladd_negate_c | float_muladd_negate_product,
3556                           s);
3557 }
3558 
3559 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3560 {
3561     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3562                           d, float_muladd_negate_c |
3563                              float_muladd_negate_product, s);
3564 }
3565 
3566 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3567 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3568 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3569 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3570 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3571 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3572 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3573 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3574 
3575 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3576 {
3577     return float32_muladd(float16_to_float32(a, true, s),
3578                           float16_to_float32(b, true, s), d,
3579                           float_muladd_negate_c, s);
3580 }
3581 
3582 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3583 {
3584     return float64_muladd(float32_to_float64(a, s),
3585                           float32_to_float64(b, s), d,
3586                           float_muladd_negate_c, s);
3587 }
3588 
3589 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3590 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3591 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3592 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3593 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3594 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3595 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3596 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3597 
3598 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3599 {
3600     return float32_muladd(float16_to_float32(a, true, s),
3601                           float16_to_float32(b, true, s), d,
3602                           float_muladd_negate_product, s);
3603 }
3604 
3605 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3606 {
3607     return float64_muladd(float32_to_float64(a, s),
3608                           float32_to_float64(b, s), d,
3609                           float_muladd_negate_product, s);
3610 }
3611 
3612 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3613 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3614 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3615 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3616 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3617 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3618 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3619 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3620 
3621 /* Vector Floating-Point Square-Root Instruction */
3622 /* (TD, T2, TX2) */
3623 #define OP_UU_H uint16_t, uint16_t, uint16_t
3624 #define OP_UU_W uint32_t, uint32_t, uint32_t
3625 #define OP_UU_D uint64_t, uint64_t, uint64_t
3626 
3627 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3628 static void do_##NAME(void *vd, void *vs2, int i,      \
3629                       CPURISCVState *env)              \
3630 {                                                      \
3631     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3632     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3633 }
3634 
3635 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3636 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3637                   CPURISCVState *env, uint32_t desc)   \
3638 {                                                      \
3639     uint32_t vm = vext_vm(desc);                       \
3640     uint32_t vl = env->vl;                             \
3641     uint32_t total_elems =                             \
3642         vext_get_total_elems(env, desc, ESZ);          \
3643     uint32_t vta = vext_vta(desc);                     \
3644     uint32_t vma = vext_vma(desc);                     \
3645     uint32_t i;                                        \
3646                                                        \
3647     if (vl == 0) {                                     \
3648         return;                                        \
3649     }                                                  \
3650     for (i = env->vstart; i < vl; i++) {               \
3651         if (!vm && !vext_elem_mask(v0, i)) {           \
3652             /* set masked-off elements to 1s */        \
3653             vext_set_elems_1s(vd, vma, i * ESZ,        \
3654                               (i + 1) * ESZ);          \
3655             continue;                                  \
3656         }                                              \
3657         do_##NAME(vd, vs2, i, env);                    \
3658     }                                                  \
3659     env->vstart = 0;                                   \
3660     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3661                       total_elems * ESZ);              \
3662 }
3663 
3664 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3665 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3666 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3667 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3668 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3669 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3670 
3671 /*
3672  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3673  *
3674  * Adapted from riscv-v-spec recip.c:
3675  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3676  */
3677 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3678 {
3679     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3680     uint64_t exp = extract64(f, frac_size, exp_size);
3681     uint64_t frac = extract64(f, 0, frac_size);
3682 
3683     const uint8_t lookup_table[] = {
3684         52, 51, 50, 48, 47, 46, 44, 43,
3685         42, 41, 40, 39, 38, 36, 35, 34,
3686         33, 32, 31, 30, 30, 29, 28, 27,
3687         26, 25, 24, 23, 23, 22, 21, 20,
3688         19, 19, 18, 17, 16, 16, 15, 14,
3689         14, 13, 12, 12, 11, 10, 10, 9,
3690         9, 8, 7, 7, 6, 6, 5, 4,
3691         4, 3, 3, 2, 2, 1, 1, 0,
3692         127, 125, 123, 121, 119, 118, 116, 114,
3693         113, 111, 109, 108, 106, 105, 103, 102,
3694         100, 99, 97, 96, 95, 93, 92, 91,
3695         90, 88, 87, 86, 85, 84, 83, 82,
3696         80, 79, 78, 77, 76, 75, 74, 73,
3697         72, 71, 70, 70, 69, 68, 67, 66,
3698         65, 64, 63, 63, 62, 61, 60, 59,
3699         59, 58, 57, 56, 56, 55, 54, 53
3700     };
3701     const int precision = 7;
3702 
3703     if (exp == 0 && frac != 0) { /* subnormal */
3704         /* Normalize the subnormal. */
3705         while (extract64(frac, frac_size - 1, 1) == 0) {
3706             exp--;
3707             frac <<= 1;
3708         }
3709 
3710         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3711     }
3712 
3713     int idx = ((exp & 1) << (precision - 1)) |
3714               (frac >> (frac_size - precision + 1));
3715     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3716                         (frac_size - precision);
3717     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3718 
3719     uint64_t val = 0;
3720     val = deposit64(val, 0, frac_size, out_frac);
3721     val = deposit64(val, frac_size, exp_size, out_exp);
3722     val = deposit64(val, frac_size + exp_size, 1, sign);
3723     return val;
3724 }
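
/*
 * The 7-bit estimate above follows the riscv-v-spec recip.c model: idx is
 * the exponent LSB concatenated with the top 6 fraction bits, and the
 * output exponent works out to (3 * bias - 1 - exp) / 2 (the ~exp term is
 * two's-complement for -exp - 1), i.e. roughly 2^((bias - e) / 2) as
 * expected for 1/sqrt(x).
 */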
3725 
3726 static float16 frsqrt7_h(float16 f, float_status *s)
3727 {
3728     int exp_size = 5, frac_size = 10;
3729     bool sign = float16_is_neg(f);
3730 
3731     /*
3732      * frsqrt7(sNaN) = canonical NaN
3733      * frsqrt7(-inf) = canonical NaN
3734      * frsqrt7(-normal) = canonical NaN
3735      * frsqrt7(-subnormal) = canonical NaN
3736      */
3737     if (float16_is_signaling_nan(f, s) ||
3738         (float16_is_infinity(f) && sign) ||
3739         (float16_is_normal(f) && sign) ||
3740         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3741         s->float_exception_flags |= float_flag_invalid;
3742         return float16_default_nan(s);
3743     }
3744 
3745     /* frsqrt7(qNaN) = canonical NaN */
3746     if (float16_is_quiet_nan(f, s)) {
3747         return float16_default_nan(s);
3748     }
3749 
3750     /* frsqrt7(+-0) = +-inf */
3751     if (float16_is_zero(f)) {
3752         s->float_exception_flags |= float_flag_divbyzero;
3753         return float16_set_sign(float16_infinity, sign);
3754     }
3755 
3756     /* frsqrt7(+inf) = +0 */
3757     if (float16_is_infinity(f) && !sign) {
3758         return float16_set_sign(float16_zero, sign);
3759     }
3760 
3761     /* +normal, +subnormal */
3762     uint64_t val = frsqrt7(f, exp_size, frac_size);
3763     return make_float16(val);
3764 }
3765 
3766 static float32 frsqrt7_s(float32 f, float_status *s)
3767 {
3768     int exp_size = 8, frac_size = 23;
3769     bool sign = float32_is_neg(f);
3770 
3771     /*
3772      * frsqrt7(sNaN) = canonical NaN
3773      * frsqrt7(-inf) = canonical NaN
3774      * frsqrt7(-normal) = canonical NaN
3775      * frsqrt7(-subnormal) = canonical NaN
3776      */
3777     if (float32_is_signaling_nan(f, s) ||
3778         (float32_is_infinity(f) && sign) ||
3779         (float32_is_normal(f) && sign) ||
3780         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3781         s->float_exception_flags |= float_flag_invalid;
3782         return float32_default_nan(s);
3783     }
3784 
3785     /* frsqrt7(qNaN) = canonical NaN */
3786     if (float32_is_quiet_nan(f, s)) {
3787         return float32_default_nan(s);
3788     }
3789 
3790     /* frsqrt7(+-0) = +-inf */
3791     if (float32_is_zero(f)) {
3792         s->float_exception_flags |= float_flag_divbyzero;
3793         return float32_set_sign(float32_infinity, sign);
3794     }
3795 
3796     /* frsqrt7(+inf) = +0 */
3797     if (float32_is_infinity(f) && !sign) {
3798         return float32_set_sign(float32_zero, sign);
3799     }
3800 
3801     /* +normal, +subnormal */
3802     uint64_t val = frsqrt7(f, exp_size, frac_size);
3803     return make_float32(val);
3804 }
3805 
3806 static float64 frsqrt7_d(float64 f, float_status *s)
3807 {
3808     int exp_size = 11, frac_size = 52;
3809     bool sign = float64_is_neg(f);
3810 
3811     /*
3812      * frsqrt7(sNaN) = canonical NaN
3813      * frsqrt7(-inf) = canonical NaN
3814      * frsqrt7(-normal) = canonical NaN
3815      * frsqrt7(-subnormal) = canonical NaN
3816      */
3817     if (float64_is_signaling_nan(f, s) ||
3818         (float64_is_infinity(f) && sign) ||
3819         (float64_is_normal(f) && sign) ||
3820         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3821         s->float_exception_flags |= float_flag_invalid;
3822         return float64_default_nan(s);
3823     }
3824 
3825     /* frsqrt7(qNaN) = canonical NaN */
3826     if (float64_is_quiet_nan(f, s)) {
3827         return float64_default_nan(s);
3828     }
3829 
3830     /* frsqrt7(+-0) = +-inf */
3831     if (float64_is_zero(f)) {
3832         s->float_exception_flags |= float_flag_divbyzero;
3833         return float64_set_sign(float64_infinity, sign);
3834     }
3835 
3836     /* frsqrt7(+inf) = +0 */
3837     if (float64_is_infinity(f) && !sign) {
3838         return float64_set_sign(float64_zero, sign);
3839     }
3840 
3841     /* +normal, +subnormal */
3842     uint64_t val = frsqrt7(f, exp_size, frac_size);
3843     return make_float64(val);
3844 }
3845 
3846 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3847 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3848 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3849 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3850 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3851 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3852 
3853 /*
3854  * Vector Floating-Point Reciprocal Estimate Instruction
3855  *
3856  * Adapted from riscv-v-spec recip.c:
3857  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3858  */
3859 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3860                       float_status *s)
3861 {
3862     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3863     uint64_t exp = extract64(f, frac_size, exp_size);
3864     uint64_t frac = extract64(f, 0, frac_size);
3865 
3866     const uint8_t lookup_table[] = {
3867         127, 125, 123, 121, 119, 117, 116, 114,
3868         112, 110, 109, 107, 105, 104, 102, 100,
3869         99, 97, 96, 94, 93, 91, 90, 88,
3870         87, 85, 84, 83, 81, 80, 79, 77,
3871         76, 75, 74, 72, 71, 70, 69, 68,
3872         66, 65, 64, 63, 62, 61, 60, 59,
3873         58, 57, 56, 55, 54, 53, 52, 51,
3874         50, 49, 48, 47, 46, 45, 44, 43,
3875         42, 41, 40, 40, 39, 38, 37, 36,
3876         35, 35, 34, 33, 32, 31, 31, 30,
3877         29, 28, 28, 27, 26, 25, 25, 24,
3878         23, 23, 22, 21, 21, 20, 19, 19,
3879         18, 17, 17, 16, 15, 15, 14, 14,
3880         13, 12, 12, 11, 11, 10, 9, 9,
3881         8, 8, 7, 7, 6, 5, 5, 4,
3882         4, 3, 3, 2, 2, 1, 1, 0
3883     };
3884     const int precision = 7;
3885 
3886     if (exp == 0 && frac != 0) { /* subnormal */
3887         /* Normalize the subnormal. */
3888         while (extract64(frac, frac_size - 1, 1) == 0) {
3889             exp--;
3890             frac <<= 1;
3891         }
3892 
3893         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3894 
3895         if (exp != 0 && exp != UINT64_MAX) {
3896             /*
3897              * Overflow to inf or max value of same sign,
3898              * depending on sign and rounding mode.
3899              */
3900             s->float_exception_flags |= (float_flag_inexact |
3901                                          float_flag_overflow);
3902 
3903             if ((s->float_rounding_mode == float_round_to_zero) ||
3904                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3905                 ((s->float_rounding_mode == float_round_up) && sign)) {
3906                 /* Return greatest/negative finite value. */
3907                 return (sign << (exp_size + frac_size)) |
3908                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3909             } else {
3910                 /* Return +-inf. */
3911                 return (sign << (exp_size + frac_size)) |
3912                        MAKE_64BIT_MASK(frac_size, exp_size);
3913             }
3914         }
3915     }
3916 
3917     int idx = frac >> (frac_size - precision);
3918     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3919                         (frac_size - precision);
3920     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3921 
3922     if (out_exp == 0 || out_exp == UINT64_MAX) {
3923         /*
3924          * The result is subnormal, but don't raise the underflow exception,
3925          * because there's no additional loss of precision.
3926          */
3927         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3928         if (out_exp == UINT64_MAX) {
3929             out_frac >>= 1;
3930             out_exp = 0;
3931         }
3932     }
3933 
3934     uint64_t val = 0;
3935     val = deposit64(val, 0, frac_size, out_frac);
3936     val = deposit64(val, frac_size, exp_size, out_exp);
3937     val = deposit64(val, frac_size + exp_size, 1, sign);
3938     return val;
3939 }
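
/*
 * As with frsqrt7 above, the reciprocal estimate indexes the table with
 * the top 7 fraction bits and derives the output exponent as
 * 2 * bias - 1 - exp (again via the ~exp two's-complement identity); the
 * out_exp == 0 / UINT64_MAX branch then shifts the fraction to produce a
 * subnormal result when that exponent leaves the normal range.
 */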
3940 
3941 static float16 frec7_h(float16 f, float_status *s)
3942 {
3943     int exp_size = 5, frac_size = 10;
3944     bool sign = float16_is_neg(f);
3945 
3946     /* frec7(+-inf) = +-0 */
3947     if (float16_is_infinity(f)) {
3948         return float16_set_sign(float16_zero, sign);
3949     }
3950 
3951     /* frec7(+-0) = +-inf */
3952     if (float16_is_zero(f)) {
3953         s->float_exception_flags |= float_flag_divbyzero;
3954         return float16_set_sign(float16_infinity, sign);
3955     }
3956 
3957     /* frec7(sNaN) = canonical NaN */
3958     if (float16_is_signaling_nan(f, s)) {
3959         s->float_exception_flags |= float_flag_invalid;
3960         return float16_default_nan(s);
3961     }
3962 
3963     /* frec7(qNaN) = canonical NaN */
3964     if (float16_is_quiet_nan(f, s)) {
3965         return float16_default_nan(s);
3966     }
3967 
3968     /* +-normal, +-subnormal */
3969     uint64_t val = frec7(f, exp_size, frac_size, s);
3970     return make_float16(val);
3971 }
3972 
3973 static float32 frec7_s(float32 f, float_status *s)
3974 {
3975     int exp_size = 8, frac_size = 23;
3976     bool sign = float32_is_neg(f);
3977 
3978     /* frec7(+-inf) = +-0 */
3979     if (float32_is_infinity(f)) {
3980         return float32_set_sign(float32_zero, sign);
3981     }
3982 
3983     /* frec7(+-0) = +-inf */
3984     if (float32_is_zero(f)) {
3985         s->float_exception_flags |= float_flag_divbyzero;
3986         return float32_set_sign(float32_infinity, sign);
3987     }
3988 
3989     /* frec7(sNaN) = canonical NaN */
3990     if (float32_is_signaling_nan(f, s)) {
3991         s->float_exception_flags |= float_flag_invalid;
3992         return float32_default_nan(s);
3993     }
3994 
3995     /* frec7(qNaN) = canonical NaN */
3996     if (float32_is_quiet_nan(f, s)) {
3997         return float32_default_nan(s);
3998     }
3999 
4000     /* +-normal, +-subnormal */
4001     uint64_t val = frec7(f, exp_size, frac_size, s);
4002     return make_float32(val);
4003 }
4004 
4005 static float64 frec7_d(float64 f, float_status *s)
4006 {
4007     int exp_size = 11, frac_size = 52;
4008     bool sign = float64_is_neg(f);
4009 
4010     /* frec7(+-inf) = +-0 */
4011     if (float64_is_infinity(f)) {
4012         return float64_set_sign(float64_zero, sign);
4013     }
4014 
4015     /* frec7(+-0) = +-inf */
4016     if (float64_is_zero(f)) {
4017         s->float_exception_flags |= float_flag_divbyzero;
4018         return float64_set_sign(float64_infinity, sign);
4019     }
4020 
4021     /* frec7(sNaN) = canonical NaN */
4022     if (float64_is_signaling_nan(f, s)) {
4023         s->float_exception_flags |= float_flag_invalid;
4024         return float64_default_nan(s);
4025     }
4026 
4027     /* frec7(qNaN) = canonical NaN */
4028     if (float64_is_quiet_nan(f, s)) {
4029         return float64_default_nan(s);
4030     }
4031 
4032     /* +-normal, +-subnormal */
4033     uint64_t val = frec7(f, exp_size, frac_size, s);
4034     return make_float64(val);
4035 }
4036 
4037 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4038 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4039 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4040 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4041 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4042 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4043 
4044 /* Vector Floating-Point MIN/MAX Instructions */
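     /*
      * vfmin/vfmax follow the IEEE 754-2019 minimumNumber/maximumNumber
      * semantics of the softfloat helpers used below: if exactly one
      * operand is a NaN the other operand is returned, e.g.
      * vfmin(NaN, 1.0) = 1.0; only when both operands are NaN is the
      * canonical NaN produced.
      */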
4045 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4046 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4047 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4048 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4049 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4050 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4051 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4052 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4053 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4054 GEN_VEXT_VF(vfmin_vf_h, 2)
4055 GEN_VEXT_VF(vfmin_vf_w, 4)
4056 GEN_VEXT_VF(vfmin_vf_d, 8)
4057 
4058 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4059 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4060 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4061 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4062 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4063 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4064 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4065 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4066 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4067 GEN_VEXT_VF(vfmax_vf_h, 2)
4068 GEN_VEXT_VF(vfmax_vf_w, 4)
4069 GEN_VEXT_VF(vfmax_vf_d, 8)
4070 
4071 /* Vector Floating-Point Sign-Injection Instructions */
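     /*
      * For all three helpers the result takes its exponent and fraction
      * from operand 'a' (vs2) and its sign from operand 'b' (vs1 or the
      * scalar): fsgnj copies the sign, fsgnjn copies the inverted sign and
      * fsgnjx XORs the two sign bits.  Illustrative example:
      * fsgnj32(0x3f800000 (+1.0), 0x80000000 (-0.0)) = 0xbf800000 (-1.0).
      */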
4072 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4073 {
4074     return deposit64(b, 0, 15, a);
4075 }
4076 
4077 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4078 {
4079     return deposit64(b, 0, 31, a);
4080 }
4081 
4082 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4083 {
4084     return deposit64(b, 0, 63, a);
4085 }
4086 
4087 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4088 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4089 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4090 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4091 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4092 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4093 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4094 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4095 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4096 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4097 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4098 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4099 
4100 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4101 {
4102     return deposit64(~b, 0, 15, a);
4103 }
4104 
4105 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4106 {
4107     return deposit64(~b, 0, 31, a);
4108 }
4109 
4110 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4111 {
4112     return deposit64(~b, 0, 63, a);
4113 }
4114 
4115 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4116 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4117 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4118 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4119 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4120 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4121 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4122 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4123 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4124 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4125 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4126 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4127 
4128 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4129 {
4130     return deposit64(b ^ a, 0, 15, a);
4131 }
4132 
4133 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4134 {
4135     return deposit64(b ^ a, 0, 31, a);
4136 }
4137 
4138 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4139 {
4140     return deposit64(b ^ a, 0, 63, a);
4141 }
4142 
4143 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4144 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4145 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4146 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4147 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4148 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4149 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4150 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4151 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4152 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4153 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4154 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4155 
4156 /* Vector Floating-Point Compare Instructions */
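     /*
      * Each compare writes one mask bit per element into vd.  vmfeq/vmfne
      * use the quiet comparison helpers (no invalid flag for quiet NaNs),
      * while vmflt/vmfle/vmfgt/vmfge below use signaling comparisons,
      * mirroring the scalar FEQ vs. FLT/FLE behaviour.
      */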
4157 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4158 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4159                   CPURISCVState *env, uint32_t desc)          \
4160 {                                                             \
4161     uint32_t vm = vext_vm(desc);                              \
4162     uint32_t vl = env->vl;                                    \
4163     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
4164     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4165     uint32_t vma = vext_vma(desc);                            \
4166     uint32_t i;                                               \
4167                                                               \
4168     for (i = env->vstart; i < vl; i++) {                      \
4169         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4170         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4171         if (!vm && !vext_elem_mask(v0, i)) {                  \
4172             /* set masked-off elements to 1s */               \
4173             if (vma) {                                        \
4174                 vext_set_elem_mask(vd, i, 1);                 \
4175             }                                                 \
4176             continue;                                         \
4177         }                                                     \
4178         vext_set_elem_mask(vd, i,                             \
4179                            DO_OP(s2, s1, &env->fp_status));   \
4180     }                                                         \
4181     env->vstart = 0;                                          \
4182     /*                                                        \
4183      * The mask destination register is always tail-agnostic; \
4184      * set the tail elements to 1s.                           \
4185      */                                                       \
4186     if (vta_all_1s) {                                         \
4187         for (; i < total_elems; i++) {                        \
4188             vext_set_elem_mask(vd, i, 1);                     \
4189         }                                                     \
4190     }                                                         \
4191 }
4192 
4193 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4194 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4195 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4196 
4197 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4198 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4199                   CPURISCVState *env, uint32_t desc)                \
4200 {                                                                   \
4201     uint32_t vm = vext_vm(desc);                                    \
4202     uint32_t vl = env->vl;                                          \
4203     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4204     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4205     uint32_t vma = vext_vma(desc);                                  \
4206     uint32_t i;                                                     \
4207                                                                     \
4208     for (i = env->vstart; i < vl; i++) {                            \
4209         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4210         if (!vm && !vext_elem_mask(v0, i)) {                        \
4211             /* set masked-off elements to 1s */                     \
4212             if (vma) {                                              \
4213                 vext_set_elem_mask(vd, i, 1);                       \
4214             }                                                       \
4215             continue;                                               \
4216         }                                                           \
4217         vext_set_elem_mask(vd, i,                                   \
4218                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4219     }                                                               \
4220     env->vstart = 0;                                                \
4221     /*                                                              \
4222      * The mask destination register is always tail-agnostic;       \
4223      * set the tail elements to 1s.                                 \
4224      */                                                             \
4225     if (vta_all_1s) {                                               \
4226         for (; i < total_elems; i++) {                              \
4227             vext_set_elem_mask(vd, i, 1);                           \
4228         }                                                           \
4229     }                                                               \
4230 }
4231 
4232 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4233 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4234 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4235 
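     /*
      * vmfne is "not equal", so an unordered comparison (either operand
      * NaN) also yields 1; for active elements it is the complement of
      * vmfeq.
      */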
4236 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4237 {
4238     FloatRelation compare = float16_compare_quiet(a, b, s);
4239     return compare != float_relation_equal;
4240 }
4241 
4242 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4243 {
4244     FloatRelation compare = float32_compare_quiet(a, b, s);
4245     return compare != float_relation_equal;
4246 }
4247 
4248 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4249 {
4250     FloatRelation compare = float64_compare_quiet(a, b, s);
4251     return compare != float_relation_equal;
4252 }
4253 
4254 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4255 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4256 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4257 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4258 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4259 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4260 
4261 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4262 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4263 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4264 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4265 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4266 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4267 
4268 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4269 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4270 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4271 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4272 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4273 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4274 
4275 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4276 {
4277     FloatRelation compare = float16_compare(a, b, s);
4278     return compare == float_relation_greater;
4279 }
4280 
4281 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4282 {
4283     FloatRelation compare = float32_compare(a, b, s);
4284     return compare == float_relation_greater;
4285 }
4286 
4287 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4288 {
4289     FloatRelation compare = float64_compare(a, b, s);
4290     return compare == float_relation_greater;
4291 }
4292 
4293 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4294 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4295 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4296 
4297 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4298 {
4299     FloatRelation compare = float16_compare(a, b, s);
4300     return compare == float_relation_greater ||
4301            compare == float_relation_equal;
4302 }
4303 
4304 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4305 {
4306     FloatRelation compare = float32_compare(a, b, s);
4307     return compare == float_relation_greater ||
4308            compare == float_relation_equal;
4309 }
4310 
4311 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4312 {
4313     FloatRelation compare = float64_compare(a, b, s);
4314     return compare == float_relation_greater ||
4315            compare == float_relation_equal;
4316 }
4317 
4318 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4319 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4320 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4321 
4322 /* Vector Floating-Point Classify Instruction */
4323 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4324 static void do_##NAME(void *vd, void *vs2, int i)      \
4325 {                                                      \
4326     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4327     *((TD *)vd + HD(i)) = OP(s2);                      \
4328 }
4329 
4330 #define GEN_VEXT_V(NAME, ESZ)                          \
4331 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4332                   CPURISCVState *env, uint32_t desc)   \
4333 {                                                      \
4334     uint32_t vm = vext_vm(desc);                       \
4335     uint32_t vl = env->vl;                             \
4336     uint32_t total_elems =                             \
4337         vext_get_total_elems(env, desc, ESZ);          \
4338     uint32_t vta = vext_vta(desc);                     \
4339     uint32_t vma = vext_vma(desc);                     \
4340     uint32_t i;                                        \
4341                                                        \
4342     for (i = env->vstart; i < vl; i++) {               \
4343         if (!vm && !vext_elem_mask(v0, i)) {           \
4344             /* set masked-off elements to 1s */        \
4345             vext_set_elems_1s(vd, vma, i * ESZ,        \
4346                               (i + 1) * ESZ);          \
4347             continue;                                  \
4348         }                                              \
4349         do_##NAME(vd, vs2, i);                         \
4350     }                                                  \
4351     env->vstart = 0;                                   \
4352     /* set tail elements to 1s */                      \
4353     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4354                       total_elems * ESZ);              \
4355 }
4356 
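     /*
      * fclass returns a 10-bit one-hot mask:
      *   bit 0: -infinity     bit 5: +subnormal
      *   bit 1: -normal       bit 6: +normal
      *   bit 2: -subnormal    bit 7: +infinity
      *   bit 3: -0            bit 8: signaling NaN
      *   bit 4: +0            bit 9: quiet NaN
      */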
4357 target_ulong fclass_h(uint64_t frs1)
4358 {
4359     float16 f = frs1;
4360     bool sign = float16_is_neg(f);
4361 
4362     if (float16_is_infinity(f)) {
4363         return sign ? 1 << 0 : 1 << 7;
4364     } else if (float16_is_zero(f)) {
4365         return sign ? 1 << 3 : 1 << 4;
4366     } else if (float16_is_zero_or_denormal(f)) {
4367         return sign ? 1 << 2 : 1 << 5;
4368     } else if (float16_is_any_nan(f)) {
4369         float_status s = { }; /* for snan_bit_is_one */
4370         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4371     } else {
4372         return sign ? 1 << 1 : 1 << 6;
4373     }
4374 }
4375 
4376 target_ulong fclass_s(uint64_t frs1)
4377 {
4378     float32 f = frs1;
4379     bool sign = float32_is_neg(f);
4380 
4381     if (float32_is_infinity(f)) {
4382         return sign ? 1 << 0 : 1 << 7;
4383     } else if (float32_is_zero(f)) {
4384         return sign ? 1 << 3 : 1 << 4;
4385     } else if (float32_is_zero_or_denormal(f)) {
4386         return sign ? 1 << 2 : 1 << 5;
4387     } else if (float32_is_any_nan(f)) {
4388         float_status s = { }; /* for snan_bit_is_one */
4389         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4390     } else {
4391         return sign ? 1 << 1 : 1 << 6;
4392     }
4393 }
4394 
4395 target_ulong fclass_d(uint64_t frs1)
4396 {
4397     float64 f = frs1;
4398     bool sign = float64_is_neg(f);
4399 
4400     if (float64_is_infinity(f)) {
4401         return sign ? 1 << 0 : 1 << 7;
4402     } else if (float64_is_zero(f)) {
4403         return sign ? 1 << 3 : 1 << 4;
4404     } else if (float64_is_zero_or_denormal(f)) {
4405         return sign ? 1 << 2 : 1 << 5;
4406     } else if (float64_is_any_nan(f)) {
4407         float_status s = { }; /* for snan_bit_is_one */
4408         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4409     } else {
4410         return sign ? 1 << 1 : 1 << 6;
4411     }
4412 }
4413 
4414 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4415 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4416 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4417 GEN_VEXT_V(vfclass_v_h, 2)
4418 GEN_VEXT_V(vfclass_v_w, 4)
4419 GEN_VEXT_V(vfclass_v_d, 8)
4420 
4421 /* Vector Floating-Point Merge Instruction */
4422 
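     /*
      * vfmerge.vfm vd, vs2, rs1, v0:  vd[i] = v0.mask[i] ? f[rs1] : vs2[i];
      * when vm is set every body element simply receives the scalar.
      */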
4423 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4424 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4425                   CPURISCVState *env, uint32_t desc)          \
4426 {                                                             \
4427     uint32_t vm = vext_vm(desc);                              \
4428     uint32_t vl = env->vl;                                    \
4429     uint32_t esz = sizeof(ETYPE);                             \
4430     uint32_t total_elems =                                    \
4431         vext_get_total_elems(env, desc, esz);                 \
4432     uint32_t vta = vext_vta(desc);                            \
4433     uint32_t i;                                               \
4434                                                               \
4435     for (i = env->vstart; i < vl; i++) {                      \
4436         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4437         *((ETYPE *)vd + H(i)) =                               \
4438             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4439     }                                                         \
4440     env->vstart = 0;                                          \
4441     /* set tail elements to 1s */                             \
4442     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4443 }
4444 
4445 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4446 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4447 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4448 
4449 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
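     /*
      * These use the common softfloat conversion routines, so rounding
      * follows the rounding mode currently held in env->fp_status, and
      * float-to-integer conversions of NaNs or out-of-range values raise
      * the invalid flag and saturate to the destination type's limits
      * rather than wrapping.
      */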
4450 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4451 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4452 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4453 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4454 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4455 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4456 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4457 
4458 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4459 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4460 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4461 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4462 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4463 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4464 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4465 
4466 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4467 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4468 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4469 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4470 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4471 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4472 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4473 
4474 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4475 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4476 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4477 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4478 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4479 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4480 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4481 
4482 /* Widening Floating-Point/Integer Type-Convert Instructions */
4483 /* (TD, T2, TX2) */
4484 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4485 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4486 #define WOP_UU_W uint64_t, uint32_t, uint32_t
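     /*
      * Each WOP_UU_x triple is (TD, T2, TX2): the destination type is twice
      * the width of the source element, e.g. WOP_UU_H produces a uint32_t
      * result from a uint16_t source.
      */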
4487 /*
4488  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4489  */
4490 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4491 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4492 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4493 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4494 
4495 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4496 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4497 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4498 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4499 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4500 
4501 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4502 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4503 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4504 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4505 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4506 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4507 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4508 
4509 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4510 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4511 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4512 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4513 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4514 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4515 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4516 
4517 /*
4518  * vfwcvt.f.f.v vd, vs2, vm
4519  * Convert single-width float to double-width float.
4520  */
4521 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4522 {
4523     return float16_to_float32(a, true, s);
4524 }
4525 
4526 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4527 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4528 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4529 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4530 
4531 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4532 /* (TD, T2, TX2) */
4533 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4534 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4535 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4536 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4537 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4538 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4539 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4540 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4541 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4542 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4543 
4544 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4545 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4546 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4547 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4548 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4549 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4550 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4551 
4552 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4553 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4554 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4555 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4556 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4557 
4558 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4559 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4560 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4561 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4562 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4563 
4564 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4565 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4566 {
4567     return float32_to_float16(a, true, s);
4568 }
4569 
4570 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4571 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4572 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4573 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4574 
4575 /*
4576  * Vector Reduction Operations
4577  */
4578 /* Vector Single-Width Integer Reduction Instructions */
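     /*
      * A reduction accumulates the active elements of vs2 into a single
      * scalar seeded from vs1[0]; only element 0 of vd is written and
      * masked-off elements are simply skipped.  Illustrative example:
      * with vl = 4, vs1[0] = 10 and vs2 = {1, 2, 3, 4} all active,
      * vredsum.vs writes vd[0] = 20.
      */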
4579 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4580 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4581                   void *vs2, CPURISCVState *env,          \
4582                   uint32_t desc)                          \
4583 {                                                         \
4584     uint32_t vm = vext_vm(desc);                          \
4585     uint32_t vl = env->vl;                                \
4586     uint32_t esz = sizeof(TD);                            \
4587     uint32_t vlenb = simd_maxsz(desc);                    \
4588     uint32_t vta = vext_vta(desc);                        \
4589     uint32_t i;                                           \
4590     TD s1 =  *((TD *)vs1 + HD(0));                        \
4591                                                           \
4592     for (i = env->vstart; i < vl; i++) {                  \
4593         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4594         if (!vm && !vext_elem_mask(v0, i)) {              \
4595             continue;                                     \
4596         }                                                 \
4597         s1 = OP(s1, (TD)s2);                              \
4598     }                                                     \
4599     *((TD *)vd + HD(0)) = s1;                             \
4600     env->vstart = 0;                                      \
4601     /* set tail elements to 1s */                         \
4602     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4603 }
4604 
4605 /* vd[0] = sum(vs1[0], vs2[*]) */
4606 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4607 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4608 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4609 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4610 
4611 /* vd[0] = maxu(vs1[0], vs2[*]) */
4612 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4613 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4614 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4615 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4616 
4617 /* vd[0] = max(vs1[0], vs2[*]) */
4618 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4619 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4620 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4621 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4622 
4623 /* vd[0] = minu(vs1[0], vs2[*]) */
4624 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4625 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4626 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4627 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4628 
4629 /* vd[0] = min(vs1[0], vs2[*]) */
4630 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4631 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4632 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4633 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4634 
4635 /* vd[0] = and(vs1[0], vs2[*]) */
4636 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4637 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4638 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4639 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4640 
4641 /* vd[0] = or(vs1[0], vs2[*]) */
4642 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4643 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4644 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4645 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4646 
4647 /* vd[0] = xor(vs1[0], vs2[*]) */
4648 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4649 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4650 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4651 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4652 
4653 /* Vector Widening Integer Reduction Instructions */
4654 /* signed sum reduction into double-width accumulator */
4655 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4656 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4657 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4658 
4659 /* Unsigned sum reduction into double-width accumulator */
4660 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4661 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4662 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4663 
4664 /* Vector Single-Width Floating-Point Reduction Instructions */
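     /*
      * Both the unordered (vfredusum) and ordered (vfredosum) forms are
      * implemented here as the same in-order accumulation, which is one of
      * the orderings the unordered form permits.
      */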
4665 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4666 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4667                   void *vs2, CPURISCVState *env,           \
4668                   uint32_t desc)                           \
4669 {                                                          \
4670     uint32_t vm = vext_vm(desc);                           \
4671     uint32_t vl = env->vl;                                 \
4672     uint32_t esz = sizeof(TD);                             \
4673     uint32_t vlenb = simd_maxsz(desc);                     \
4674     uint32_t vta = vext_vta(desc);                         \
4675     uint32_t i;                                            \
4676     TD s1 =  *((TD *)vs1 + HD(0));                         \
4677                                                            \
4678     for (i = env->vstart; i < vl; i++) {                   \
4679         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4680         if (!vm && !vext_elem_mask(v0, i)) {               \
4681             continue;                                      \
4682         }                                                  \
4683         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4684     }                                                      \
4685     *((TD *)vd + HD(0)) = s1;                              \
4686     env->vstart = 0;                                       \
4687     /* set tail elements to 1s */                          \
4688     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4689 }
4690 
4691 /* Unordered sum */
4692 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4693 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4694 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4695 
4696 /* Ordered sum */
4697 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4698 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4699 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4700 
4701 /* Maximum value */
4702 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4703 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4704 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4705 
4706 /* Minimum value */
4707 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4708 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4709 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4710 
4711 /* Vector Widening Floating-Point Add Instructions */
4712 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4713 {
4714     return float32_add(a, float16_to_float32(b, true, s), s);
4715 }
4716 
4717 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4718 {
4719     return float64_add(a, float32_to_float64(b, s), s);
4720 }
4721 
4722 /* Vector Widening Floating-Point Reduction Instructions */
4723 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4724 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4725 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4726 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4727 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4728 
4729 /*
4730  * Vector Mask Operations
4731  */
4732 /* Vector Mask-Register Logical Instructions */
4733 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4734 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4735                   void *vs2, CPURISCVState *env,          \
4736                   uint32_t desc)                          \
4737 {                                                         \
4738     uint32_t vl = env->vl;                                \
4739     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4740     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4741     uint32_t i;                                           \
4742     int a, b;                                             \
4743                                                           \
4744     for (i = env->vstart; i < vl; i++) {                  \
4745         a = vext_elem_mask(vs1, i);                       \
4746         b = vext_elem_mask(vs2, i);                       \
4747         vext_set_elem_mask(vd, i, OP(b, a));              \
4748     }                                                     \
4749     env->vstart = 0;                                      \
4750     /*                                                        \
4751      * The mask destination register is always tail-agnostic; \
4752      * set the tail elements to 1s.                           \
4753      */                                                       \
4754     if (vta_all_1s) {                                     \
4755         for (; i < total_elems; i++) {                    \
4756             vext_set_elem_mask(vd, i, 1);                 \
4757         }                                                 \
4758     }                                                     \
4759 }
4760 
4761 #define DO_NAND(N, M)  (!(N & M))
4762 #define DO_ANDNOT(N, M)  (N & !M)
4763 #define DO_NOR(N, M)  (!(N | M))
4764 #define DO_ORNOT(N, M)  (N | !M)
4765 #define DO_XNOR(N, M)  (!(N ^ M))
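     /*
      * The operands are single mask bits already normalised to 0 or 1 by
      * vext_elem_mask(), so logical negation (!) is used rather than a
      * bitwise complement.
      */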
4766 
4767 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4768 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4769 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4770 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4771 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4772 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4773 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4774 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4775 
4776 /* Vector count population in mask vcpop */
4777 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4778                              uint32_t desc)
4779 {
4780     target_ulong cnt = 0;
4781     uint32_t vm = vext_vm(desc);
4782     uint32_t vl = env->vl;
4783     int i;
4784 
4785     for (i = env->vstart; i < vl; i++) {
4786         if (vm || vext_elem_mask(v0, i)) {
4787             if (vext_elem_mask(vs2, i)) {
4788                 cnt++;
4789             }
4790         }
4791     }
4792     env->vstart = 0;
4793     return cnt;
4794 }
4795 
4796 /* vfirst find-first-set mask bit */
4797 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4798                               uint32_t desc)
4799 {
4800     uint32_t vm = vext_vm(desc);
4801     uint32_t vl = env->vl;
4802     int i;
4803 
4804     for (i = env->vstart; i < vl; i++) {
4805         if (vm || vext_elem_mask(v0, i)) {
4806             if (vext_elem_mask(vs2, i)) {
4807                 return i;
4808             }
4809         }
4810     }
4811     env->vstart = 0;
4812     return -1LL;
4813 }
4814 
4815 enum set_mask_type {
4816     ONLY_FIRST = 1,
4817     INCLUDE_FIRST,
4818     BEFORE_FIRST,
4819 };
4820 
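     /*
      * Illustrative example with vs2 mask bits (element 0 first) 0 0 1 0 1
      * and all elements active:
      *   vmsbf.m (BEFORE_FIRST)  -> 1 1 0 0 0
      *   vmsif.m (INCLUDE_FIRST) -> 1 1 1 0 0
      *   vmsof.m (ONLY_FIRST)    -> 0 0 1 0 0
      */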
4821 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4822                    uint32_t desc, enum set_mask_type type)
4823 {
4824     uint32_t vm = vext_vm(desc);
4825     uint32_t vl = env->vl;
4826     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4827     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4828     uint32_t vma = vext_vma(desc);
4829     int i;
4830     bool first_mask_bit = false;
4831 
4832     for (i = env->vstart; i < vl; i++) {
4833         if (!vm && !vext_elem_mask(v0, i)) {
4834             /* set masked-off elements to 1s */
4835             if (vma) {
4836                 vext_set_elem_mask(vd, i, 1);
4837             }
4838             continue;
4839         }
4840         /* write a zero to all following active elements */
4841         if (first_mask_bit) {
4842             vext_set_elem_mask(vd, i, 0);
4843             continue;
4844         }
4845         if (vext_elem_mask(vs2, i)) {
4846             first_mask_bit = true;
4847             if (type == BEFORE_FIRST) {
4848                 vext_set_elem_mask(vd, i, 0);
4849             } else {
4850                 vext_set_elem_mask(vd, i, 1);
4851             }
4852         } else {
4853             if (type == ONLY_FIRST) {
4854                 vext_set_elem_mask(vd, i, 0);
4855             } else {
4856                 vext_set_elem_mask(vd, i, 1);
4857             }
4858         }
4859     }
4860     env->vstart = 0;
4861     /*
4862      * The mask destination register is always tail-agnostic;
4863      * set the tail elements to 1s.
4864      */
4865     if (vta_all_1s) {
4866         for (; i < total_elems; i++) {
4867             vext_set_elem_mask(vd, i, 1);
4868         }
4869     }
4870 }
4871 
4872 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4873                      uint32_t desc)
4874 {
4875     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4876 }
4877 
4878 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4879                      uint32_t desc)
4880 {
4881     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4882 }
4883 
4884 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4885                      uint32_t desc)
4886 {
4887     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4888 }
4889 
4890 /* Vector Iota Instruction */
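     /*
      * viota.m writes, for each active element, the number of vs2 mask bits
      * set among the preceding active elements (an exclusive prefix sum).
      * Illustrative example: with all elements active and
      * vs2 = 1 0 1 0 1, vd = {0, 1, 1, 2, 2}.
      */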
4891 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4892 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4893                   uint32_t desc)                                          \
4894 {                                                                         \
4895     uint32_t vm = vext_vm(desc);                                          \
4896     uint32_t vl = env->vl;                                                \
4897     uint32_t esz = sizeof(ETYPE);                                         \
4898     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4899     uint32_t vta = vext_vta(desc);                                        \
4900     uint32_t vma = vext_vma(desc);                                        \
4901     uint32_t sum = 0;                                                     \
4902     int i;                                                                \
4903                                                                           \
4904     for (i = env->vstart; i < vl; i++) {                                  \
4905         if (!vm && !vext_elem_mask(v0, i)) {                              \
4906             /* set masked-off elements to 1s */                           \
4907             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4908             continue;                                                     \
4909         }                                                                 \
4910         *((ETYPE *)vd + H(i)) = sum;                                      \
4911         if (vext_elem_mask(vs2, i)) {                                     \
4912             sum++;                                                        \
4913         }                                                                 \
4914     }                                                                     \
4915     env->vstart = 0;                                                      \
4916     /* set tail elements to 1s */                                         \
4917     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4918 }
4919 
4920 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4921 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4922 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4923 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4924 
4925 /* Vector Element Index Instruction */
4926 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4927 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4928 {                                                                         \
4929     uint32_t vm = vext_vm(desc);                                          \
4930     uint32_t vl = env->vl;                                                \
4931     uint32_t esz = sizeof(ETYPE);                                         \
4932     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4933     uint32_t vta = vext_vta(desc);                                        \
4934     uint32_t vma = vext_vma(desc);                                        \
4935     int i;                                                                \
4936                                                                           \
4937     for (i = env->vstart; i < vl; i++) {                                  \
4938         if (!vm && !vext_elem_mask(v0, i)) {                              \
4939             /* set masked-off elements to 1s */                           \
4940             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4941             continue;                                                     \
4942         }                                                                 \
4943         *((ETYPE *)vd + H(i)) = i;                                        \
4944     }                                                                     \
4945     env->vstart = 0;                                                      \
4946     /* set tail elements to 1s */                                         \
4947     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4948 }
4949 
4950 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4951 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4952 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4953 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4954 
4955 /*
4956  * Vector Permutation Instructions
4957  */
4958 
4959 /* Vector Slide Instructions */
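     /*
      * vslideup leaves elements 0..offset-1 of vd undisturbed (the loop
      * starts at MAX(vstart, offset)) and copies vs2[i - offset] into the
      * remaining body elements; vslidedown shifts in the other direction
      * and writes zero where the source index would be at or beyond VLMAX.
      */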
4960 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4961 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4962                   CPURISCVState *env, uint32_t desc)                      \
4963 {                                                                         \
4964     uint32_t vm = vext_vm(desc);                                          \
4965     uint32_t vl = env->vl;                                                \
4966     uint32_t esz = sizeof(ETYPE);                                         \
4967     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4968     uint32_t vta = vext_vta(desc);                                        \
4969     uint32_t vma = vext_vma(desc);                                        \
4970     target_ulong offset = s1, i_min, i;                                   \
4971                                                                           \
4972     i_min = MAX(env->vstart, offset);                                     \
4973     for (i = i_min; i < vl; i++) {                                        \
4974         if (!vm && !vext_elem_mask(v0, i)) {                              \
4975             /* set masked-off elements to 1s */                           \
4976             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4977             continue;                                                     \
4978         }                                                                 \
4979         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4980     }                                                                     \
     env->vstart = 0;                                                      \
4981     /* set tail elements to 1s */                                         \
4982     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4983 }
4984 
4985 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4986 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4987 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4988 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4989 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4990 
4991 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4992 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4993                   CPURISCVState *env, uint32_t desc)                      \
4994 {                                                                         \
4995     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4996     uint32_t vm = vext_vm(desc);                                          \
4997     uint32_t vl = env->vl;                                                \
4998     uint32_t esz = sizeof(ETYPE);                                         \
4999     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5000     uint32_t vta = vext_vta(desc);                                        \
5001     uint32_t vma = vext_vma(desc);                                        \
5002     target_ulong i_max, i;                                                \
5003                                                                           \
5004     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5005     for (i = env->vstart; i < i_max; ++i) {                               \
5006         if (!vm && !vext_elem_mask(v0, i)) {                              \
5007             /* set masked-off elements to 1s */                           \
5008             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5009             continue;                                                     \
5010         }                                                                 \
5011         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5012     }                                                                     \
5013                                                                           \
5014     for (i = i_max; i < vl; ++i) {                                        \
5015         if (vm || vext_elem_mask(v0, i)) {                                \
5016             *((ETYPE *)vd + H(i)) = 0;                                    \
5017         }                                                                 \
5018     }                                                                     \
5019                                                                           \
5020     env->vstart = 0;                                                      \
5021     /* set tail elements to 1s */                                         \
5022     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5023 }
5024 
5025 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5026 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5027 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5028 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5029 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5030 
5031 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5032 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5033                                  void *vs2, CPURISCVState *env,             \
5034                                  uint32_t desc)                             \
5035 {                                                                           \
5036     typedef uint##BITWIDTH##_t ETYPE;                                       \
5037     uint32_t vm = vext_vm(desc);                                            \
5038     uint32_t vl = env->vl;                                                  \
5039     uint32_t esz = sizeof(ETYPE);                                           \
5040     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5041     uint32_t vta = vext_vta(desc);                                          \
5042     uint32_t vma = vext_vma(desc);                                          \
5043     uint32_t i;                                                             \
5044                                                                             \
5045     for (i = env->vstart; i < vl; i++) {                                    \
5046         if (!vm && !vext_elem_mask(v0, i)) {                                \
5047             /* set masked-off elements to 1s */                             \
5048             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5049             continue;                                                       \
5050         }                                                                   \
5051         if (i == 0) {                                                       \
5052             *((ETYPE *)vd + H(i)) = s1;                                     \
5053         } else {                                                            \
5054             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5055         }                                                                   \
5056     }                                                                       \
5057     env->vstart = 0;                                                        \
5058     /* set tail elements to 1s */                                           \
5059     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5060 }
5061 
5062 GEN_VEXT_VSLIDE1UP(8,  H1)
5063 GEN_VEXT_VSLIDE1UP(16, H2)
5064 GEN_VEXT_VSLIDE1UP(32, H4)
5065 GEN_VEXT_VSLIDE1UP(64, H8)
5066 
5067 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5068 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5069                   CPURISCVState *env, uint32_t desc)              \
5070 {                                                                 \
5071     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5072 }
5073 
5074 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5075 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5076 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5077 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5078 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5079 
5080 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5081 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5082                                    void *vs2, CPURISCVState *env,             \
5083                                    uint32_t desc)                             \
5084 {                                                                             \
5085     typedef uint##BITWIDTH##_t ETYPE;                                         \
5086     uint32_t vm = vext_vm(desc);                                              \
5087     uint32_t vl = env->vl;                                                    \
5088     uint32_t esz = sizeof(ETYPE);                                             \
5089     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5090     uint32_t vta = vext_vta(desc);                                            \
5091     uint32_t vma = vext_vma(desc);                                            \
5092     uint32_t i;                                                               \
5093                                                                               \
5094     for (i = env->vstart; i < vl; i++) {                                      \
5095         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5096             /* set masked-off elements to 1s */                               \
5097             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5098             continue;                                                         \
5099         }                                                                     \
5100         if (i == vl - 1) {                                                    \
5101             *((ETYPE *)vd + H(i)) = s1;                                       \
5102         } else {                                                              \
5103             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5104         }                                                                     \
5105     }                                                                         \
5106     env->vstart = 0;                                                          \
5107     /* set tail elements to 1s */                                             \
5108     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5109 }
5110 
5111 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5112 GEN_VEXT_VSLIDE1DOWN(16, H2)
5113 GEN_VEXT_VSLIDE1DOWN(32, H4)
5114 GEN_VEXT_VSLIDE1DOWN(64, H8)
5115 
5116 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5117 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5118                   CPURISCVState *env, uint32_t desc)              \
5119 {                                                                 \
5120     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5121 }
5122 
5123 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5124 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5125 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5126 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5127 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5128 
5129 /* Vector Floating-Point Slide Instructions */
5130 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5131 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5132                   CPURISCVState *env, uint32_t desc)          \
5133 {                                                             \
5134     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5135 }
5136 
5137 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5138 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5139 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5140 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5141 
5142 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5143 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5144                   CPURISCVState *env, uint32_t desc)          \
5145 {                                                             \
5146     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5147 }
5148 
5149 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5150 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5151 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5152 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5153 
5154 /* Vector Register Gather Instruction */
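     /*
      * vd[i] = vs2[vs1[i]], with any index at or beyond VLMAX yielding
      * zero.  The vrgatherei16 variants always read 16-bit indices from
      * vs1 regardless of SEW, which is why TS1 stays uint16_t below while
      * TS2 varies.
      */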
5155 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5156 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5157                   CPURISCVState *env, uint32_t desc)                      \
5158 {                                                                         \
5159     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5160     uint32_t vm = vext_vm(desc);                                          \
5161     uint32_t vl = env->vl;                                                \
5162     uint32_t esz = sizeof(TS2);                                           \
5163     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5164     uint32_t vta = vext_vta(desc);                                        \
5165     uint32_t vma = vext_vma(desc);                                        \
5166     uint64_t index;                                                       \
5167     uint32_t i;                                                           \
5168                                                                           \
5169     for (i = env->vstart; i < vl; i++) {                                  \
5170         if (!vm && !vext_elem_mask(v0, i)) {                              \
5171             /* set masked-off elements to 1s */                           \
5172             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5173             continue;                                                     \
5174         }                                                                 \
5175         index = *((TS1 *)vs1 + HS1(i));                                   \
5176         if (index >= vlmax) {                                             \
5177             *((TS2 *)vd + HS2(i)) = 0;                                    \
5178         } else {                                                          \
5179             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5180         }                                                                 \
5181     }                                                                     \
5182     env->vstart = 0;                                                      \
5183     /* set tail elements to 1s */                                         \
5184     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5185 }
5186 
5187 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5188 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5189 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5190 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5191 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5192 
5193 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5194 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5195 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5196 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
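/*
 * In pseudo-code, both gather forms compute, for each active i:
 *
 *     index = vs1[i];                  // TS1-wide read, HS1 ordering
 *     vd[i] = (index >= vlmax) ? 0 : vs2[index];
 *
 * For vrgather.vv the index and data elements share SEW; for
 * vrgatherei16.vv the index is always 16 bits (TS1 = uint16_t, HS1 = H2)
 * while the data element is SEW-wide.  Worked example (illustrative
 * values, assuming VLEN=128, LMUL=1, SEW=8, so vlmax=16): with
 * vs1 = {3, 0, 100, ...} and vs2 = {0x10, 0x11, 0x12, 0x13, ...},
 * vrgatherei16.vv writes vd = {0x13, 0x10, 0x00, ...}; index 100 is
 * >= vlmax, so that element is zeroed.
 */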
5197 
5198 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5199 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5200                   CPURISCVState *env, uint32_t desc)                      \
5201 {                                                                         \
5202     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5203     uint32_t vm = vext_vm(desc);                                          \
5204     uint32_t vl = env->vl;                                                \
5205     uint32_t esz = sizeof(ETYPE);                                         \
5206     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5207     uint32_t vta = vext_vta(desc);                                        \
5208     uint32_t vma = vext_vma(desc);                                        \
5209     uint64_t index = s1;                                                  \
5210     uint32_t i;                                                           \
5211                                                                           \
5212     for (i = env->vstart; i < vl; i++) {                                  \
5213         if (!vm && !vext_elem_mask(v0, i)) {                              \
5214             /* set masked-off elements to 1s */                           \
5215             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5216             continue;                                                     \
5217         }                                                                 \
5218         if (index >= vlmax) {                                             \
5219             *((ETYPE *)vd + H(i)) = 0;                                    \
5220         } else {                                                          \
5221             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5222         }                                                                 \
5223     }                                                                     \
5224     env->vstart = 0;                                                      \
5225     /* set tail elements to 1s */                                         \
5226     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5227 }
5228 
5229 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5230 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5231 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5232 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5233 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
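/*
 * Since the index here is the single scalar s1, vrgather.vx effectively
 * broadcasts one source element: every active vd[i] receives
 * vs2[x[rs1]], or zero when x[rs1] >= vlmax.
 */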
5234 
5235 /* Vector Compress Instruction */
5236 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5237 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5238                   CPURISCVState *env, uint32_t desc)                      \
5239 {                                                                         \
5240     uint32_t vl = env->vl;                                                \
5241     uint32_t esz = sizeof(ETYPE);                                         \
5242     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5243     uint32_t vta = vext_vta(desc);                                        \
5244     uint32_t num = 0, i;                                                  \
5245                                                                           \
5246     for (i = env->vstart; i < vl; i++) {                                  \
5247         if (!vext_elem_mask(vs1, i)) {                                    \
5248             continue;                                                     \
5249         }                                                                 \
5250         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5251         num++;                                                            \
5252     }                                                                     \
5253     env->vstart = 0;                                                      \
5254     /* set tail elements to 1s */                                         \
5255     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5256 }
5257 
5258 /* Compress into vd those elements of vs2 whose mask bit in vs1 is set */
5259 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5260 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5261 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5262 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
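/*
 * Worked example (illustrative values): with vl=8, mask bits in vs1 of
 * 0b01010011 (bits 0, 1, 4 and 6 set) and vs2 = {e0, ..., e7},
 * vcompress.vm packs the selected elements densely from element 0, so
 * vd = {e0, e1, e4, e6, ...}.  This helper leaves vd elements in
 * [num, vl) untouched and applies the usual tail handling from vl
 * upwards via vext_set_elems_1s().
 */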
5263 
5264 /* Vector Whole Register Move */
5265 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5266 {
5267     /* EEW = SEW */
5268     uint32_t maxsz = simd_maxsz(desc);
5269     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5270     uint32_t startb = env->vstart * sewb;
5271     uint32_t i = startb;
5272 
5273     memcpy((uint8_t *)vd + H1(i),
5274            (uint8_t *)vs2 + H1(i),
5275            maxsz - startb);
5276 
5277     env->vstart = 0;
5278 }
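/*
 * vmv<nr>r.v is a raw byte copy of a whole register group: maxsz is the
 * group size in bytes (e.g. 32 bytes for vmv2r.v with VLEN=128), and a
 * non-zero vstart is honoured by starting startb = vstart * SEW/8 bytes
 * into the group.  On little-endian hosts H1() is the identity, so this
 * is a plain memcpy of maxsz - startb bytes.
 */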
5279 
5280 /* Vector Integer Extension */
5281 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5282 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5283                   CPURISCVState *env, uint32_t desc)             \
5284 {                                                                \
5285     uint32_t vl = env->vl;                                       \
5286     uint32_t vm = vext_vm(desc);                                 \
5287     uint32_t esz = sizeof(ETYPE);                                \
5288     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5289     uint32_t vta = vext_vta(desc);                               \
5290     uint32_t vma = vext_vma(desc);                               \
5291     uint32_t i;                                                  \
5292                                                                  \
5293     for (i = env->vstart; i < vl; i++) {                         \
5294         if (!vm && !vext_elem_mask(v0, i)) {                     \
5295             /* set masked-off elements to 1s */                  \
5296             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5297             continue;                                            \
5298         }                                                        \
5299         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5300     }                                                            \
5301     env->vstart = 0;                                             \
5302     /* set tail elements to 1s */                                \
5303     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5304 }
5305 
5306 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5307 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5308 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5309 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5310 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5311 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5312 
5313 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5314 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5315 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5316 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5317 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5318 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
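/*
 * Worked example: for vsext_vf4_w each active destination element is an
 * int32_t produced by sign-extending the int8_t source element at the
 * same index, so a source byte of 0x80 (-128) becomes 0xFFFFFF80; the
 * vzext_* forms zero-extend instead, giving 0x00000080.  The vf2/vf4/vf8
 * suffix gives the ratio of destination to source element width.
 */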
5319